aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore2
-rw-r--r--kernel/Kconfig.hz2
-rw-r--r--kernel/Kconfig.locks117
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile163
-rw-r--r--kernel/acct.c31
-rw-r--r--kernel/async.c167
-rw-r--r--kernel/audit.c1214
-rw-r--r--kernel/audit.h193
-rw-r--r--kernel/audit_tree.c86
-rw-r--r--kernel/audit_watch.c60
-rw-r--r--kernel/auditfilter.c615
-rw-r--r--kernel/auditsc.c1202
-rw-r--r--kernel/backtracetest.c18
-rw-r--r--kernel/bounds.c6
-rw-r--r--kernel/capability.c75
-rw-r--r--kernel/cgroup.c7227
-rw-r--r--kernel/cgroup_freezer.c590
-rw-r--r--kernel/compat.c491
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/context_tracking.c216
-rw-r--r--kernel/cpu.c300
-rw-r--r--kernel/cpu_pm.c16
-rw-r--r--kernel/cpuset.c1759
-rw-r--r--kernel/cred.c216
-rw-r--r--kernel/debug/debug_core.c175
-rw-r--r--kernel/debug/debug_core.h3
-rw-r--r--kernel/debug/gdbstub.c14
-rw-r--r--kernel/debug/kdb/kdb_bp.c27
-rw-r--r--kernel/debug/kdb/kdb_bt.c5
-rw-r--r--kernel/debug/kdb/kdb_debugger.c32
-rw-r--r--kernel/debug/kdb/kdb_io.c50
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c95
-rw-r--r--kernel/debug/kdb/kdb_main.c282
-rw-r--r--kernel/debug/kdb/kdb_private.h12
-rw-r--r--kernel/debug/kdb/kdb_support.c4
-rw-r--r--kernel/delayacct.c14
-rw-r--r--kernel/dma.c1
-rw-r--r--kernel/elfcore.c10
-rw-r--r--kernel/events/Makefile3
-rw-r--r--kernel/events/callchain.c38
-rw-r--r--kernel/events/core.c2378
-rw-r--r--kernel/events/hw_breakpoint.c225
-rw-r--r--kernel/events/internal.h108
-rw-r--r--kernel/events/ring_buffer.c152
-rw-r--r--kernel/events/uprobes.c1993
-rw-r--r--kernel/exec_domain.c14
-rw-r--r--kernel/exit.c486
-rw-r--r--kernel/extable.c12
-rw-r--r--kernel/fork.c615
-rw-r--r--kernel/freezer.c37
-rw-r--r--kernel/futex.c732
-rw-r--r--kernel/futex_compat.c57
-rw-r--r--kernel/gcov/Kconfig32
-rw-r--r--kernel/gcov/Makefile32
-rw-r--r--kernel/gcov/base.c38
-rw-r--r--kernel/gcov/fs.c54
-rw-r--r--kernel/gcov/gcc_3_4.c115
-rw-r--r--kernel/gcov/gcc_4_7.c565
-rw-r--r--kernel/gcov/gcov.h65
-rw-r--r--kernel/groups.c66
-rw-r--r--kernel/hrtimer.c178
-rw-r--r--kernel/hung_task.c58
-rw-r--r--kernel/irq/Kconfig37
-rw-r--r--kernel/irq/autoprobe.c4
-rw-r--r--kernel/irq/chip.c154
-rw-r--r--kernel/irq/debug.h38
-rw-r--r--kernel/irq/devres.c45
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/generic-chip.c314
-rw-r--r--kernel/irq/handle.c40
-rw-r--r--kernel/irq/internals.h24
-rw-r--r--kernel/irq/irqdesc.c103
-rw-r--r--kernel/irq/irqdomain.c762
-rw-r--r--kernel/irq/manage.c486
-rw-r--r--kernel/irq/migration.c9
-rw-r--r--kernel/irq/pm.c9
-rw-r--r--kernel/irq/proc.c30
-rw-r--r--kernel/irq/resend.c15
-rw-r--r--kernel/irq/settings.h7
-rw-r--r--kernel/irq/spurious.c125
-rw-r--r--kernel/irq_work.c154
-rw-r--r--kernel/itimer.c8
-rw-r--r--kernel/jump_label.c142
-rw-r--r--kernel/kallsyms.c69
-rw-r--r--kernel/kcmp.c197
-rw-r--r--kernel/kexec.c359
-rw-r--r--kernel/kfifo.c608
-rw-r--r--kernel/kmod.c357
-rw-r--r--kernel/kprobes.c830
-rw-r--r--kernel/ksysfs.c33
-rw-r--r--kernel/kthread.c388
-rw-r--r--kernel/latencytop.c5
-rw-r--r--kernel/locking/Makefile28
-rw-r--r--kernel/locking/lglock.c89
-rw-r--r--kernel/locking/lockdep.c (renamed from kernel/lockdep.c)140
-rw-r--r--kernel/locking/lockdep_internals.h (renamed from kernel/lockdep_internals.h)6
-rw-r--r--kernel/locking/lockdep_proc.c (renamed from kernel/lockdep_proc.c)17
-rw-r--r--kernel/locking/lockdep_states.h (renamed from kernel/lockdep_states.h)0
-rw-r--r--kernel/locking/locktorture.c454
-rw-r--r--kernel/locking/mcs_spinlock.c210
-rw-r--r--kernel/locking/mcs_spinlock.h130
-rw-r--r--kernel/locking/mutex-debug.c (renamed from kernel/mutex-debug.c)22
-rw-r--r--kernel/locking/mutex-debug.h (renamed from kernel/mutex-debug.h)0
-rw-r--r--kernel/locking/mutex.c930
-rw-r--r--kernel/locking/mutex.h (renamed from kernel/mutex.h)0
-rw-r--r--kernel/locking/percpu-rwsem.c165
-rw-r--r--kernel/locking/qrwlock.c133
-rw-r--r--kernel/locking/rtmutex-debug.c (renamed from kernel/rtmutex-debug.c)9
-rw-r--r--kernel/locking/rtmutex-debug.h (renamed from kernel/rtmutex-debug.h)5
-rw-r--r--kernel/locking/rtmutex-tester.c (renamed from kernel/rtmutex-tester.c)6
-rw-r--r--kernel/locking/rtmutex.c (renamed from kernel/rtmutex.c)447
-rw-r--r--kernel/locking/rtmutex.h (renamed from kernel/rtmutex.h)5
-rw-r--r--kernel/locking/rtmutex_common.h (renamed from kernel/rtmutex_common.h)23
-rw-r--r--kernel/locking/rwsem-spinlock.c296
-rw-r--r--kernel/locking/rwsem-xadd.c513
-rw-r--r--kernel/locking/rwsem.c (renamed from kernel/rwsem.c)58
-rw-r--r--kernel/locking/semaphore.c (renamed from kernel/semaphore.c)10
-rw-r--r--kernel/locking/spinlock.c (renamed from kernel/spinlock.c)16
-rw-r--r--kernel/locking/spinlock_debug.c302
-rw-r--r--kernel/module-internal.h12
-rw-r--r--kernel/module.c1102
-rw-r--r--kernel/module_signing.c250
-rw-r--r--kernel/mutex.c500
-rw-r--r--kernel/notifier.c24
-rw-r--r--kernel/nsproxy.c100
-rw-r--r--kernel/padata.c93
-rw-r--r--kernel/panic.c108
-rw-r--r--kernel/params.c169
-rw-r--r--kernel/pid.c116
-rw-r--r--kernel/pid_namespace.c223
-rw-r--r--kernel/posix-cpu-timers.c804
-rw-r--r--kernel/posix-timers.c132
-rw-r--r--kernel/power/Kconfig75
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/autosleep.c128
-rw-r--r--kernel/power/block_io.c2
-rw-r--r--kernel/power/console.c118
-rw-r--r--kernel/power/hibernate.c257
-rw-r--r--kernel/power/main.c287
-rw-r--r--kernel/power/power.h51
-rw-r--r--kernel/power/poweroff.c4
-rw-r--r--kernel/power/process.c84
-rw-r--r--kernel/power/qos.c198
-rw-r--r--kernel/power/snapshot.c79
-rw-r--r--kernel/power/suspend.c272
-rw-r--r--kernel/power/suspend_test.c35
-rw-r--r--kernel/power/swap.c172
-rw-r--r--kernel/power/user.c70
-rw-r--r--kernel/power/wakelock.c268
-rw-r--r--kernel/printk.c1762
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/braille.c49
-rw-r--r--kernel/printk/braille.h48
-rw-r--r--kernel/printk/console_cmdline.h14
-rw-r--r--kernel/printk/printk.c3009
-rw-r--r--kernel/profile.c85
-rw-r--r--kernel/ptrace.c327
-rw-r--r--kernel/range.c22
-rw-r--r--kernel/rcu/Makefile6
-rw-r--r--kernel/rcu/rcu.h (renamed from kernel/rcu.h)60
-rw-r--r--kernel/rcu/rcutorture.c1707
-rw-r--r--kernel/rcu/srcu.c693
-rw-r--r--kernel/rcu/tiny.c (renamed from kernel/rcutiny.c)123
-rw-r--r--kernel/rcu/tiny_plugin.h174
-rw-r--r--kernel/rcu/tree.c3744
-rw-r--r--kernel/rcu/tree.h (renamed from kernel/rcutree.h)285
-rw-r--r--kernel/rcu/tree_plugin.h2854
-rw-r--r--kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c)385
-rw-r--r--kernel/rcu/update.c (renamed from kernel/rcupdate.c)268
-rw-r--r--kernel/rcutiny_plugin.h1073
-rw-r--r--kernel/rcutorture.c1833
-rw-r--r--kernel/rcutree.c2289
-rw-r--r--kernel/rcutree_plugin.h2199
-rw-r--r--kernel/reboot.c433
-rw-r--r--kernel/relay.c33
-rw-r--r--kernel/res_counter.c148
-rw-r--r--kernel/resource.c305
-rw-r--r--kernel/sched/Makefile9
-rw-r--r--kernel/sched/auto_group.c14
-rw-r--r--kernel/sched/clock.c136
-rw-r--r--kernel/sched/completion.c299
-rw-r--r--kernel/sched/core.c5003
-rw-r--r--kernel/sched/cpuacct.c283
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.c229
-rw-r--r--kernel/sched/cpudeadline.h33
-rw-r--r--kernel/sched/cpupri.c32
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c838
-rw-r--r--kernel/sched/deadline.c1676
-rw-r--r--kernel/sched/debug.c255
-rw-r--r--kernel/sched/fair.c5041
-rw-r--r--kernel/sched/features.h53
-rw-r--r--kernel/sched/idle.c273
-rw-r--r--kernel/sched/idle_task.c13
-rw-r--r--kernel/sched/proc.c591
-rw-r--r--kernel/sched/rt.c554
-rw-r--r--kernel/sched/sched.h725
-rw-r--r--kernel/sched/stats.c80
-rw-r--r--kernel/sched/stats.h90
-rw-r--r--kernel/sched/stop_task.c39
-rw-r--r--kernel/sched/wait.c (renamed from kernel/wait.c)219
-rw-r--r--kernel/seccomp.c473
-rw-r--r--kernel/signal.c796
-rw-r--r--kernel/smp.c591
-rw-r--r--kernel/smpboot.c313
-rw-r--r--kernel/smpboot.h20
-rw-r--r--kernel/softirq.c553
-rw-r--r--kernel/srcu.c315
-rw-r--r--kernel/stop_machine.c460
-rw-r--r--kernel/sys.c1333
-rw-r--r--kernel/sys_ni.c12
-rw-r--r--kernel/sysctl.c921
-rw-r--r--kernel/sysctl_binary.c58
-rw-r--r--kernel/sysctl_check.c160
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c105
-rw-r--r--kernel/task_work.c128
-rw-r--r--kernel/taskstats.c98
-rw-r--r--kernel/test_kprobes.c2
-rw-r--r--kernel/time.c31
-rw-r--r--kernel/time/Kconfig197
-rw-r--r--kernel/time/Makefile9
-rw-r--r--kernel/time/alarmtimer.c201
-rw-r--r--kernel/time/clockevents.c412
-rw-r--r--kernel/time/clocksource.c314
-rw-r--r--kernel/time/jiffies.c46
-rw-r--r--kernel/time/ntp.c311
-rw-r--r--kernel/time/ntp_internal.h12
-rw-r--r--kernel/time/sched_clock.c217
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c106
-rw-r--r--kernel/time/tick-broadcast.c509
-rw-r--r--kernel/time/tick-common.c244
-rw-r--r--kernel/time/tick-internal.h39
-rw-r--r--kernel/time/tick-sched.c709
-rw-r--r--kernel/time/timecompare.c193
-rw-r--r--kernel/time/timekeeping.c1277
-rw-r--r--kernel/time/timekeeping_debug.c74
-rw-r--r--kernel/time/timekeeping_internal.h14
-rw-r--r--kernel/time/timer_list.c113
-rw-r--r--kernel/time/timer_stats.c8
-rw-r--r--kernel/timeconst.bc108
-rw-r--r--kernel/timeconst.pl378
-rw-r--r--kernel/timer.c492
-rw-r--r--kernel/torture.c733
-rw-r--r--kernel/trace/Kconfig169
-rw-r--r--kernel/trace/Makefile15
-rw-r--r--kernel/trace/blktrace.c101
-rw-r--r--kernel/trace/ftrace.c1775
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c1472
-rw-r--r--kernel/trace/ring_buffer_benchmark.c6
-rw-r--r--kernel/trace/trace.c4102
-rw-r--r--kernel/trace/trace.h670
-rw-r--r--kernel/trace/trace_benchmark.c198
-rw-r--r--kernel/trace/trace_benchmark.h41
-rw-r--r--kernel/trace/trace_branch.c14
-rw-r--r--kernel/trace/trace_clock.c16
-rw-r--r--kernel/trace/trace_entries.h89
-rw-r--r--kernel/trace/trace_event_perf.c244
-rw-r--r--kernel/trace/trace_events.c1936
-rw-r--r--kernel/trace/trace_events_filter.c472
-rw-r--r--kernel/trace/trace_events_trigger.c1437
-rw-r--r--kernel/trace/trace_export.c84
-rw-r--r--kernel/trace/trace_functions.c530
-rw-r--r--kernel/trace/trace_functions_graph.c247
-rw-r--r--kernel/trace/trace_irqsoff.c176
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c1789
-rw-r--r--kernel/trace/trace_mmiotrace.c24
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c324
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_printk.c23
-rw-r--r--kernel/trace/trace_probe.c726
-rw-r--r--kernel/trace/trace_probe.h400
-rw-r--r--kernel/trace/trace_sched_switch.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c229
-rw-r--r--kernel/trace/trace_selftest.c459
-rw-r--r--kernel/trace/trace_stack.c127
-rw-r--r--kernel/trace/trace_stat.c43
-rw-r--r--kernel/trace/trace_syscalls.c301
-rw-r--r--kernel/trace/trace_uprobe.c1340
-rw-r--r--kernel/trace/trace_workqueue.c300
-rw-r--r--kernel/tracepoint.c726
-rw-r--r--kernel/tsacct.c56
-rw-r--r--kernel/uid16.c105
-rw-r--r--kernel/up.c69
-rw-r--r--kernel/user-return-notifier.c4
-rw-r--r--kernel/user.c74
-rw-r--r--kernel/user_namespace.c908
-rw-r--r--kernel/utsname.c38
-rw-r--r--kernel/utsname_sysctl.c13
-rw-r--r--kernel/watchdog.c455
-rw-r--r--kernel/workqueue.c5082
-rw-r--r--kernel/workqueue_internal.h74
-rw-r--r--kernel/workqueue_sched.h9
298 files changed, 79278 insertions, 40446 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f43..790d83c7d16 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,5 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b0..2a202a84675 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
default 1000 if HZ_1000
config SCHED_HRTICK
- def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+ def_bool HIGH_RES_TIMERS
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75..76768ee812b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
bool
+config UNINLINE_SPIN_UNLOCK
+ bool
+
#
# lock_* functions are inlined when:
# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -103,100 +106,134 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
+if !DEBUG_SPINLOCK
+
config INLINE_SPIN_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK
config INLINE_SPIN_TRYLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK_BH
config INLINE_SPIN_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
config INLINE_SPIN_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
config INLINE_SPIN_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
config INLINE_SPIN_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQSAVE
-
-config INLINE_SPIN_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
config INLINE_SPIN_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
config INLINE_READ_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_READ_TRYLOCK
config INLINE_READ_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
config INLINE_READ_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
config INLINE_READ_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
config INLINE_READ_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
config INLINE_WRITE_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_WRITE_TRYLOCK
config INLINE_WRITE_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
config INLINE_WRITE_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
config INLINE_WRITE_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
config INLINE_WRITE_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
+
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES
+ def_bool y
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26..3f9c97419f0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
select PREEMPT_COUNT
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e7..f2a8b6246ce 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,56 +2,50 @@
# Makefile for the linux kernel.
#
-obj-y = fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o cred.o \
- async.o range.o groups.o
+ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ extable.o params.o posix-timers.o \
+ kthread.o sys_ni.o posix-cpu-timers.o \
+ hrtimer.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+ async.o range.o groups.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
obj-y += sched/
+obj-y += locking/
obj-y += power/
+obj-y += printk/
+obj-y += irq/
+obj-y += rcu/
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
-obj-$(CONFIG_LOCKDEP) += lockdep.o
-ifeq ($(CONFIG_PROC_FS),y)
-obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
-endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -76,14 +70,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_TINY_RCU) += rcutiny.o
-obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -95,7 +82,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
-obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
@@ -107,6 +94,8 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
@@ -123,8 +112,112 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
+###############################################################################
+#
+# Roll all the X.509 certificates that we can find together and pull them into
+# the kernel so that they get loaded into the system trusted keyring during
+# boot.
+#
+# We look in the source root and the build root for all files whose name ends
+# in ".x509". Unfortunately, this will generate duplicate filenames, so we
+# have make canonicalise the pathnames and then sort them to discard the
+# duplicates.
+#
+###############################################################################
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+ $(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
+
+ifeq ($(X509_CERTIFICATES),)
+$(warning *** No X.509 certificates found ***)
+endif
+
+ifneq ($(wildcard $(obj)/.x509.list),)
+ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
+$(info X.509 certificate list changed)
+$(shell rm $(obj)/.x509.list)
+endif
+endif
+
+kernel/system_certificates.o: $(obj)/x509_certificate_list
+
+quiet_cmd_x509certs = CERTS $@
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+
+targets += $(obj)/x509_certificate_list
+$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
+ $(call if_changed,x509certs)
+
+targets += $(obj)/.x509.list
+$(obj)/.x509.list:
+ @echo $(X509_CERTIFICATES) >$@
+endif
+
+clean-files := x509_certificate_list .x509.list
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+ifndef CONFIG_MODULE_SIG_HASH
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and uses a hardware random"
+ @echo "### number generator if one is available."
+ @echo "###"
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv 2>&1
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "O = Magrathea"
+ @echo >>x509.genkey "CN = Glacier signing key"
+ @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b..808a86ff229 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
- res = act>0;
+ res = act > 0;
goto out;
}
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
}
}
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt;
@@ -201,11 +201,11 @@ static int acct_on(char *name)
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -260,9 +260,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- char *tmp = getname(name);
+ struct filename *tmp = getname(name);
if (IS_ERR(tmp))
- return (PTR_ERR(tmp));
+ return PTR_ERR(tmp);
error = acct_on(tmp);
putname(tmp);
} else {
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
/* we really need to bite the bullet and change layout */
- ac.ac_uid = orig_cred->uid;
- ac.ac_gid = orig_cred->gid;
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+ file_end_write(file);
out:
revert_creds(orig_cred);
}
@@ -566,6 +573,7 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -593,8 +601,9 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime += current->utime;
- pacct->ac_stime += current->stime;
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bb..61f023ce022 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,55 +57,48 @@ asynchronous and synchronous parts of the kernel.
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include "workqueue_internal.h"
+
static async_cookie_t next_cookie = 1;
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
struct async_entry {
- struct list_head list;
+ struct list_head domain_list;
+ struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
- async_func_ptr *func;
+ async_func_t func;
void *data;
- struct list_head *running;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct async_entry *entry;
-
- if (!list_empty(running)) {
- entry = list_first_entry(running,
- struct async_entry, list);
- return entry->cookie;
- }
+ struct list_head *pending;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
+ unsigned long flags;
- list_for_each_entry(entry, &async_pending, list)
- if (entry->running == running)
- return entry->cookie;
+ spin_lock_irqsave(&async_lock, flags);
- return next_cookie; /* "infinity" value */
-}
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
-static async_cookie_t lowest_in_progress(struct list_head *running)
-{
- unsigned long flags;
- async_cookie_t ret;
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
- spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
@@ -120,12 +113,7 @@ static void async_run_entry_fn(struct work_struct *work)
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
- /* 1) move self to the running queue */
- spin_lock_irqsave(&async_lock, flags);
- list_move_tail(&entry->list, entry->running);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 2) run (and print duration) */
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
(long long)entry->cookie,
@@ -142,21 +130,22 @@ static void async_run_entry_fn(struct work_struct *work)
(long long)ktime_to_ns(delta) >> 10);
}
- /* 3) remove self from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 4) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 5) wake up any waiters */
+ /* 4) wake up any waiters */
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -176,20 +165,31 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
- ptr(data, newcookie);
+ func(data, newcookie);
return newcookie;
}
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = ptr;
+ entry->func = func;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
/* schedule for execution */
queue_work(system_unbound_wq, &entry->work);
@@ -198,34 +198,34 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
/**
* async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -236,36 +236,51 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- do {
- async_synchronize_cookie(next_cookie);
- } while (!list_empty(&async_running) || !list_empty(&async_pending));
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+ spin_lock_irq(&async_lock);
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
+ domain->registered = 0;
+ spin_unlock_irq(&async_lock);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by @domain have been done.
*/
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, list);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie,
- struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t uninitialized_var(starttime), delta, endtime;
@@ -274,7 +289,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
@@ -296,6 +311,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->current_func == async_run_entry_fn;
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0..3ef2e0e797e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,14 +41,18 @@
* Example user-space utilities: http://people.redhat.com/sgrubb/audit/
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
-#include <asm/types.h>
+#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
#include <linux/audit.h>
@@ -58,9 +62,10 @@
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
-#include <linux/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
+#include <linux/pid_namespace.h>
+#include <net/netns/generic.h>
#include "audit.h"
@@ -74,37 +79,39 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-int audit_enabled;
-int audit_ever_enabled;
+u32 audit_enabled;
+u32 audit_ever_enabled;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static int audit_default;
+static u32 audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
-static int audit_failure = AUDIT_FAIL_PRINTK;
+static u32 audit_failure = AUDIT_FAIL_PRINTK;
/*
* If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
*/
int audit_pid;
-static int audit_nlk_pid;
+static __u32 audit_nlk_portid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
-static int audit_rate_limit;
+static u32 audit_rate_limit;
-/* Number of outstanding audit_buffers allowed. */
-static int audit_backlog_limit = 64;
-static int audit_backlog_wait_time = 60 * HZ;
-static int audit_backlog_wait_overflow = 0;
+/* Number of outstanding audit_buffers allowed.
+ * When set to zero, this means unlimited. */
+static u32 audit_backlog_limit = 64;
+#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
-uid_t audit_sig_uid = -1;
+kuid_t audit_sig_uid = INVALID_UID;
pid_t audit_sig_pid = -1;
u32 audit_sig_sid = 0;
@@ -119,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
+int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -137,6 +145,17 @@ static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
+ .mask = -1,
+ .features = 0,
+ .lock = 0,};
+
+static char *audit_feature_names[2] = {
+ "only_unset_loginuid",
+ "loginuid_immutable",
+};
+
+
/* Serialize requests from userspace. */
DEFINE_MUTEX(audit_cmd_mutex);
@@ -162,27 +181,27 @@ struct audit_buffer {
};
struct audit_reply {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff *skb;
};
-static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_pid = pid;
+ nlh->nlmsg_pid = portid;
}
}
void audit_panic(const char *message)
{
- switch (audit_failure)
- {
+ switch (audit_failure) {
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
if (printk_ratelimit())
- printk(KERN_ERR "audit: %s\n", message);
+ pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
/* test audit_pid since printk is always losey, why bother? */
@@ -253,9 +272,7 @@ void audit_log_lost(const char *message)
if (print) {
if (printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_lost=%d audit_rate_limit=%d "
- "audit_backlog_limit=%d\n",
+ pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
atomic_read(&audit_lost),
audit_rate_limit,
audit_backlog_limit);
@@ -263,39 +280,29 @@ void audit_log_lost(const char *message)
}
}
-static int audit_log_config_change(char *function_name, int new, int old,
- uid_t loginuid, u32 sessionid, u32 sid,
+static int audit_log_config_change(char *function_name, u32 new, u32 old,
int allow_changes)
{
struct audit_buffer *ab;
int rc = 0;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
- old, loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
-
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc) {
- audit_log_format(ab, " sid=%u", sid);
- allow_changes = 0; /* Something weird, deny request */
- } else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (unlikely(!ab))
+ return rc;
+ audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_session_info(ab);
+ rc = audit_log_task_context(ab);
+ if (rc)
+ allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
-static int audit_do_config_change(char *function_name, int *to_change,
- int new, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
- int allow_changes, rc = 0, old = *to_change;
+ int allow_changes, rc = 0;
+ u32 old = *to_change;
/* check if we are locked */
if (audit_enabled == AUDIT_LOCKED)
@@ -304,8 +311,7 @@ static int audit_do_config_change(char *function_name, int *to_change,
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
- rc = audit_log_config_change(function_name, new, old, loginuid,
- sessionid, sid, allow_changes);
+ rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
@@ -319,44 +325,43 @@ static int audit_do_config_change(char *function_name, int *to_change,
return rc;
}
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_rate_limit(u32 limit)
+{
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
+}
+
+static int audit_set_backlog_limit(u32 limit)
{
- return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_backlog_wait_time(u32 timeout)
{
- return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_wait_time",
+ &audit_backlog_wait_time, timeout);
}
-static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(u32 state)
{
int rc;
if (state < AUDIT_OFF || state > AUDIT_LOCKED)
return -EINVAL;
- rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
- loginuid, sessionid, sid);
-
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
-static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(u32 state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
- return audit_do_config_change("audit_failure", &audit_failure, state,
- loginuid, sessionid, sid);
+ return audit_do_config_change("audit_failure", &audit_failure, state);
}
/*
@@ -371,7 +376,8 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
static void audit_hold_skb(struct sk_buff *skb)
{
if (audit_default &&
- skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+ (!audit_backlog_limit ||
+ skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
skb_queue_tail(&audit_skb_hold_queue, skb);
else
kfree_skb(skb);
@@ -384,13 +390,13 @@ static void audit_hold_skb(struct sk_buff *skb)
static void audit_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- char *data = NLMSG_DATA(nlh);
+ char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE) {
if (printk_ratelimit())
- printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
- audit_log_lost("printk limit exceeded\n");
+ audit_log_lost("printk limit exceeded");
}
audit_hold_skb(skb);
@@ -401,12 +407,15 @@ static void kauditd_send_skb(struct sk_buff *skb)
int err;
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
- printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared\n");
- audit_pid = 0;
+ if (audit_pid) {
+ pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd disappeared");
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
} else
@@ -414,96 +423,132 @@ static void kauditd_send_skb(struct sk_buff *skb)
consume_skb(skb);
}
-static int kauditd_thread(void *dummy)
+/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, GFP_KERNEL);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+}
+
+/*
+ * flush_hold_queue - empty the hold queue if auditd appears
+ *
+ * If auditd just started, drain the queue of messages already
+ * sent to syslog/printk. Remember loss here is ok. We already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * If you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what i'm thinking today.
+ */
+static void flush_hold_queue(void)
{
struct sk_buff *skb;
+ if (!audit_default || !audit_pid)
+ return;
+
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (likely(!skb))
+ return;
+
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+
+ /*
+ * if auditd just disappeared but we
+ * dequeued an skb we need to drop ref
+ */
+ if (skb)
+ consume_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
set_freezable();
while (!kthread_should_stop()) {
- /*
- * if auditd just started drain the queue of messages already
- * sent to syslog/printk. remember loss here is ok. we already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
- *
- * if you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
- */
- if (audit_default && audit_pid) {
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (unlikely(skb)) {
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
- }
- }
+ struct sk_buff *skb;
+ DECLARE_WAITQUEUE(wait, current);
+
+ flush_hold_queue();
skb = skb_dequeue(&audit_skb_queue);
- wake_up(&audit_backlog_wait);
+
if (skb) {
+ if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
else
audit_printk_skb(skb);
- } else {
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
-
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ continue;
}
- }
- return 0;
-}
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
-{
- struct task_struct *tsk;
- int err;
+ if (!skb_queue_len(&audit_skb_queue)) {
+ try_to_freeze();
+ schedule();
+ }
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- return -ESRCH;
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
}
- get_task_struct(tsk);
- rcu_read_unlock();
- err = tty_audit_push_task(tsk, loginuid, sessionid);
- put_task_struct(tsk);
- return err;
+ return 0;
}
int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
- int pid = dest->pid;
struct sk_buff *skb;
+ struct net *net = dest->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(audit_sock, skb, pid, 0);
+ netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
}
-struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
int multi, const void *payload, int size)
{
struct sk_buff *skb;
@@ -516,33 +561,37 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
if (!skb)
return NULL;
- nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
- data = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ if (!nlh)
+ goto out_kfree_skb;
+ data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_NEW */
- if (skb)
- kfree_skb(skb);
+out_kfree_skb:
+ kfree_skb(skb);
return NULL;
}
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
+ struct net *net = reply->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
- * @pid: process id to send reply to
+ * @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
@@ -550,12 +599,14 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(int pid, int seq, int type, int done, int multi,
- const void *payload, int size)
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -564,11 +615,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi,
if (!reply)
return;
- skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
+ skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
if (!skb)
goto out;
- reply->pid = pid;
+ reply->net = get_net(net);
+ reply->portid = portid;
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -587,27 +639,49 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
+ /* Only support initial user namespace for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
+ if (current_user_ns() != &init_user_ns)
+ return -ECONNREFUSED;
+
switch (msg_type) {
- case AUDIT_GET:
case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- case AUDIT_SET:
case AUDIT_ADD:
- case AUDIT_ADD_RULE:
case AUDIT_DEL:
+ return -EOPNOTSUPP;
+ case AUDIT_GET:
+ case AUDIT_SET:
+ case AUDIT_GET_FEATURE:
+ case AUDIT_SET_FEATURE:
+ case AUDIT_LIST_RULES:
+ case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (!capable(CAP_AUDIT_CONTROL))
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if ((task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (!capable(CAP_AUDIT_WRITE))
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -617,45 +691,125 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
- u32 pid, u32 uid, uid_t auid, u32 ses,
- u32 sid)
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
- char *ctx = NULL;
- u32 len;
+ uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
- if (!audit_enabled) {
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return rc;
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
- audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
- pid, uid, auid, ses);
- if (sid) {
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc)
- audit_log_format(*ab, " ssid=%u", sid);
- else {
- audit_log_format(*ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
+ if (unlikely(!*ab))
+ return rc;
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_session_info(*ab);
+ audit_log_task_context(*ab);
+
+ return rc;
+}
+
+int is_audit_feature_set(int i)
+{
+ return af.features & AUDIT_FEATURE_TO_MASK(i);
+}
+
+
+static int audit_get_feature(struct sk_buff *skb)
+{
+ u32 seq;
+
+ seq = nlmsg_hdr(skb)->nlmsg_seq;
+
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
+
+ return 0;
+}
+
+static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
+ u32 old_lock, u32 new_lock, int res)
+{
+ struct audit_buffer *ab;
+
+ if (audit_enabled == AUDIT_OFF)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_feature_names[which], !!old_feature, !!new_feature,
+ !!old_lock, !!new_lock, res);
+ audit_log_end(ab);
+}
+
+static int audit_set_feature(struct sk_buff *skb)
+{
+ struct audit_features *uaf;
+ int i;
+
+ BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
+ uaf = nlmsg_data(nlmsg_hdr(skb));
+
+ /* if there is ever a version 2 we should handle that here */
+
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
+
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+ old_lock = af.lock & feature;
+
+ /* are we changing a locked feature? */
+ if (old_lock && (new_feature != old_feature)) {
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 0);
+ return -EPERM;
}
}
+ /* nothing invalid, do the changes */
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
- return rc;
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ old_lock = af.lock & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+
+ if (new_feature != old_feature)
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 1);
+
+ if (new_feature)
+ af.features |= feature;
+ else
+ af.features &= ~feature;
+ af.lock |= new_lock;
+ }
+
+ return 0;
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- u32 uid, pid, seq, sid;
+ u32 seq;
void *data;
- struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
- uid_t loginuid; /* loginuid of sender */
- u32 sessionid;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
@@ -666,70 +820,90 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
- pid = NETLINK_CREDS(skb)->pid;
- uid = NETLINK_CREDS(skb)->uid;
- loginuid = audit_get_loginuid(current);
- sessionid = audit_get_sessionid(current);
- security_task_getsecid(current, &sid);
seq = nlh->nlmsg_seq;
- data = NLMSG_DATA(nlh);
+ data = nlmsg_data(nlh);
switch (msg_type) {
- case AUDIT_GET:
- status_set.enabled = audit_enabled;
- status_set.failure = audit_failure;
- status_set.pid = audit_pid;
- status_set.rate_limit = audit_rate_limit;
- status_set.backlog_limit = audit_backlog_limit;
- status_set.lost = atomic_read(&audit_lost);
- status_set.backlog = skb_queue_len(&audit_skb_queue);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
- &status_set, sizeof(status_set));
+ case AUDIT_GET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
+ s.pid = audit_pid;
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_skb_queue);
+ s.version = AUDIT_VERSION_LATEST;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
- case AUDIT_SET:
- if (nlh->nlmsg_len < sizeof(struct audit_status))
- return -EINVAL;
- status_get = (struct audit_status *)data;
- if (status_get->mask & AUDIT_STATUS_ENABLED) {
- err = audit_set_enabled(status_get->enabled,
- loginuid, sessionid, sid);
+ }
+ case AUDIT_SET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ if (s.mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(s.enabled);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_FAILURE) {
- err = audit_set_failure(status_get->failure,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(s.failure);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_PID) {
- int new_pid = status_get->pid;
+ if (s.mask & AUDIT_STATUS_PID) {
+ int new_pid = s.pid;
+ if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+ return -EACCES;
if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid,
- audit_pid, loginuid,
- sessionid, sid, 1);
-
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_pid = new_pid;
- audit_nlk_pid = NETLINK_CB(skb).pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
+ audit_sock = skb->sk;
+ }
+ if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
+ err = audit_set_rate_limit(s.rate_limit);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
+ err = audit_set_backlog_limit(s.backlog_limit);
+ if (err < 0)
+ return err;
}
- if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
- err = audit_set_rate_limit(status_get->rate_limit,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
+ if (sizeof(s) > (size_t)nlh->nlmsg_len)
+ return -EINVAL;
+ if (s.backlog_wait_time < 0 ||
+ s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
+ return -EINVAL;
+ err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
- err = audit_set_backlog_limit(status_get->backlog_limit,
- loginuid, sessionid, sid);
+ break;
+ }
+ case AUDIT_GET_FEATURE:
+ err = audit_get_feature(skb);
+ if (err)
+ return err;
+ break;
+ case AUDIT_SET_FEATURE:
+ err = audit_set_feature(skb);
+ if (err)
+ return err;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -737,79 +911,54 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(&NETLINK_CB(skb));
- if (err == 1) {
+ err = audit_filter_user(msg_type);
+ if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = audit_prepare_user_tty(pid, loginuid,
- sessionid);
+ err = tty_audit_push_current();
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type, pid, uid,
- loginuid, sessionid, sid);
-
+ mutex_unlock(&audit_cmd_mutex);
+ audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
- audit_log_format(ab, " msg='%.1024s'",
+ audit_log_format(ab, " msg='%.*s'",
+ AUDIT_MESSAGE_TEXT_MAX,
(char *)data);
else {
int size;
- audit_log_format(ab, " msg=");
+ audit_log_format(ab, " data=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
size--;
audit_log_n_untrustedstring(ab, data, size);
}
- audit_set_pid(ab, pid);
+ audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
+ mutex_lock(&audit_cmd_mutex);
}
break;
- case AUDIT_ADD:
- case AUDIT_DEL:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule))
- return -EINVAL;
- if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
- audit_log_end(ab);
- return -EPERM;
- }
- /* fallthrough */
- case AUDIT_LIST:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
- break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
audit_log_end(ab);
return -EPERM;
}
- /* fallthrough */
+ err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh));
+ break;
case AUDIT_LIST_RULES:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
+ err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
-
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -839,8 +988,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
@@ -865,53 +1013,56 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
return -ENOMEM;
}
- sig_data->uid = audit_sig_uid;
+ sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk;
- unsigned long flags;
-
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk && lock_task_sighand(tsk, &flags)) {
- s.enabled = tsk->signal->audit_tty != 0;
- unlock_task_sighand(tsk, &flags);
- } else
- err = -ESRCH;
- rcu_read_unlock();
-
- if (!err)
- audit_send_reply(NETLINK_CB(skb).pid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ struct task_struct *tsk = current;
+
+ spin_lock(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty;
+ s.log_passwd = tsk->signal->audit_tty_log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
- struct audit_tty_status *s;
- struct task_struct *tsk;
- unsigned long flags;
+ struct audit_tty_status s, old;
+ struct task_struct *tsk = current;
+ struct audit_buffer *ab;
+
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ /* check if new data is valid */
+ if ((s.enabled != 0 && s.enabled != 1) ||
+ (s.log_passwd != 0 && s.log_passwd != 1))
+ err = -EINVAL;
+
+ spin_lock(&tsk->sighand->siglock);
+ old.enabled = tsk->signal->audit_tty;
+ old.log_passwd = tsk->signal->audit_tty_log_passwd;
+ if (!err) {
+ tsk->signal->audit_tty = s.enabled;
+ tsk->signal->audit_tty_log_passwd = s.log_passwd;
+ }
+ spin_unlock(&tsk->sighand->siglock);
- if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
- return -EINVAL;
- s = data;
- if (s->enabled != 0 && s->enabled != 1)
- return -EINVAL;
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk && lock_task_sighand(tsk, &flags)) {
- tsk->signal->audit_tty = s->enabled != 0;
- unlock_task_sighand(tsk, &flags);
- } else
- err = -ESRCH;
- rcu_read_unlock();
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
+ " old-log_passwd=%d new-log_passwd=%d res=%d",
+ old.enabled, s.enabled, old.log_passwd,
+ s.log_passwd, !err);
+ audit_log_end(ab);
break;
}
default:
@@ -930,7 +1081,7 @@ static void audit_receive_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
- * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
@@ -939,13 +1090,13 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- while (NLMSG_OK(nlh, len)) {
+ while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- nlh = NLMSG_NEXT(nlh, len);
+ nlh = nlmsg_next(nlh, &len);
}
}
@@ -957,6 +1108,56 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
+static int __net_init audit_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
+ };
+
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->nlsk == NULL) {
+ audit_panic("cannot initialize netlink socket in namespace");
+ return -ENOMEM;
+ }
+ aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ return 0;
+}
+
+static void __net_exit audit_net_exit(struct net *net)
+{
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+ if (sock == audit_sock) {
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
+
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
+ synchronize_net();
+ netlink_kernel_release(sock);
+}
+
+static struct pernet_operations audit_net_ops __net_initdata = {
+ .init = audit_net_init,
+ .exit = audit_net_exit,
+ .id = &audit_net_id,
+ .size = sizeof(struct audit_net),
+};
+
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
@@ -965,14 +1166,9 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
- audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
- audit_receive, NULL, THIS_MODULE);
- if (!audit_sock)
- audit_panic("cannot initialize netlink socket");
- else
- audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ pr_info("initializing netlink subsys (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ register_pernet_subsys(&audit_net_ops);
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
@@ -996,22 +1192,32 @@ static int __init audit_enable(char *str)
if (!audit_default)
audit_initialized = AUDIT_DISABLED;
- printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+ pr_info("%s\n", audit_default ?
+ "enabled (after initialization)" : "disabled (until reboot)");
- if (audit_initialized == AUDIT_INITIALIZED) {
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
- } else if (audit_initialized == AUDIT_UNINITIALIZED) {
- printk(" (after initialization)");
- } else {
- printk(" (until reboot)");
+ return 1;
+}
+__setup("audit=", audit_enable);
+
+/* Process kernel command-line parameter at boot time.
+ * audit_backlog_limit=<n> */
+static int __init audit_backlog_limit_set(char *str)
+{
+ u32 audit_backlog_limit_arg;
+
+ pr_info("audit_backlog_limit: ");
+ if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
+ pr_cont("using default of %u, unable to parse %s\n",
+ audit_backlog_limit, str);
+ return 1;
}
- printk("\n");
+
+ audit_backlog_limit = audit_backlog_limit_arg;
+ pr_cont("%d\n", audit_backlog_limit);
return 1;
}
-
-__setup("audit=", audit_enable);
+__setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
@@ -1060,13 +1266,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
- goto nlmsg_failure;
+ goto err;
- nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+ nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+ if (!nlh)
+ goto out_kfree_skb;
return ab;
-nlmsg_failure: /* Used by NLMSG_NEW */
+out_kfree_skb:
kfree_skb(ab->skb);
ab->skb = NULL;
err:
@@ -1117,12 +1325,24 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
-/* Obtain an audit buffer. This routine does locking to obtain the
- * audit buffer, but then no locking is required for calls to
- * audit_log_*format. If the tsk is a task that is currently in a
- * syscall, then the syscall is marked as auditable and an audit record
- * will be written at syscall exit. If there is no associated task, tsk
- * should be NULL. */
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static long wait_for_auditd(long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ sleep_time = schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+
+ return sleep_time;
+}
/**
* audit_log_start - obtain an audit buffer
@@ -1145,7 +1365,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
struct audit_buffer *ab = NULL;
struct timespec t;
unsigned int uninitialized_var(serial);
- int reserve;
+ int reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
unsigned long timeout_start = jiffies;
if (audit_initialized != AUDIT_INITIALIZED)
@@ -1154,42 +1375,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT)
- reserve = 0;
- else
- reserve = 5; /* Allow atomic callers to go up to five
- entries over the normal backlog limit */
+ if (gfp_mask & __GFP_WAIT) {
+ if (audit_pid && audit_pid == current->pid)
+ gfp_mask &= ~__GFP_WAIT;
+ else
+ reserve = 0;
+ }
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
- && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
-
- /* Wait for auditd to drain the queue a little */
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
- continue;
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ long sleep_time;
+
+ sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+ if (sleep_time > 0) {
+ sleep_time = wait_for_auditd(sleep_time);
+ if (sleep_time > 0)
+ continue;
+ }
}
if (audit_rate_check() && printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_backlog=%d > "
- "audit_backlog_limit=%d\n",
- skb_queue_len(&audit_skb_queue),
- audit_backlog_limit);
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_skb_queue),
+ audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
audit_backlog_wait_time = audit_backlog_wait_overflow;
wake_up(&audit_backlog_wait);
return NULL;
}
+ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
@@ -1307,7 +1523,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
int i, avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
- static const unsigned char *hex = "0123456789ABCDEF";
if (!ab)
return;
@@ -1325,10 +1540,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
}
ptr = skb_tail_pointer(skb);
- for (i=0; i<len; i++) {
- *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
- *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
- }
+ for (i = 0; i < len; i++)
+ ptr = hex_byte_pack_upper(ptr, buf[i]);
*ptr = 0;
skb_put(skb, len << 1); /* new string is twice the old string */
}
@@ -1418,7 +1631,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
- struct path *path)
+ const struct path *path)
{
char *p, *pathname;
@@ -1440,6 +1653,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
kfree(pathname);
}
+void audit_log_session_info(struct audit_buffer *ab)
+{
+ unsigned int sessionid = audit_get_sessionid(current);
+ uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+
+ audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+}
+
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
@@ -1449,14 +1670,285 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i) {
+ audit_log_format(ab, "%08x",
+ cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ }
+}
+
+void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ kernel_cap_t *perm = &name->fcap.permitted;
+ kernel_cap_t *inh = &name->fcap.inheritable;
+ int log = 0;
+
+ if (!cap_isclear(*perm)) {
+ audit_log_cap(ab, "cap_fp", perm);
+ log = 1;
+ }
+ if (!cap_isclear(*inh)) {
+ audit_log_cap(ab, "cap_fi", inh);
+ log = 1;
+ }
+
+ if (log)
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
+}
+
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+void audit_log_name(struct audit_context *context, struct audit_names *n,
+ struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ audit_log_format(ab, " nametype=");
+ switch(n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, "NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, "PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, "DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, "CREATE");
+ break;
+ default:
+ audit_log_format(ab, "UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return 0;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return 0;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return error;
+}
+EXPORT_SYMBOL(audit_log_task_context);
+
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ const struct cred *cred;
+ char name[sizeof(tsk->comm)];
+ struct mm_struct *mm = tsk->mm;
+ char *tty;
+
+ if (!ab)
+ return;
+
+ /* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
+ from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ tty, audit_get_sessionid(tsk));
+
+ get_task_comm(name, tsk);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
+ audit_log_task_context(ab);
+}
+EXPORT_SYMBOL(audit_log_task_info);
+
+/**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+ struct audit_buffer *ab;
+ struct audit_names *name;
+
+ name = kzalloc(sizeof(*name), GFP_NOFS);
+ if (!name)
+ return;
+
+ /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_ANOM_LINK);
+ if (!ab)
+ goto out;
+ audit_log_format(ab, "op=%s", operation);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " res=0");
+ audit_log_end(ab);
+
+ /* Generate AUDIT_PATH record with object. */
+ name->type = AUDIT_TYPE_NORMAL;
+ audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+ audit_log_name(current->audit_context, name, link, 0, NULL);
+out:
+ kfree(name);
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is placed on a queue and a tasklet is scheduled to
- * remove them from the queue outside the irq context. May be called in
- * any context.
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1466,7 +1958,19 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
+ kauditd_send_multicast_skb(ab->skb);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
+ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 81676680337..7bb65730c89 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
+#include <uapi/linux/mqueue.h>
/* 0 = no checking
1 = put_count checking
@@ -29,6 +30,11 @@
*/
#define AUDIT_DEBUG 0
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
+
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
@@ -59,10 +65,167 @@ struct audit_entry {
struct audit_krule rule;
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
-extern int audit_ever_enabled;
+struct audit_cap_data {
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+ union {
+ unsigned int fE; /* effective bit of file cap */
+ kernel_cap_t effective; /* effective set of process */
+ };
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
+struct audit_names {
+ struct list_head list; /* audit_context->names_list */
+
+ struct filename *name;
+ int name_len; /* number of chars to log */
+ bool hidden; /* don't log this record */
+ bool name_put; /* call __putname()? */
+
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ unsigned char type; /* record type */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit.
+ */
+ bool should_free;
+};
+
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state, current_state;
+ unsigned int serial; /* serial number for record */
+ int major; /* syscall number */
+ struct timespec ctime; /* time of syscall entry */
+ unsigned long argv[4]; /* syscall arguments */
+ long return_code;/* syscall return code */
+ u64 prio;
+ int return_valid; /* return code is valid */
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* struct audit_names->list anchor */
+ char *filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+ struct sockaddr_storage *sockaddr;
+ size_t sockaddr_len;
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ kuid_t target_auid;
+ kuid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ struct list_head killed_trees;
+ int tree_count;
+
+ int type;
+ union {
+ struct {
+ int nargs;
+ long args[6];
+ } socketcall;
+ struct {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ u32 osid;
+ int has_perm;
+ uid_t perm_uid;
+ gid_t perm_gid;
+ umode_t perm_mode;
+ unsigned long qbytes;
+ } ipc;
+ struct {
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+ } mq_getsetattr;
+ struct {
+ mqd_t mqdes;
+ int sigev_signo;
+ } mq_notify;
+ struct {
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+ } mq_sendrecv;
+ struct {
+ int oflag;
+ umode_t mode;
+ struct mq_attr attr;
+ } mq_open;
+ struct {
+ pid_t pid;
+ struct audit_cap_data cap;
+ } capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
+ struct {
+ int argc;
+ } execve;
+ };
+ int fds[2];
+ struct audit_proctitle proctitle;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
#endif
+};
+
+extern u32 audit_ever_enabled;
+
+extern void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ const struct inode *inode);
+extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap);
+extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
+extern void audit_log_name(struct audit_context *context,
+ struct audit_names *n, struct path *path,
+ int record_num, int *call_panic);
extern int audit_pid;
@@ -74,22 +237,32 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen);
-extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
- int done, int multi,
- const void *payload, int size);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
+ int done, int multi,
+ const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff_head q;
};
int audit_send_list(void *);
+struct audit_net {
+ struct sock *nlsk;
+};
+
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
@@ -144,7 +317,7 @@ extern void audit_kill_trees(struct list_head *);
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e..135944a7b28 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,8 +249,7 @@ static void untag_chunk(struct node *p)
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
fsnotify_duplicate_mark(&new->mark, entry);
if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
- free_chunk(new);
+ fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -292,8 +291,8 @@ static void untag_chunk(struct node *p)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
entry = &chunk->mark;
if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
- free_chunk(chunk);
+ fsnotify_put_mark(entry);
return -ENOSPC;
}
@@ -332,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(entry);
return 0;
}
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
insert_hash(chunk);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ fsnotify_put_mark(entry); /* drop initial reference */
return 0;
}
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
fsnotify_duplicate_mark(chunk_entry, old_entry);
if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
- free_chunk(chunk);
+ fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
}
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(chunk_entry);
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(old_entry);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
+ fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
- fsnotify_put_mark(old_entry); /* and kill it */
return 0;
}
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
- struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove rule");
- audit_log_format(ab, " dir=");
- audit_log_untrustedstring(ab, rule->tree->pathname);
- audit_log_key(ab, rule->filterkey);
- audit_log_format(ab, " list=%d res=1", rule->listnr);
- audit_log_end(ab);
+ audit_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
@@ -595,7 +603,7 @@ void audit_trim_trees(void)
root_mnt = collect_mounts(&path);
path_put(&path);
- if (!root_mnt)
+ if (IS_ERR(root_mnt))
goto skip_it;
spin_lock(&hash_lock);
@@ -609,9 +617,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
@@ -650,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
struct vfsmount *mnt;
int err;
+ rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
@@ -669,8 +678,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
- if (!mnt) {
- err = -ENOMEM;
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto Err;
}
@@ -719,8 +728,8 @@ int audit_tag_tree(char *old, char *new)
return err;
tagged = collect_mounts(&path2);
path_put(&path2);
- if (!tagged)
- return -ENOMEM;
+ if (IS_ERR(tagged))
+ return PTR_ERR(tagged);
err = kern_path(old, 0, &path1);
if (err) {
@@ -903,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
}
static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmonut_mark,
- struct fsnotify_event *event)
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
{
- BUG();
- return -EOPNOTSUPP;
+ return 0;
}
static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -916,22 +926,16 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
evict_chunk(chunk);
- fsnotify_put_mark(entry);
-}
-static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- return false;
+ /*
+ * We are guaranteed to have at least one reference to the mark from
+ * either the inode or the caller of fsnotify_destroy_mark().
+ */
+ BUG_ON(atomic_read(&entry->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
.handle_event = audit_tree_handle_event,
- .should_send_event = audit_tree_send_event,
- .free_group_priv = NULL,
- .free_event_priv = NULL,
.freeing_mark = audit_tree_freeing_mark,
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d..70b4554d2fb 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,8 +240,10 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab, "auid=%u ses=%u op=",
- audit_get_loginuid(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
audit_log_string(ab, op);
audit_log_format(ab, " path=");
@@ -265,7 +267,8 @@ static void audit_update_watch(struct audit_parent *parent,
/* Run all of the watches on this parent looking for the one that
* matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
@@ -349,40 +352,21 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
}
mutex_unlock(&audit_filter_mutex);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct nameidata nd;
- struct dentry *d;
- int err;
-
- err = kern_path_parent(watch->path, &nd);
- if (err)
- return err;
-
- if (nd.last_type != LAST_NORM) {
- path_put(&nd.path);
- return -EINVAL;
- }
-
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
- if (IS_ERR(d)) {
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
+ struct dentry *d = kern_path_locked(watch->path, parent);
+ if (IS_ERR(d))
return PTR_ERR(d);
- }
+ mutex_unlock(&parent->dentry->d_inode->i_mutex);
if (d->d_inode) {
/* update watch filter fields */
watch->dev = d->d_inode->i_sb->s_dev;
watch->ino = d->d_inode->i_ino;
}
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-
- *parent = nd.path;
dput(d);
return 0;
}
@@ -475,41 +459,33 @@ void audit_remove_watch_rule(struct audit_krule *krule)
if (list_empty(&parent->watches)) {
audit_get_parent(parent);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
}
}
-static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- return true;
-}
-
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname, u32 cookie)
{
struct inode *inode;
- __u32 mask = event->mask;
- const char *dname = event->file_name;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
BUG_ON(group != audit_watch_group);
- switch (event->data_type) {
+ switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = event->path.dentry->d_inode;
+ inode = ((struct path *)data)->dentry->d_inode;
break;
case (FSNOTIFY_EVENT_INODE):
- inode = event->inode;
+ inode = (struct inode *)data;
break;
default:
BUG();
@@ -528,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .should_send_event = audit_watch_should_send_event,
.handle_event = audit_watch_handle_event,
- .free_group_priv = NULL,
- .freeing_mark = NULL,
- .free_event_priv = NULL,
};
static int __init audit_watch_init(void)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd20..8e9bc9c3dbb 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -29,6 +31,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -224,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
#endif
/* Common user-space to kernel rule translation. */
-static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
@@ -247,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
- printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -310,103 +314,84 @@ static u32 audit_to_op(u32 op)
return n;
}
-
-/* Translate struct audit_rule to kernel's rule respresentation.
- * Exists for backward compatibility with userspace. */
-static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+/* check if an audit field is valid */
+static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- struct audit_entry *entry;
- int err = 0;
- int i;
-
- entry = audit_to_entry_common(rule);
- if (IS_ERR(entry))
- goto exit_nofree;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &entry->rule.fields[i];
- u32 n;
-
- n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
-
- /* Support for legacy operators where
- * AUDIT_NEGATE bit signifies != and otherwise assumes == */
- if (n & AUDIT_NEGATE)
- f->op = Audit_not_equal;
- else if (!n)
- f->op = Audit_equal;
- else
- f->op = audit_to_op(n);
-
- entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
-
- f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
- f->val = rule->values[i];
-
- err = -EINVAL;
- if (f->op == Audit_bad)
- goto exit_free;
-
- switch(f->type) {
- default:
- goto exit_free;
- case AUDIT_PID:
- case AUDIT_UID:
- case AUDIT_EUID:
- case AUDIT_SUID:
- case AUDIT_FSUID:
- case AUDIT_GID:
- case AUDIT_EGID:
- case AUDIT_SGID:
- case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- /* bit ops are only useful on syscall args */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
- break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- break;
- /* arch is only allowed to be = or != */
- case AUDIT_ARCH:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
- goto exit_free;
- entry->rule.arch_f = f;
- break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if (f->val & ~S_IFMT)
- goto exit_free;
- break;
- case AUDIT_INODE:
- err = audit_to_inode(&entry->rule, f);
- if (err)
- goto exit_free;
- break;
- }
- }
-
- if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
- entry->rule.inode_f = NULL;
-
-exit_nofree:
- return entry;
+ switch(f->type) {
+ case AUDIT_MSGTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ entry->rule.listnr != AUDIT_FILTER_USER)
+ return -EINVAL;
+ break;
+ };
-exit_free:
- audit_free_rule(entry);
- return ERR_PTR(err);
+ switch(f->type) {
+ default:
+ return -EINVAL;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ case AUDIT_PID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_INODE:
+ /* bit ops are only useful on syscall args */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ return -EINVAL;
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_WATCH:
+ case AUDIT_DIR:
+ case AUDIT_FILTERKEY:
+ break;
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ /* FALL THROUGH */
+ case AUDIT_ARCH:
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ return -EINVAL;
+ break;
+ case AUDIT_FILETYPE:
+ if (f->val & ~S_IFMT)
+ return -EINVAL;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
+ return -EINVAL;
+ break;
+ };
+ return 0;
}
/* Translate struct audit_rule_data to kernel's rule respresentation. */
@@ -420,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
int i;
char *str;
- entry = audit_to_entry_common((struct audit_rule *)data);
+ entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
@@ -437,32 +422,54 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->type = data->fields[i];
f->val = data->values[i];
+ f->uid = INVALID_UID;
+ f->gid = INVALID_GID;
f->lsm_str = NULL;
f->lsm_rule = NULL;
- switch(f->type) {
- case AUDIT_PID:
+
+ /* Support legacy tests for a valid loginuid */
+ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ f->type = AUDIT_LOGINUID_SET;
+ f->val = 0;
+ }
+
+ if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
+ struct pid *pid;
+ rcu_read_lock();
+ pid = find_vpid(f->val);
+ if (!pid) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto exit_free;
+ }
+ f->val = pid_nr(pid);
+ rcu_read_unlock();
+ }
+
+ err = audit_field_valid(entry, f);
+ if (err)
+ goto exit_free;
+
+ err = -EINVAL;
+ switch (f->type) {
+ case AUDIT_LOGINUID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
+ case AUDIT_OBJ_UID:
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- case AUDIT_OBJ_UID:
case AUDIT_OBJ_GID:
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -487,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM "
- "\'%s\' is invalid\n", str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
err = 0;
}
if (err) {
@@ -534,20 +541,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if (f->val & ~S_IFMT)
- goto exit_free;
- break;
- case AUDIT_FIELD_COMPARE:
- if (f->val > AUDIT_MAX_FIELD_COMPARE)
- goto exit_free;
- break;
- default:
- goto exit_free;
}
}
@@ -558,6 +551,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -573,36 +570,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule.
- * Exists for backward compatibility with userspace. */
-static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
-{
- struct audit_rule *rule;
- int i;
-
- rule = kzalloc(sizeof(*rule), GFP_KERNEL);
- if (unlikely(!rule))
- return NULL;
-
- rule->flags = krule->flags | krule->listnr;
- rule->action = krule->action;
- rule->field_count = krule->field_count;
- for (i = 0; i < rule->field_count; i++) {
- rule->values[i] = krule->fields[i].val;
- rule->fields[i] = krule->fields[i].type;
-
- if (krule->vers_ops == 1) {
- if (krule->fields[i].op == Audit_not_equal)
- rule->fields[i] |= AUDIT_NEGATE;
- } else {
- rule->fields[i] |= audit_ops[krule->fields[i].op];
- }
- }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
-
- return rule;
-}
-
/* Translate kernel rule respresentation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
@@ -707,6 +674,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+ return 1;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -740,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM \'%s\' is "
- "invalid\n", df->lsm_str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
ret = 0;
}
@@ -899,6 +883,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
+ /*
+ * normally audit_add_tree_rule() will free it
+ * on failure
+ */
+ if (tree)
+ audit_put_tree(tree);
goto error;
}
}
@@ -998,37 +988,8 @@ out:
return ret;
}
-/* List rules using struct audit_rule. Exists for backward
- * compatibility with userspace. */
-static void audit_list(int pid, int seq, struct sk_buff_head *q)
-{
- struct sk_buff *skb;
- struct audit_krule *r;
- int i;
-
- /* This is a blocking read, so use audit_filter_mutex instead of rcu
- * iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
- list_for_each_entry(r, &audit_rules_list[i], list) {
- struct audit_rule *rule;
-
- rule = audit_krule_to_rule(r);
- if (unlikely(!rule))
- break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
- rule, sizeof(*rule));
- if (skb)
- skb_queue_tail(q, skb);
- kfree(rule);
- }
- }
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
- if (skb)
- skb_queue_tail(q, skb);
-}
-
/* List rules using struct audit_rule_data. */
-static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
@@ -1043,24 +1004,25 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
- data, sizeof(*data) + data->buflen);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
+ 0, 1, data,
+ sizeof(*data) + data->buflen);
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
/* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
- char *action, struct audit_krule *rule,
- int res)
+static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
+ uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+ unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1068,17 +1030,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_task_context(ab);
audit_log_format(ab, " op=");
audit_log_string(ab, action);
audit_log_key(ab, rule->filterkey);
@@ -1087,83 +1040,37 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
}
/**
- * audit_receive_filter - apply all rules to the specified message type
+ * audit_rule_change - apply all rules to the specified message type
* @type: audit message type
- * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
+ * @portid: target port id for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
*/
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
- size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+int audit_rule_change(int type, __u32 portid, int seq, void *data,
+ size_t datasz)
{
- struct task_struct *tsk;
- struct audit_netlink_list *dest;
int err = 0;
struct audit_entry *entry;
switch (type) {
- case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- /* We can't just spew out the rules here because we might fill
- * the available socket buffer space and deadlock waiting for
- * auditctl to read from it... which isn't ever going to
- * happen if we're actually running in the context of auditctl
- * trying to _send_ the stuff */
-
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
- if (!dest)
- return -ENOMEM;
- dest->pid = pid;
- skb_queue_head_init(&dest->q);
-
- mutex_lock(&audit_filter_mutex);
- if (type == AUDIT_LIST)
- audit_list(pid, seq, &dest->q);
- else
- audit_list_rules(pid, seq, &dest->q);
- mutex_unlock(&audit_filter_mutex);
-
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
- if (IS_ERR(tsk)) {
- skb_queue_purge(&dest->q);
- kfree(dest);
- err = PTR_ERR(tsk);
- }
- break;
- case AUDIT_ADD:
case AUDIT_ADD_RULE:
- if (type == AUDIT_ADD)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("add rule", &entry->rule, !err);
if (err)
audit_free_rule(entry);
break;
- case AUDIT_DEL:
case AUDIT_DEL_RULE:
- if (type == AUDIT_DEL)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("remove rule", &entry->rule, !err);
audit_free_rule(entry);
break;
default:
@@ -1173,6 +1080,46 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return err;
}
+/**
+ * audit_list_rules_send - list the audit rules
+ * @request_skb: skb of request we are replying to (used to target the reply)
+ * @seq: netlink audit message sequence (serial) number
+ */
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
+{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct task_struct *tsk;
+ struct audit_netlink_list *dest;
+ int err = 0;
+
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest->net = get_net(net);
+ dest->portid = portid;
+ skb_queue_head_init(&dest->q);
+
+ mutex_lock(&audit_filter_mutex);
+ audit_list_rules(portid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+
+ return err;
+}
+
int audit_comparator(u32 left, u32 op, u32 right)
{
switch (op) {
@@ -1198,69 +1145,142 @@ int audit_comparator(u32 left, u32 op, u32 right)
}
}
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen)
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
- int dlen, plen;
- const char *p;
+ switch (op) {
+ case Audit_equal:
+ return uid_eq(left, right);
+ case Audit_not_equal:
+ return !uid_eq(left, right);
+ case Audit_lt:
+ return uid_lt(left, right);
+ case Audit_le:
+ return uid_lte(left, right);
+ case Audit_gt:
+ return uid_gt(left, right);
+ case Audit_ge:
+ return uid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
- if (!dname || !path)
- return 1;
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return gid_eq(left, right);
+ case Audit_not_equal:
+ return !gid_eq(left, right);
+ case Audit_lt:
+ return gid_lt(left, right);
+ case Audit_le:
+ return gid_lte(left, right);
+ case Audit_gt:
+ return gid_gt(left, right);
+ case Audit_ge:
+ return gid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+ int plen;
+ const char *p;
- dlen = strlen(dname);
plen = strlen(path);
- if (plen < dlen)
- return 1;
+
+ if (plen == 0)
+ return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
- /* find last path component */
- p = p - dlen + 1;
- if (p < path)
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
return 1;
- else if (p > path) {
- if (*--p != '/')
- return 1;
- else
- p++;
- }
- /* return length of path's directory component */
- if (dirlen)
- *dirlen = p - path;
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
- struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule, int type,
enum audit_state *state)
{
int i;
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ pid_t pid;
int result = 0;
u32 sid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(cb->creds.pid, f->op, f->val);
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
- result = audit_comparator(cb->creds.uid, f->op, f->val);
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cb->creds.gid, f->op, f->val);
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(audit_get_loginuid(current),
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
f->op, f->val);
break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(type, f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -1287,23 +1307,26 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
return 1;
}
-int audit_filter_user(struct netlink_skb_parms *cb)
+int audit_filter_user(int type)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
- int ret = 1;
+ int rc, ret;
+
+ ret = 1; /* Audit by default */
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(cb, &e->rule, &state)) {
- if (state == AUDIT_DISABLED)
+ rc = audit_filter_user_rules(&e->rule, type, &state);
+ if (rc) {
+ if (rc > 0 && state == AUDIT_DISABLED)
ret = 0;
break;
}
}
rcu_read_unlock();
- return ret; /* Audit by default */
+ return ret;
}
int audit_filter_type(int type)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34ea..21eae3c05ec 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,6 +42,8 @@
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <asm/types.h>
#include <linux/atomic.h>
@@ -67,6 +69,8 @@
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
#include "audit.h"
@@ -75,59 +79,18 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). If we get more names we will allocate
- * a name dynamically and also add those to the list anchored by names_list. */
-#define AUDIT_NAMES 5
-
-/* Indicates that audit should log the full pathname. */
-#define AUDIT_NAME_FULL -1
-
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
/* number of audit rules */
int audit_n_rules;
/* determines whether we collect data for signals sent */
int audit_signals;
-struct audit_cap_data {
- kernel_cap_t permitted;
- kernel_cap_t inheritable;
- union {
- unsigned int fE; /* effective bit of a file capability */
- kernel_cap_t effective; /* effective set of a process */
- };
-};
-
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
- *
- * Further, in fs/namei.c:path_lookup() we store the inode and device. */
-struct audit_names {
- struct list_head list; /* audit_context->names_list */
- const char *name;
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- uid_t uid;
- gid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
- int name_len; /* number of name's characters to log */
- bool name_put; /* call __putname() for this name */
- /*
- * This was an allocated audit_names and not from the array of
- * names allocated in the task audit context. Thus this name
- * should be freed on syscall exit
- */
- bool should_free;
-};
-
struct audit_aux_data {
struct audit_aux_data *next;
int type;
@@ -138,18 +101,11 @@ struct audit_aux_data {
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
-struct audit_aux_data_execve {
- struct audit_aux_data d;
- int argc;
- int envc;
- struct mm_struct *mm;
-};
-
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
- uid_t target_auid[AUDIT_AUX_PIDS];
- uid_t target_uid[AUDIT_AUX_PIDS];
+ kuid_t target_auid[AUDIT_AUX_PIDS];
+ kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -164,118 +120,11 @@ struct audit_aux_data_bprm_fcaps {
struct audit_cap_data new_pcap;
};
-struct audit_aux_data_capset {
- struct audit_aux_data d;
- pid_t pid;
- struct audit_cap_data cap;
-};
-
struct audit_tree_refs {
struct audit_tree_refs *next;
struct audit_chunk *c[31];
};
-/* The per-task audit context. */
-struct audit_context {
- int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
- enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
- int major; /* syscall number */
- struct timespec ctime; /* time of syscall entry */
- unsigned long argv[4]; /* syscall arguments */
- long return_code;/* syscall return code */
- u64 prio;
- int return_valid; /* return code is valid */
- /*
- * The names_list is the list of all audit_names collected during this
- * syscall. The first AUDIT_NAMES entries in the names_list will
- * actually be from the preallocated_names array for performance
- * reasons. Except during allocation they should never be referenced
- * through the preallocated_names array and should only be found/used
- * by running the names_list.
- */
- struct audit_names preallocated_names[AUDIT_NAMES];
- int name_count; /* total records in names_list */
- struct list_head names_list; /* anchor for struct audit_names->list */
- char * filterkey; /* key for rule that triggered record */
- struct path pwd;
- struct audit_context *previous; /* For nested syscalls */
- struct audit_aux_data *aux;
- struct audit_aux_data *aux_pids;
- struct sockaddr_storage *sockaddr;
- size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
- uid_t uid, euid, suid, fsuid;
- gid_t gid, egid, sgid, fsgid;
- unsigned long personality;
- int arch;
-
- pid_t target_pid;
- uid_t target_auid;
- uid_t target_uid;
- unsigned int target_sessionid;
- u32 target_sid;
- char target_comm[TASK_COMM_LEN];
-
- struct audit_tree_refs *trees, *first_trees;
- struct list_head killed_trees;
- int tree_count;
-
- int type;
- union {
- struct {
- int nargs;
- long args[6];
- } socketcall;
- struct {
- uid_t uid;
- gid_t gid;
- umode_t mode;
- u32 osid;
- int has_perm;
- uid_t perm_uid;
- gid_t perm_gid;
- umode_t perm_mode;
- unsigned long qbytes;
- } ipc;
- struct {
- mqd_t mqdes;
- struct mq_attr mqstat;
- } mq_getsetattr;
- struct {
- mqd_t mqdes;
- int sigev_signo;
- } mq_notify;
- struct {
- mqd_t mqdes;
- size_t msg_len;
- unsigned int msg_prio;
- struct timespec abs_timeout;
- } mq_sendrecv;
- struct {
- int oflag;
- umode_t mode;
- struct mq_attr attr;
- } mq_open;
- struct {
- pid_t pid;
- struct audit_cap_data cap;
- } capset;
- struct {
- int fd;
- int flags;
- } mmap;
- };
- int fds[2];
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
-};
-
static inline int open_arg(int flags, int mask)
{
int n = ACC_MODE(flags);
@@ -463,37 +312,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
-static int audit_compare_id(uid_t uid1,
- struct audit_names *name,
- unsigned long name_offset,
- struct audit_field *f,
- struct audit_context *ctx)
+static int audit_compare_uid(kuid_t uid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
{
struct audit_names *n;
- unsigned long addr;
- uid_t uid2;
int rc;
-
- BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
-
+
if (name) {
- addr = (unsigned long)name;
- addr += name_offset;
-
- uid2 = *(uid_t *)addr;
- rc = audit_comparator(uid1, f->op, uid2);
+ rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- addr = (unsigned long)n;
- addr += name_offset;
-
- uid2 = *(uid_t *)addr;
+ rc = audit_uid_comparator(uid, f->op, n->uid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
- rc = audit_comparator(uid1, f->op, uid2);
+static int audit_compare_gid(kgid_t gid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_gid_comparator(gid, f->op, name->gid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_gid_comparator(gid, f->op, n->gid);
if (rc)
return rc;
}
@@ -510,80 +369,62 @@ static int audit_field_compare(struct task_struct *tsk,
switch (f->val) {
/* process to file object comparisons */
case AUDIT_COMPARE_UID_TO_OBJ_UID:
- return audit_compare_id(cred->uid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->uid, name, f, ctx);
case AUDIT_COMPARE_GID_TO_OBJ_GID:
- return audit_compare_id(cred->gid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->gid, name, f, ctx);
case AUDIT_COMPARE_EUID_TO_OBJ_UID:
- return audit_compare_id(cred->euid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->euid, name, f, ctx);
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
- return audit_compare_id(cred->egid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
- return audit_compare_id(tsk->loginuid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(tsk->loginuid, name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
- return audit_compare_id(cred->suid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
- return audit_compare_id(cred->sgid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->sgid, name, f, ctx);
case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
- return audit_compare_id(cred->fsuid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->fsuid, name, f, ctx);
case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
- return audit_compare_id(cred->fsgid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
- return audit_comparator(cred->uid, f->op, tsk->loginuid);
+ return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
case AUDIT_COMPARE_UID_TO_EUID:
- return audit_comparator(cred->uid, f->op, cred->euid);
+ return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
- return audit_comparator(cred->uid, f->op, cred->suid);
+ return audit_uid_comparator(cred->uid, f->op, cred->suid);
case AUDIT_COMPARE_UID_TO_FSUID:
- return audit_comparator(cred->uid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
- return audit_comparator(tsk->loginuid, f->op, cred->euid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
- return audit_comparator(tsk->loginuid, f->op, cred->suid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
- return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
- return audit_comparator(cred->euid, f->op, cred->suid);
+ return audit_uid_comparator(cred->euid, f->op, cred->suid);
case AUDIT_COMPARE_EUID_TO_FSUID:
- return audit_comparator(cred->euid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
/* suid comparisons */
case AUDIT_COMPARE_SUID_TO_FSUID:
- return audit_comparator(cred->suid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
/* gid comparisons */
case AUDIT_COMPARE_GID_TO_EGID:
- return audit_comparator(cred->gid, f->op, cred->egid);
+ return audit_gid_comparator(cred->gid, f->op, cred->egid);
case AUDIT_COMPARE_GID_TO_SGID:
- return audit_comparator(cred->gid, f->op, cred->sgid);
+ return audit_gid_comparator(cred->gid, f->op, cred->sgid);
case AUDIT_COMPARE_GID_TO_FSGID:
- return audit_comparator(cred->gid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
/* egid comparisons */
case AUDIT_COMPARE_EGID_TO_SGID:
- return audit_comparator(cred->egid, f->op, cred->sgid);
+ return audit_gid_comparator(cred->egid, f->op, cred->sgid);
case AUDIT_COMPARE_EGID_TO_FSGID:
- return audit_comparator(cred->egid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
/* sgid comparison */
case AUDIT_COMPARE_SGID_TO_FSGID:
- return audit_comparator(cred->sgid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
default:
WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
return 0;
@@ -616,41 +457,57 @@ static int audit_filter_rules(struct task_struct *tsk,
struct audit_field *f = &rule->fields[i];
struct audit_names *n;
int result = 0;
+ pid_t pid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(tsk->pid, f->op, f->val);
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
- ctx->ppid = sys_getppid();
+ ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
case AUDIT_UID:
- result = audit_comparator(cred->uid, f->op, f->val);
+ result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
case AUDIT_EUID:
- result = audit_comparator(cred->euid, f->op, f->val);
+ result = audit_uid_comparator(cred->euid, f->op, f->uid);
break;
case AUDIT_SUID:
- result = audit_comparator(cred->suid, f->op, f->val);
+ result = audit_uid_comparator(cred->suid, f->op, f->uid);
break;
case AUDIT_FSUID:
- result = audit_comparator(cred->fsuid, f->op, f->val);
+ result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cred->gid, f->op, f->val);
+ result = audit_gid_comparator(cred->gid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_group_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_group_p(f->gid);
+ }
break;
case AUDIT_EGID:
- result = audit_comparator(cred->egid, f->op, f->val);
+ result = audit_gid_comparator(cred->egid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_egroup_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_egroup_p(f->gid);
+ }
break;
case AUDIT_SGID:
- result = audit_comparator(cred->sgid, f->op, f->val);
+ result = audit_gid_comparator(cred->sgid, f->op, f->gid);
break;
case AUDIT_FSGID:
- result = audit_comparator(cred->fsgid, f->op, f->val);
+ result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
@@ -704,7 +561,7 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_INODE:
if (name)
- result = (name->ino == f->val);
+ result = audit_comparator(name->ino, f->op, f->val);
else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_comparator(n->ino, f->op, f->val)) {
@@ -716,10 +573,10 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_OBJ_UID:
if (name) {
- result = audit_comparator(name->uid, f->op, f->val);
+ result = audit_uid_comparator(name->uid, f->op, f->uid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (audit_comparator(n->uid, f->op, f->val)) {
+ if (audit_uid_comparator(n->uid, f->op, f->uid)) {
++result;
break;
}
@@ -728,10 +585,10 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_OBJ_GID:
if (name) {
- result = audit_comparator(name->gid, f->op, f->val);
+ result = audit_gid_comparator(name->gid, f->op, f->gid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (audit_comparator(n->gid, f->op, f->val)) {
+ if (audit_gid_comparator(n->gid, f->op, f->gid)) {
++result;
break;
}
@@ -749,7 +606,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID:
result = 0;
if (ctx)
- result = audit_comparator(tsk->loginuid, f->op, f->val);
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
@@ -868,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
return AUDIT_BUILD_CONTEXT;
}
+static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+{
+ int word, bit;
+
+ if (val > 0xffffffff)
+ return false;
+
+ word = AUDIT_WORD(val);
+ if (word >= AUDIT_BITMASK_SIZE)
+ return false;
+
+ bit = AUDIT_BIT(val);
+
+ return rule->mask[word] & bit;
+}
+
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
@@ -885,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
-
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
rcu_read_unlock();
@@ -909,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
struct audit_context *ctx) {
- int word, bit;
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
struct audit_entry *e;
enum audit_state state;
- word = AUDIT_WORD(ctx->major);
- bit = AUDIT_BIT(ctx->major);
-
if (list_empty(list))
return 0;
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
ctx->current_state = state;
return 1;
@@ -953,7 +822,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
int return_valid,
long return_code)
{
@@ -990,22 +860,30 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
return context;
}
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d"
- " ino_count=%d [NOT freeing]\n",
- __FILE__, __LINE__,
+ int i = 0;
+
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d ino_count=%d"
+ " [NOT freeing]\n", __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ pr_err("names[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -1019,7 +897,7 @@ static inline void audit_free_names(struct audit_context *context)
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
if (n->name && n->name_put)
- __putname(n->name);
+ final_putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1043,21 +921,15 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
-static inline void audit_zero_context(struct audit_context *context,
- enum audit_state state)
-{
- memset(context, 0, sizeof(*context));
- context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
- if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
return NULL;
- audit_zero_context(context, state);
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
return context;
@@ -1082,8 +954,10 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED)
+ if (state == AUDIT_DISABLED) {
+ clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
return 0;
+ }
if (!(context = audit_alloc_context(state))) {
kfree(key);
@@ -1099,91 +973,18 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
- struct audit_context *previous;
- int count = 0;
-
- do {
- previous = context->previous;
- if (previous || (count && count < 10)) {
- ++count;
- printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
- " freeing multiple contexts (%d)\n",
- context->serial, context->major,
- context->name_count, count);
- }
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- free_tree_refs(context);
- audit_free_aux(context);
- kfree(context->filterkey);
- kfree(context->sockaddr);
- kfree(context);
- context = previous;
- } while (context);
- if (count >= 10)
- printk(KERN_ERR "audit: freed %d contexts\n", count);
-}
-
-void audit_log_task_context(struct audit_buffer *ab)
-{
- char *ctx = NULL;
- unsigned len;
- int error;
- u32 sid;
-
- security_task_getsecid(current, &sid);
- if (!sid)
- return;
-
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
- return;
- }
-
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- return;
-
-error_path:
- audit_panic("error in audit_log_task_context");
- return;
-}
-
-EXPORT_SYMBOL(audit_log_task_context);
-
-static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
-{
- char name[sizeof(tsk->comm)];
- struct mm_struct *mm = tsk->mm;
- struct vm_area_struct *vma;
-
- /* tsk == current */
-
- get_task_comm(name, tsk);
- audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, name);
-
- if (mm) {
- down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) &&
- vma->vm_file) {
- audit_log_d_path(ab, " exe=",
- &vma->vm_file->f_path);
- break;
- }
- vma = vma->vm_next;
- }
- up_read(&mm->mmap_sem);
- }
- audit_log_task_context(ab);
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ free_tree_refs(context);
+ audit_free_aux(context);
+ kfree(context->filterkey);
+ kfree(context->sockaddr);
+ audit_proctitle_free(context);
+ kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- uid_t auid, uid_t uid, unsigned int sessionid,
+ kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
{
struct audit_buffer *ab;
@@ -1195,14 +996,17 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
if (!ab)
return rc;
- audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
- uid, sessionid);
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid), sessionid);
+ if (sid) {
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
}
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
@@ -1359,20 +1163,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,
}
static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab,
- struct audit_aux_data_execve *axi)
+ struct audit_buffer **ab)
{
int i, len;
size_t len_sent = 0;
const char __user *p;
char *buf;
- if (axi->mm != current->mm)
- return; /* execve failed, no additional info */
+ p = (const char __user *)current->mm->arg_start;
- p = (const char __user *)axi->mm->arg_start;
-
- audit_log_format(*ab, "argc=%d", axi->argc);
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1382,11 +1182,11 @@ static void audit_log_execve_info(struct audit_context *context,
*/
buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf) {
- audit_panic("out of memory for argv string\n");
+ audit_panic("out of memory for argv string");
return;
}
- for (i = 0; i < axi->argc; i++) {
+ for (i = 0; i < context->execve.argc; i++) {
len = audit_log_single_execve_arg(context, ab, i,
&len_sent, p, buf);
if (len <= 0)
@@ -1396,35 +1196,6 @@ static void audit_log_execve_info(struct audit_context *context,
kfree(buf);
}
-static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
-}
-
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1446,7 +1217,9 @@ static void show_special(struct audit_context *context, int *call_panic)
u32 osid = context->ipc.osid;
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
- context->ipc.uid, context->ipc.gid, context->ipc.mode);
+ from_kuid(&init_user_ns, context->ipc.uid),
+ from_kgid(&init_user_ns, context->ipc.gid),
+ context->ipc.mode);
if (osid) {
char *ctx = NULL;
u32 len;
@@ -1462,14 +1235,14 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab,
"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
- if (!ab)
- return;
}
break; }
case AUDIT_MQ_OPEN: {
@@ -1516,94 +1289,74 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break; }
+ case AUDIT_EXECVE: {
+ audit_log_execve_info(context, &ab);
+ break; }
}
audit_log_end(ab);
}
-static void audit_log_name(struct audit_context *context, struct audit_names *n,
- int record_num, int *call_panic)
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
- return; /* audit_panic has been called */
+ return; /* audit_panic or being filtered */
- audit_log_format(ab, "item=%d", record_num);
+ audit_log_format(ab, "proctitle=");
- if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
- n->name_len);
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
}
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- n->uid,
- n->gid,
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
}
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
}
-
- audit_log_fcaps(ab, n);
-
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
audit_log_end(ab);
}
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
- const struct cred *cred;
int i, call_panic = 0;
struct audit_buffer *ab;
struct audit_aux_data *aux;
- const char *tty;
struct audit_names *n;
/* tsk == current */
- context->pid = tsk->pid;
- if (!context->ppid)
- context->ppid = sys_getppid();
- cred = current_cred();
- context->uid = cred->uid;
- context->gid = cred->gid;
- context->euid = cred->euid;
- context->suid = cred->suid;
- context->fsuid = cred->fsuid;
- context->egid = cred->egid;
- context->sgid = cred->sgid;
- context->fsgid = cred->fsgid;
context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1618,32 +1371,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " ppid=%d pid=%d auid=%u uid=%u gid=%u"
- " euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count,
- context->ppid,
- context->pid,
- tsk->loginuid,
- context->uid,
- context->gid,
- context->euid, context->suid, context->fsuid,
- context->egid, context->sgid, context->fsgid, tty,
- tsk->sessionid);
-
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
audit_log_task_info(ab, tsk);
audit_log_key(ab, context->filterkey);
@@ -1657,11 +1391,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
switch (aux->type) {
- case AUDIT_EXECVE: {
- struct audit_aux_data_execve *axi = (void *)aux;
- audit_log_execve_info(context, &ab, axi);
- break; }
-
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1731,8 +1460,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
i = 0;
- list_for_each_entry(n, &context->names_list, list)
- audit_log_name(context, n, i++, &call_panic);
+ list_for_each_entry(n, &context->names_list, list) {
+ if (n->hidden)
+ continue;
+ audit_log_name(context, n, NULL, i++, &call_panic);
+ }
+
+ audit_log_proctitle(tsk, context);
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1752,7 +1486,7 @@ void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- context = audit_get_context(tsk, 0, 0);
+ context = audit_take_context(tsk, 0, 0);
if (!context)
return;
@@ -1797,42 +1531,6 @@ void __audit_syscall_entry(int arch, int major,
if (!context)
return;
- /*
- * This happens only on certain architectures that make system
- * calls in kernel_thread via the entry.S interface, instead of
- * with direct calls. (If you are porting to a new
- * architecture, hitting this condition can indicate that you
- * got the _exit/_leave calls backward in entry.S.)
- *
- * i386 no
- * x86_64 no
- * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
- *
- * This also happens with vm86 emulation in a non-nested manner
- * (entries without exits), so this case must be caught.
- */
- if (context->in_syscall) {
- struct audit_context *newctx;
-
-#if AUDIT_DEBUG
- printk(KERN_ERR
- "audit(:%d) pid=%d in syscall=%d;"
- " entering syscall=%d\n",
- context->serial, tsk->pid, context->major, major);
-#endif
- newctx = audit_alloc_context(context->state);
- if (newctx) {
- newctx->previous = context;
- context = newctx;
- tsk->audit_context = newctx;
- } else {
- /* If we can't alloc a new context, the best we
- * can do is to leak memory (any pending putname
- * will be lost). The only other alternative is
- * to abandon auditing. */
- audit_zero_context(context, context->state);
- }
- }
BUG_ON(context->in_syscall || context->name_count);
if (!audit_enabled)
@@ -1882,7 +1580,7 @@ void __audit_syscall_exit(int success, long return_code)
else
success = AUDITSC_FAILURE;
- context = audit_get_context(tsk, success, return_code);
+ context = audit_take_context(tsk, success, return_code);
if (!context)
return;
@@ -1895,28 +1593,21 @@ void __audit_syscall_exit(int success, long return_code)
if (!list_empty(&context->killed_trees))
audit_kill_trees(&context->killed_trees);
- if (context->previous) {
- struct audit_context *new_context = context->previous;
- context->previous = NULL;
- audit_free_context(context);
- tsk->audit_context = new_context;
- } else {
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
- tsk->audit_context = context;
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ audit_free_aux(context);
+ context->aux = NULL;
+ context->aux_pids = NULL;
+ context->target_pid = 0;
+ context->target_sid = 0;
+ context->sockaddr_len = 0;
+ context->type = 0;
+ context->fds[0] = -1;
+ if (context->state != AUDIT_RECORD_CONTEXT) {
+ kfree(context->filterkey);
+ context->filterkey = NULL;
}
+ tsk->audit_context = context;
}
static inline void handle_one(const struct inode *inode)
@@ -1939,7 +1630,7 @@ static inline void handle_one(const struct inode *inode)
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
- printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
@@ -1998,8 +1689,7 @@ retry:
goto retry;
}
/* too bad */
- printk(KERN_WARNING
- "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
@@ -2008,7 +1698,8 @@ retry:
#endif
}
-static struct audit_names *audit_alloc_name(struct audit_context *context)
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
{
struct audit_names *aname;
@@ -2023,6 +1714,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
aname->ino = (unsigned long)-1;
+ aname->type = type;
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
@@ -2033,33 +1725,62 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr)
+ return n->name;
+ }
+ return NULL;
+}
+
+/**
* audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void __audit_getname(const char *name)
+void __audit_getname(struct filename *name)
{
struct audit_context *context = current->audit_context;
struct audit_names *n;
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ pr_err("%s:%d(:%d): ignoring getname(%p)\n",
__FILE__, __LINE__, context->serial, name);
dump_stack();
#endif
return;
}
- n = audit_alloc_name(context);
+#if AUDIT_DEBUG
+ /* The filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
n->name = name;
n->name_len = AUDIT_NAME_FULL;
n->name_put = true;
+ name->aname = n;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
@@ -2072,112 +1793,124 @@ void __audit_getname(const char *name)
* then we delay the putname until syscall exit.
* Called from include/linux/fs.h:putname().
*/
-void audit_putname(const char *name)
+void audit_putname(struct filename *name)
{
struct audit_context *context = current->audit_context;
BUG_ON(!context);
- if (!context->in_syscall) {
+ if (!name->aname || !context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ pr_err("%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
struct audit_names *n;
- int i;
+ int i = 0;
list_for_each_entry(n, &context->names_list, list)
- printk(KERN_ERR "name[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ pr_err("name[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
#endif
- __putname(name);
+ final_putname(name);
}
#if AUDIT_DEBUG
else {
++context->put_count;
if (context->put_count > context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d"
- " in_syscall=%d putname(%p) name_count=%d"
- " put_count=%d\n",
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
+ " name_count=%d put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
- context->in_syscall, name, context->name_count,
- context->put_count);
+ context->in_syscall, name->name,
+ context->name_count, context->put_count);
dump_stack();
}
}
#endif
}
-static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-
-/* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
/**
- * audit_inode - store the inode and device from a lookup
+ * __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- *
- * Called from fs/namei.c:path_lookup().
+ * @flags: attributes for this particular entry
*/
-void __audit_inode(const char *name, const struct dentry *dentry)
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int flags)
{
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
struct audit_names *n;
+ bool parent = flags & AUDIT_INODE_PARENT;
if (!context->in_syscall)
return;
+ if (!name)
+ goto out_alloc;
+
+#if AUDIT_DEBUG
+ /* The struct filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (n->name && (n->name == name))
- goto out;
+ /* does the name pointer match? */
+ if (!n->name || n->name->name != name->name)
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
}
- /* unable to find the name from a previous getname() */
- n = audit_alloc_name(context);
+out_alloc:
+ /* unable to find the name from a previous getname(). Allocate a new
+ * anonymous entry.
+ */
+ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
if (!n)
return;
out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ if (flags & AUDIT_INODE_HIDDEN)
+ n->hidden = true;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
+ }
handle_path(dentry);
audit_copy_inode(n, dentry, inode);
}
/**
- * audit_inode_child - collect inode info for created/removed objects
- * @dentry: dentry being audited
+ * __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -2187,15 +1920,14 @@ out:
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct dentry *dentry,
- const struct inode *parent)
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
const char *dname = dentry->d_name.name;
- struct audit_names *n;
- int dirlen = 0;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
if (!context->in_syscall)
return;
@@ -2203,62 +1935,65 @@ void __audit_inode_child(const struct dentry *dentry,
if (inode)
handle_one(inode);
- /* parent is more likely, look for it first */
+ /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ if (!n->name || n->type != AUDIT_TYPE_PARENT)
continue;
if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- n->name_len = dirlen; /* update parent data in place */
- found_parent = n->name;
- goto add_names;
+ !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ found_parent = n;
+ break;
}
}
- /* no matching parent, look for matching child */
+ /* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ /* can only match entries that have a name */
+ if (!n->name || n->type != type)
continue;
- /* strcmp() is the more likely scenario */
- if (!strcmp(dname, n->name) ||
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- if (inode)
- audit_copy_inode(n, NULL, inode);
- else
- n->ino = (unsigned long)-1;
- found_child = n->name;
- goto add_names;
+ /* if we found a parent, make sure this one is a child of it */
+ if (found_parent && (n->name != found_parent->name))
+ continue;
+
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ found_child = n;
+ break;
}
}
-add_names:
if (!found_parent) {
- n = audit_alloc_name(context);
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- n = audit_alloc_name(context);
- if (!n)
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
return;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- n->name = found_parent;
- n->name_len = AUDIT_NAME_FULL;
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- n->name_put = false;
+ found_child->name_put = false;
}
-
- if (inode)
- audit_copy_inode(n, NULL, inode);
}
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2290,6 +2025,47 @@ int auditsc_get_stamp(struct audit_context *ctx,
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid, unsigned int sessionid,
+ int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+
+ if (!audit_enabled)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+ audit_log_end(ab);
+}
+
/**
* audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
@@ -2298,38 +2074,29 @@ static atomic_t session_id = ATOMIC_INIT(0);
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(uid_t loginuid)
+int audit_set_loginuid(kuid_t loginuid)
{
struct task_struct *task = current;
- struct audit_context *context = task->audit_context;
- unsigned int sessionid;
+ unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ kuid_t oldloginuid;
+ int rc;
-#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
- if (task->loginuid != -1)
- return -EPERM;
-#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
- sessionid = atomic_inc_return(&session_id);
- if (context && context->in_syscall) {
- struct audit_buffer *ab;
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (ab) {
- audit_log_format(ab, "login pid=%d uid=%u "
- "old auid=%u new auid=%u"
- " old ses=%u new ses=%u",
- task->pid, task_uid(task),
- task->loginuid, loginuid,
- task->sessionid, sessionid);
- audit_log_end(ab);
- }
- }
task->sessionid = sessionid;
task->loginuid = loginuid;
- return 0;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
}
/**
@@ -2450,38 +2217,31 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
context->ipc.has_perm = 1;
}
-int __audit_bprm(struct linux_binprm *bprm)
+void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- ax = kmalloc(sizeof(*ax), GFP_KERNEL);
- if (!ax)
- return -ENOMEM;
-
- ax->argc = bprm->argc;
- ax->envc = bprm->envc;
- ax->mm = bprm->mm;
- ax->d.type = AUDIT_EXECVE;
- ax->d.next = context->aux;
- context->aux = (void *)ax;
- return 0;
+ context->type = AUDIT_EXECVE;
+ context->execve.argc = bprm->argc;
}
/**
* audit_socketcall - record audit data for sys_socketcall
- * @nargs: number of args
+ * @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
-void __audit_socketcall(int nargs, unsigned long *args)
+int __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
+ if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
+ return -EINVAL;
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
+ return 0;
}
/**
@@ -2524,7 +2284,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = t->pid;
+ context->target_pid = task_pid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2545,12 +2305,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
- uid_t uid = current_uid(), t_uid = task_uid(t);
+ kuid_t uid = current_uid(), t_uid = task_uid(t);
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = tsk->pid;
- if (tsk->loginuid != -1)
+ audit_sig_pid = task_pid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
@@ -2563,7 +2323,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
- ctx->target_pid = t->tgid;
+ ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
@@ -2584,7 +2344,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
- axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2643,18 +2403,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
/**
* __audit_log_capset - store information about the arguments to the capset syscall
- * @pid: target pid of the capset call
* @new: the new credentials
* @old: the old (current) credentials
*
* Record the aguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
-void __audit_log_capset(pid_t pid,
- const struct cred *new, const struct cred *old)
+void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = pid;
+ context->capset.pid = task_pid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2669,25 +2427,34 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
-static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+static void audit_log_task(struct audit_buffer *ab)
{
- uid_t auid, uid;
- gid_t gid;
+ kuid_t auid, uid;
+ kgid_t gid;
unsigned int sessionid;
+ struct mm_struct *mm = current->mm;
auid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
current_uid_gid(&uid, &gid);
audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
audit_log_untrustedstring(ab, current->comm);
- audit_log_format(ab, " reason=");
- audit_log_string(ab, reason);
- audit_log_format(ab, " sig=%ld", signr);
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
}
+
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
@@ -2706,17 +2473,26 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "memory violation", signr);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", SIGKILL);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c..1323360d90e 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
static void backtrace_test_normal(void)
{
- printk("Testing a backtrace from process context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from process context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
- printk("Testing a backtrace from irq context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from irq context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
struct stack_trace trace;
unsigned long entries[8];
- printk("Testing a saved backtrace.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a saved backtrace.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
#else
static void backtrace_test_saved(void)
{
- printk("Saved backtrace test skipped.\n");
+ pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
- printk("====[ backtrace testing ]===========\n");
+ pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
- printk("====[ end of backtrace testing ]====\n");
+ pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b..9fd4246b04b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,8 @@
#include <linux/mmzone.h>
#include <linux/kbuild.h>
#include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
void foo(void)
{
@@ -17,5 +19,9 @@ void foo(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+ DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+ DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c647..a5cf13c018c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,8 @@
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
@@ -22,7 +24,6 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-
EXPORT_SYMBOL(__cap_empty_set);
int file_caps_enabled = 1;
@@ -42,15 +43,10 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- static int warned;
- if (!warned) {
- char name[sizeof(current->comm)];
-
- printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
- " (legacy support in use)\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
+ get_task_comm(name, current));
}
/*
@@ -71,16 +67,10 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- static int warned;
-
- if (!warned) {
- char name[sizeof(current->comm)];
+ char name[sizeof(current->comm)];
- printk(KERN_INFO "warning: `%s' uses deprecated v2"
- " capabilities in a way that may be insecure.\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
+ get_task_comm(name, current));
}
/*
@@ -198,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
- * unexpectidly fail; the capget/modify/capset aborts
+ * unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
@@ -277,7 +267,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (ret < 0)
goto error;
- audit_log_capset(pid, new, current_cred());
+ audit_log_capset(new, current_cred());
return commit_creds(new);
@@ -380,7 +370,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
- printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+ pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
@@ -393,6 +383,31 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns,
+ int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -409,13 +424,19 @@ bool capable(int cap)
EXPORT_SYMBOL(capable);
/**
- * nsown_capable - Check superior capability to one's own user_ns
+ * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @inode: The inode in question
* @cap: The capability in question
*
- * Return true if the current task has the given superior capability
- * targeted at its own user namespace.
+ * Return true if the current task has the given capability targeted at
+ * its own user namespace and that the given inode's uid and gid are
+ * mapped into the current user namespace.
*/
-bool nsown_capable(int cap)
+bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
- return ns_capable(current_user_ns(), cap);
+ struct user_namespace *ns = current_user_ns();
+
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
}
+EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f7..70776aec256 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,14 +26,16 @@
* distribution for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/fs.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
@@ -41,182 +43,135 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
-#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
-#include <linux/namei.h>
+#include <linux/hashtable.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/atomic.h>
/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on. Modifying requires both cgroup_mutex and
- * cgroup_root_mutex. Readers can acquire either of the two. This is to
- * break the following locking order cycle.
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
*
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
- *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
*/
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
+#else
static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_MUTEX(cgroup_root_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
+#endif
/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
*/
-#define SUBSYS(_x) &_x ## _subsys,
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
-#include <linux/cgroup_subsys.h>
-};
-
-#define MAX_CGROUP_ROOT_NAMELEN 64
+static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
*/
-struct cgroupfs_root {
- struct super_block *sb;
-
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
- unsigned long subsys_bits;
-
- /* Unique id for this hierarchy. */
- int hierarchy_id;
-
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_bits;
+static DEFINE_SPINLOCK(release_agent_path_lock);
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
+#define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
- /* The root cgroup for this hierarchy */
- struct cgroup top_cgroup;
-
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
-
- /* A list running through the active hierarchies */
- struct list_head root_list;
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
- /* Hierarchy-specific flags */
- unsigned long flags;
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
- /* The path to use for release notifications. */
- char release_agent_path[PATH_MAX];
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
- /* The name for this hierarchy - may be empty */
- char name[MAX_CGROUP_ROOT_NAMELEN];
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
/*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
*/
-static struct cgroupfs_root rootnode;
+struct cgroup_root cgrp_dfl_root;
/*
- * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- * cgroup_subsys->use_id != 0.
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
*/
-#define CSS_ID_MAX (65535)
-struct css_id {
- /*
- * The css to which this ID points. This pointer is set to valid value
- * after cgroup is populated. If cgroup is removed, this will be NULL.
- * This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
- */
- struct cgroup_subsys_state __rcu *css;
- /*
- * ID of this css.
- */
- unsigned short id;
- /*
- * Depth in hierarchy which this ID belongs to.
- */
- unsigned short depth;
- /*
- * ID is freed by RCU. (and lookup routine is RCU safe.)
- */
- struct rcu_head rcu_head;
- /*
- * Hierarchy of CSS ID belongs to.
- */
- unsigned short stack[0]; /* Array of Length (depth+1) */
-};
+static bool cgrp_dfl_root_visible;
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
- /*
- * Cgroup which the event belongs to.
- */
- struct cgroup *cgrp;
- /*
- * Control file which the event associated.
- */
- struct cftype *cft;
- /*
- * eventfd to signal userspace about the event.
- */
- struct eventfd_ctx *eventfd;
- /*
- * Each of these stored in a list by the cgroup.
- */
- struct list_head list;
- /*
- * All fields below needed to unregister event when
- * userspace closes eventfd.
- */
- poll_table pt;
- wait_queue_head_t *wqh;
- wait_queue_t wait;
- struct work_struct remove;
-};
+/* some controllers are not supported in the default hierarchy */
+static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
+#ifdef CONFIG_CGROUP_DEBUG
+ | (1 << debug_cgrp_id)
+#endif
+ ;
/* The list of hierarchy roots */
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
@@ -225,30 +180,152 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
*/
static int need_forkexit_callback __read_mostly;
-#ifdef CONFIG_PROVE_LOCKING
-int cgroup_lock_is_held(void)
+static struct cftype cgroup_base_files[];
+
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned int ss_mask);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
{
- return lockdep_is_held(&cgroup_mutex);
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
}
-#else /* #ifdef CONFIG_PROVE_LOCKING */
-int cgroup_lock_is_held(void)
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
- return mutex_is_locked(&cgroup_mutex);
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
}
-#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
-EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgroup_parent(cgrp);
+
+ return cgroup_css(cgrp, ss);
+}
/* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
- return test_bit(CGRP_REMOVED, &cgrp->flags);
+ return !(cgrp->self.flags & CSS_ONLINE);
}
-/* bits in struct cgroupfs_root flags field */
-enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
-};
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgroup_parent(cgrp);
+ }
+ return false;
+}
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -263,21 +340,55 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
@@ -287,40 +398,80 @@ static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
- /*
- * List running through cg_cgroup_links associated with a
- * cgroup, anchored on cgroup->css_sets
- */
- struct list_head cgrp_link_list;
- struct cgroup *cgrp;
- /*
- * List running through cg_cgroup_links pointing at a
- * single css_set object, anchored on css_set->cg_links
- */
- struct list_head cg_link_list;
- struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
};
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
-static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static int css_set_count = 1; /* 1 for init_css_set */
-static int cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *css);
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
/*
* hash table for cgroup groups. This improves the performance to find
@@ -328,141 +479,136 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
int i;
- int index;
- unsigned long tmp = 0UL;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- return &css_set_table[index];
+ return key;
}
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cg->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cg->refcount)) {
- write_unlock(&css_set_lock);
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (!atomic_dec_and_test(&cset->refcount))
return;
- }
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
+ hash_del(&cset->hlist);
css_set_count--;
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
struct cgroup *cgrp = link->cgrp;
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- if (atomic_dec_and_test(&cgrp->count) &&
- notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+
+ /* @cgrp can't go away while we're holding css_set_rwsem */
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ if (notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
}
kfree(link);
}
- write_unlock(&css_set_lock);
- kfree_rcu(cg, rcu_head);
+ kfree_rcu(cset, rcu_head);
}
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
+static void put_css_set(struct css_set *cset, bool taskexit)
{
- atomic_inc(&cg->refcount);
-}
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
-static inline void put_css_set(struct css_set *cg)
-{
- __put_css_set(cg, 0);
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
}
-static inline void put_css_set_taskexit(struct css_set *cg)
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
{
- __put_css_set(cg, 1);
+ atomic_inc(&cset->refcount);
}
-/*
+/**
* compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
-static bool compare_css_sets(struct css_set *cg,
- struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
- if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
- /* Not all subsystems matched */
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
- }
/*
* Compare cgroup pointers in order to distinguish between
- * different cgroups in heirarchies with no subsystems. We
- * could get by with just this check alone (and skip the
- * memcmp above) but on most setups the memcmp check will
- * avoid the need for this more expensive check on almost all
- * candidates.
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
*/
-
- l1 = &cg->cg_links;
- l2 = &old_cg->cg_links;
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
while (1) {
- struct cg_cgroup_link *cgl1, *cgl2;
- struct cgroup *cg1, *cg2;
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
- if (l1 == &cg->cg_links) {
- BUG_ON(l2 != &old_cg->cg_links);
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
- BUG_ON(l2 == &old_cg->cg_links);
+ BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
- cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
- cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
- cg1 = cgl1->cgrp;
- cg2 = cgl2->cgrp;
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
- BUG_ON(cg1->root != cg2->root);
+ BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
@@ -471,239 +617,340 @@ static bool compare_css_sets(struct css_set *cg,
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
- if (cg1->root == new_cgrp->root) {
- if (cg1 != new_cgrp)
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
return false;
} else {
- if (cg1 != cg2)
+ if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
*/
-static struct css_set *find_existing_css_set(
- struct css_set *oldcg,
- struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
int i;
- struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
- struct css_set *cg;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- if (root->subsys_bits & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgrp->subsys[i];
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
- template[i] = oldcg->subsys[i];
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
- if (!compare_css_sets(cg, oldcg, cgrp, template))
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
- return cg;
+ return cset;
}
/* No existing cgroup group matched */
return NULL;
}
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+ struct cgrp_cset_link *link, *tmp_link;
- list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
- list_del(&link->cgrp_link_list);
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
kfree(link);
}
}
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
*/
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
int i;
- INIT_LIST_HEAD(tmp);
+
+ INIT_LIST_HEAD(tmp_links);
+
for (i = 0; i < count; i++) {
- link = kmalloc(sizeof(*link), GFP_KERNEL);
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
- free_cg_links(tmp);
+ free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
- list_add(&link->cgrp_link_list, tmp);
+ list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
-static void link_css_set(struct list_head *tmp_cg_links,
- struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
- BUG_ON(list_empty(tmp_cg_links));
- link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
- cgrp_link_list);
- link->cg = cg;
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
link->cgrp = cgrp;
- atomic_inc(&cgrp->count);
- list_move(&link->cgrp_link_list, &cgrp->css_sets);
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
+ list_move(&link->cset_link, &cgrp->cset_links);
+
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
*/
- list_add_tail(&link->cg_link_list, &cg->cg_links);
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
}
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
*/
-static struct css_set *find_css_set(
- struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp)
{
- struct css_set *res;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
- struct list_head tmp_cg_links;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct css_set *cset;
+ struct list_head tmp_links;
+ struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid;
- struct hlist_head *hhead;
- struct cg_cgroup_link *link;
+ lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
- res = find_existing_css_set(oldcg, cgrp, template);
- if (res)
- get_css_set(res);
- read_unlock(&css_set_lock);
+ down_read(&css_set_rwsem);
+ cset = find_existing_css_set(old_cset, cgrp, template);
+ if (cset)
+ get_css_set(cset);
+ up_read(&css_set_rwsem);
- if (res)
- return res;
+ if (cset)
+ return cset;
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
+ cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ if (!cset)
return NULL;
- /* Allocate all the cg_cgroup_link objects that we'll need */
- if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
- kfree(res);
+ /* Allocate all the cgrp_cset_link objects that we'll need */
+ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+ kfree(cset);
return NULL;
}
- atomic_set(&res->refcount, 1);
- INIT_LIST_HEAD(&res->cg_links);
- INIT_LIST_HEAD(&res->tasks);
- INIT_HLIST_NODE(&res->hlist);
+ atomic_set(&cset->refcount, 1);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
+ INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
- memcpy(res->subsys, template, sizeof(res->subsys));
+ memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
- list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
+
if (c->root == cgrp->root)
c = cgrp;
- link_css_set(&tmp_cg_links, res, c);
+ link_css_set(&tmp_links, cset, c);
}
- BUG_ON(!list_empty(&tmp_cg_links));
+ BUG_ON(!list_empty(&tmp_links));
css_set_count++;
- /* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ /* Add @cset to the hash table */
+ key = css_set_hash(cset->subsys);
+ hash_add(css_set_table, &cset->hlist, key);
- write_unlock(&css_set_lock);
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
- return res;
+ up_write(&css_set_rwsem);
+
+ return cset;
}
-/*
- * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
- */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroupfs_root *root)
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct css_set *css;
- struct cgroup *res = NULL;
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+
+ return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- read_lock(&css_set_lock);
/*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
*/
- css = task->cgroups;
- if (css == &init_css_set) {
- res = &root->top_cgroup;
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
} else {
- struct cg_cgroup_link *link;
- list_for_each_entry(link, &css->cg_links, cg_link_list) {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
+
if (c->root == root) {
res = c;
break;
}
}
}
- read_unlock(&css_set_lock);
+
BUG_ON(!res);
return res;
}
/*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
@@ -729,384 +976,302 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
- * need a special hack to ensure that top_cgroup cannot be deleted.
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
- * another. It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
+ * need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+static const struct file_operations proc_cgroupstats_operations;
+
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- mutex_lock(&cgroup_mutex);
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
-EXPORT_SYMBOL_GPL(cgroup_lock);
/**
- * cgroup_unlock - release lock on cgroup changes
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
*
- * Undo the lock taken in a previous cgroup_lock() call.
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
*/
-void cgroup_unlock(void)
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unlock);
+ umode_t mode = 0;
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
+ if (cft->mode)
+ return cft->mode;
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp);
-static const struct inode_operations cgroup_dir_inode_operations;
-static const struct file_operations proc_cgroupstats_operations;
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .name = "cgroup",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
+ if (cft->write_u64 || cft->write_s64 || cft->write)
+ mode |= S_IWUSR;
-static int alloc_css_id(struct cgroup_subsys *ss,
- struct cgroup *parent, struct cgroup *child);
+ return mode;
+}
-static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
+static void cgroup_get(struct cgroup *cgrp)
{
- struct inode *inode = new_inode(sb);
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
- if (inode) {
- inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
+static void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
*/
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
+static void cgroup_kn_unlock(struct kernfs_node *kn)
{
- struct cgroup_subsys *ss;
- int ret = 0;
+ struct cgroup *cgrp;
- for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy) {
- ret = ss->pre_destroy(ss, cgrp);
- if (ret)
- break;
- }
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- return ret;
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
-
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->destroy(ss, cgrp);
+ struct cgroup *cgrp;
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(kn);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ mutex_lock(&cgroup_mutex);
- kfree_rcu(cgrp, rcu_head);
- }
- iput(inode);
-}
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
-static int cgroup_delete(const struct dentry *d)
-{
- return 1;
+ cgroup_kn_unlock(kn);
+ return NULL;
}
-static void remove_dir(struct dentry *d)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
- struct dentry *parent = dget(d->d_parent);
+ char name[CGROUP_FILE_NAME_MAX];
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ lockdep_assert_held(&cgroup_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
-static void cgroup_clear_directory(struct dentry *dentry)
+/**
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
+ */
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- struct list_head *node;
+ struct cgroup_subsys *ss;
+ int i;
- BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dentry->d_lock);
- node = dentry->d_subdirs.next;
- while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(node);
- if (d->d_inode) {
- /* This should never be called on a cgroup
- * directory with child cgroups */
- BUG_ON(d->d_inode->i_mode & S_IFDIR);
- dget_dlock(d);
- spin_unlock(&d->d_lock);
- spin_unlock(&dentry->d_lock);
- d_delete(d);
- simple_unlink(dentry->d_inode, d);
- dput(d);
- spin_lock(&dentry->d_lock);
- } else
- spin_unlock(&d->d_lock);
- node = dentry->d_subdirs.next;
+ if (!(subsys_mask & (1 << i)))
+ continue;
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
}
- spin_unlock(&dentry->d_lock);
}
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
- struct dentry *parent;
-
- cgroup_clear_directory(dentry);
-
- parent = dentry->d_parent;
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
- remove_dir(dentry);
-}
+ struct cgroup_subsys *ss;
+ unsigned int tmp_ss_mask;
+ int ssid, i, ret;
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+ lockdep_assert_held(&cgroup_mutex);
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
+ for_each_subsys(ss, ssid) {
+ if (!(ss_mask & (1 << ssid)))
+ continue;
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ return -EBUSY;
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
+ return -EBUSY;
+ }
-/*
- * Call with cgroup_mutex held. Drops reference counts on modules, including
- * any duplicate ones that parse_cgroupfs_options took. If this function
- * returns an error, no reference counts are touched.
- */
-static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long final_bits)
-{
- unsigned long added_bits, removed_bits;
- struct cgroup *cgrp = &root->top_cgroup;
- int i;
+ /* skip creating root files on dfl_root for inhibited subsystems */
+ tmp_ss_mask = ss_mask;
+ if (dst_root == &cgrp_dfl_root)
+ tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
+ ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+ if (ret) {
+ if (dst_root != &cgrp_dfl_root)
+ return ret;
- removed_bits = root->actual_subsys_bits & ~final_bits;
- added_bits = final_bits & ~root->actual_subsys_bits;
- /* Check that any added subsystems are currently free */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
- struct cgroup_subsys *ss = subsys[i];
- if (!(bit & added_bits))
- continue;
/*
- * Nobody should tell us to do a subsys that doesn't exist:
- * parse_cgroupfs_options should catch that case and refcounts
- * ensure that subsystems won't disappear once selected.
+ * Rebinding back to the default root is not allowed to
+ * fail. Using both default and non-default roots should
+ * be rare. Moving subsystems back and forth even more so.
+ * Just warn about it and continue.
*/
- BUG_ON(ss == NULL);
- if (ss->root != &rootnode) {
- /* Subsystem isn't free */
- return -EBUSY;
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
}
- /* Currently we don't handle adding/removing subsystems when
- * any child cgroups exist. This is theoretically supportable
- * but involves complex error handling, so it's being left until
- * later */
- if (root->number_of_cgroups > 1)
- return -EBUSY;
+ /*
+ * Nothing can fail from this point on. Remove files for the
+ * removed subsystems and rebind each subsystem.
+ */
+ for_each_subsys(ss, ssid)
+ if (ss_mask & (1 << ssid))
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- /* Process each subsystem */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- unsigned long bit = 1UL << i;
- if (bit & added_bits) {
- /* We're binding this subsystem to this hierarchy */
- BUG_ON(ss == NULL);
- BUG_ON(cgrp->subsys[i]);
- BUG_ON(!dummytop->subsys[i]);
- BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
- mutex_lock(&ss->hierarchy_mutex);
- cgrp->subsys[i] = dummytop->subsys[i];
- cgrp->subsys[i]->cgroup = cgrp;
- list_move(&ss->sibling, &root->subsys_list);
- ss->root = root;
- if (ss->bind)
- ss->bind(ss, cgrp);
- mutex_unlock(&ss->hierarchy_mutex);
- /* refcount was already taken, and we're keeping it */
- } else if (bit & removed_bits) {
- /* We're removing this subsystem */
- BUG_ON(ss == NULL);
- BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
- mutex_lock(&ss->hierarchy_mutex);
- if (ss->bind)
- ss->bind(ss, dummytop);
- dummytop->subsys[i]->cgroup = dummytop;
- cgrp->subsys[i] = NULL;
- subsys[i]->root = &rootnode;
- list_move(&ss->sibling, &rootnode.subsys_list);
- mutex_unlock(&ss->hierarchy_mutex);
- /* subsystem is now free - drop reference on module */
- module_put(ss->module);
- } else if (bit & final_bits) {
- /* Subsystem state should already exist */
- BUG_ON(ss == NULL);
- BUG_ON(!cgrp->subsys[i]);
- /*
- * a refcount was taken, but we already had one, so
- * drop the extra reference.
- */
- module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
- BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
- } else {
- /* Subsystem state shouldn't exist */
- BUG_ON(cgrp->subsys[i]);
- }
+ for_each_subsys(ss, ssid) {
+ struct cgroup_root *src_root;
+ struct cgroup_subsys_state *css;
+ struct css_set *cset;
+
+ if (!(ss_mask & (1 << ssid)))
+ continue;
+
+ src_root = ss->root;
+ css = cgroup_css(&src_root->cgrp, ss);
+
+ WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+
+ RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+ rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ ss->root = dst_root;
+ css->cgroup = &dst_root->cgrp;
+
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root)
+ dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+
+ if (ss->bind)
+ ss->bind(css);
}
- root->subsys_bits = root->actual_subsys_bits = final_bits;
- synchronize_rcu();
+ kernfs_activate(dst_root->cgrp.kn);
return 0;
}
-static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
-
- mutex_lock(&cgroup_root_mutex);
- for_each_subsys(root, ss)
- seq_printf(seq, ",%s", ss->name);
- if (test_bit(ROOT_NOPREFIX, &root->flags))
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->name);
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+ seq_puts(seq, ",sane_behavior");
+ if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- if (clone_children(&root->top_cgroup))
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_root_mutex);
return 0;
}
struct cgroup_sb_opts {
- unsigned long subsys_bits;
- unsigned long flags;
+ unsigned int subsys_mask;
+ unsigned int flags;
char *release_agent;
- bool clone_children;
+ bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
-
- struct cgroupfs_root *new_root;
-
};
-/*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
- */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = (unsigned long)-1;
+ unsigned int mask = -1U;
+ struct cgroup_subsys *ss;
int i;
- bool module_pin_failed = false;
-
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_subsys_id);
+ mask = ~(1U << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1126,12 +1291,20 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
if (!strcmp(token, "noprefix")) {
- set_bit(ROOT_NOPREFIX, &opts->flags);
+ opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
- opts->clone_children = true;
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
continue;
}
if (!strncmp(token, "release_agent=", 14)) {
@@ -1170,10 +1343,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
}
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
+ for_each_subsys(ss, i) {
if (strcmp(token, ss->name))
continue;
if (ss->disabled)
@@ -1182,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
- set_bit(i, &opts->subsys_bits);
+ opts->subsys_mask |= (1 << i);
one_ss = true;
break;
@@ -1191,501 +1361,479 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- /*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options
- * were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name)) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->disabled)
- continue;
- set_bit(i, &opts->subsys_bits);
+ /* Consistency checks */
+
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ return -EINVAL;
}
- }
+ } else {
+ /*
+ * If the 'all' option was specified select all the
+ * subsystems, otherwise if 'none', 'name=' and a subsystem
+ * name options were not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
- /* Consistency checks */
+ /*
+ * We either have to specify by name or by subsystems. (So
+ * all empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+ }
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
- (opts->subsys_bits & mask))
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
/* Can't specify "none" and some subsystems */
- if (opts->subsys_bits && opts->none)
- return -EINVAL;
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_bits && !opts->name)
+ if (opts->subsys_mask && opts->none)
return -EINVAL;
- /*
- * Grab references on all the modules we'll need, so the subsystems
- * don't dance around before rebind_subsystems attaches them. This may
- * take duplicate reference counts on a subsystem that's already used,
- * but rebind_subsystems handles this case.
- */
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
-
- if (!(bit & opts->subsys_bits))
- continue;
- if (!try_module_get(subsys[i]->module)) {
- module_pin_failed = true;
- break;
- }
- }
- if (module_pin_failed) {
- /*
- * oops, one of the modules was going away. this means that we
- * raced with a module_delete call, and to the user this is
- * essentially a "subsystem doesn't exist" case.
- */
- for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
- /* drop refcounts only on the ones we took */
- unsigned long bit = 1UL << i;
-
- if (!(bit & opts->subsys_bits))
- continue;
- module_put(subsys[i]->module);
- }
- return -ENOENT;
- }
-
return 0;
}
-static void drop_parsed_module_refcounts(unsigned long subsys_bits)
-{
- int i;
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
-
- if (!(bit & subsys_bits))
- continue;
- module_put(subsys[i]->module);
- }
-}
-
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
+ unsigned int added_mask, removed_mask;
+
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: remount is not allowed\n");
+ return -EINVAL;
+ }
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
/* Don't allow flags or name to change at remount */
- if (opts.flags != root->flags ||
+ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
(opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+ root->flags & CGRP_ROOT_OPTION_MASK, root->name);
ret = -EINVAL;
- drop_parsed_module_refcounts(opts.subsys_bits);
goto out_unlock;
}
- ret = rebind_subsystems(root, opts.subsys_bits);
- if (ret) {
- drop_parsed_module_refcounts(opts.subsys_bits);
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
goto out_unlock;
}
- /* (re)populate subsystem files */
- cgroup_populate_dir(cgrp);
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
- if (opts.release_agent)
+ rebind_subsystems(&cgrp_dfl_root, removed_mask);
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
}
-static const struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
-static void init_cgroup_housekeeping(struct cgroup *cgrp)
+static void cgroup_enable_task_cg_lists(void)
{
- INIT_LIST_HEAD(&cgrp->sibling);
- INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->css_sets);
- INIT_LIST_HEAD(&cgrp->release_list);
- INIT_LIST_HEAD(&cgrp->pidlists);
- mutex_init(&cgrp->pidlist_mutex);
- INIT_LIST_HEAD(&cgrp->event_list);
- spin_lock_init(&cgrp->event_list_lock);
-}
+ struct task_struct *p, *g;
-static void init_cgroup_root(struct cgroupfs_root *root)
-{
- struct cgroup *cgrp = &root->top_cgroup;
- INIT_LIST_HEAD(&root->subsys_list);
- INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
- cgrp->root = root;
- cgrp->top_cgroup = cgrp;
- init_cgroup_housekeeping(cgrp);
-}
+ down_write(&css_set_rwsem);
-static bool init_root_id(struct cgroupfs_root *root)
-{
- int ret = 0;
+ if (use_task_css_set_links)
+ goto out_unlock;
- do {
- if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
- return false;
- spin_lock(&hierarchy_id_lock);
- /* Try to allocate the next unused ID */
- ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
- &root->hierarchy_id);
- if (ret == -ENOSPC)
- /* Try again starting from 0 */
- ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
- if (!ret) {
- next_hierarchy_id = root->hierarchy_id + 1;
- } else if (ret != -EAGAIN) {
- /* Can only get here if the 31-bit IDR is full ... */
- BUG_ON(ret);
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING)) {
+ struct css_set *cset = task_css_set(p);
+
+ list_add(&p->cg_list, &cset->tasks);
+ get_css_set(cset);
}
- spin_unlock(&hierarchy_id_lock);
- } while (ret);
- return true;
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
}
-static int cgroup_test_super(struct super_block *sb, void *data)
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
- struct cgroup_sb_opts *opts = data;
- struct cgroupfs_root *root = sb->s_fs_info;
+ struct cgroup_subsys *ss;
+ int ssid;
- /* If we asked for a name then it must match */
- if (opts->name && strcmp(opts->name, root->name))
- return 0;
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
+ INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match
- */
- if ((opts->subsys_bits || opts->none)
- && (opts->subsys_bits != root->subsys_bits))
- return 0;
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
- return 1;
+ init_waitqueue_head(&cgrp->offline_waitq);
}
-static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+static void init_cgroup_root(struct cgroup_root *root,
+ struct cgroup_sb_opts *opts)
{
- struct cgroupfs_root *root;
-
- if (!opts->subsys_bits && !opts->none)
- return NULL;
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root)
- return ERR_PTR(-ENOMEM);
+ struct cgroup *cgrp = &root->cgrp;
- if (!init_root_id(root)) {
- kfree(root);
- return ERR_PTR(-ENOMEM);
- }
- init_cgroup_root(root);
+ INIT_LIST_HEAD(&root->root_list);
+ atomic_set(&root->nr_cgrps, 1);
+ cgrp->root = root;
+ init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
- root->subsys_bits = opts->subsys_bits;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
- if (opts->clone_children)
- set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
- return root;
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static void cgroup_drop_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
- if (!root)
- return;
-
- BUG_ON(!root->hierarchy_id);
- spin_lock(&hierarchy_id_lock);
- ida_remove(&hierarchy_ida, root->hierarchy_id);
- spin_unlock(&hierarchy_id_lock);
- kfree(root);
-}
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct css_set *cset;
+ int i, ret;
-static int cgroup_set_super(struct super_block *sb, void *data)
-{
- int ret;
- struct cgroup_sb_opts *opts = data;
+ lockdep_assert_held(&cgroup_mutex);
- /* If we don't have a new root, we can't set up a new sb */
- if (!opts->new_root)
- return -EINVAL;
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
- BUG_ON(!opts->subsys_bits && !opts->none);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out;
- ret = set_anon_super(sb, NULL);
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
if (ret)
- return ret;
+ goto cancel_ref;
- sb->s_fs_info = opts->new_root;
- opts->new_root->sb = sb;
+ ret = cgroup_init_root_id(root);
+ if (ret)
+ goto cancel_ref;
+
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CGROUP_SUPER_MAGIC;
- sb->s_op = &cgroup_ops;
+ ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (ret)
+ goto destroy_root;
- return 0;
-}
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto destroy_root;
-static int cgroup_get_rootdir(struct super_block *sb)
-{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- .d_delete = cgroup_delete,
- };
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
- struct inode *inode =
- cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
- struct dentry *dentry;
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
- if (!inode)
- return -ENOMEM;
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &cgroup_dir_inode_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- dentry = d_alloc_root(inode);
- if (!dentry) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = dentry;
- /* for everything else we want ->d_op set */
- sb->s_d_op = &cgroup_dops;
- return 0;
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
+
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
}
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
+ struct cgroup_root *root;
struct cgroup_sb_opts opts;
- struct cgroupfs_root *root;
- int ret = 0;
- struct super_block *sb;
- struct cgroupfs_root *new_root;
- struct inode *inode;
+ struct dentry *dentry;
+ int ret;
+ int i;
+ bool new_sb;
+
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
- /* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
+
+ /* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- mutex_unlock(&cgroup_mutex);
if (ret)
- goto out_err;
+ goto out_unlock;
- /*
- * Allocate a new cgroup root. We may not need it if we're
- * reusing an existing hierarchy.
- */
- new_root = cgroup_root_from_opts(&opts);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
- goto drop_modules;
- }
- opts.new_root = new_root;
-
- /* Locate an existing or new sb for this hierarchy */
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- cgroup_drop_root(opts.new_root);
- goto drop_modules;
+ /* look for a matching existing root */
+ if (!opts.subsys_mask && !opts.none && !opts.name) {
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ ret = 0;
+ goto out_unlock;
}
- root = sb->s_fs_info;
- BUG_ON(!root);
- if (root == opts.new_root) {
- /* We used the new root structure, so this is a new hierarchy */
- struct list_head tmp_cg_links;
- struct cgroup *root_cgrp = &root->top_cgroup;
- struct cgroupfs_root *existing_root;
- const struct cred *cred;
- int i;
-
- BUG_ON(sb->s_root != NULL);
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
- ret = cgroup_get_rootdir(sb);
- if (ret)
- goto drop_new_super;
- inode = sb->s_root->d_inode;
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
+ for_each_root(root) {
+ bool name_match = false;
- /* Check for name clashes with existing mounts */
- ret = -EBUSY;
- if (strlen(root->name))
- for_each_active_root(existing_root)
- if (!strcmp(existing_root->name, root->name))
- goto unlock_drop;
+ if (root == &cgrp_dfl_root)
+ continue;
/*
- * We're accessing css_set_count without locking
- * css_set_lock here, but that's OK - it can only be
- * increased by someone holding cgroup_lock, and
- * that's us. The worst that can happen is that we
- * have some link structures left over
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
*/
- ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret)
- goto unlock_drop;
-
- ret = rebind_subsystems(root, root->subsys_bits);
- if (ret == -EBUSY) {
- free_cg_links(&tmp_cg_links);
- goto unlock_drop;
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
}
+
/*
- * There must be no failure case after here, since rebinding
- * takes care of subsystems' refcounts, which are explicitly
- * dropped in the failure exit path.
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
*/
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
- /* EBUSY should be the only error here */
- BUG_ON(ret);
-
- list_add(&root->root_list, &roots);
- root_count++;
-
- sb->s_root->d_fsdata = root_cgrp;
- root->top_cgroup.dentry = sb->s_root;
-
- /* Link the top cgroup in this hierarchy into all
- * the css_set objects */
- write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
+ if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
+ if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ } else {
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+ }
+ }
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
}
- write_unlock(&css_set_lock);
- free_cg_links(&tmp_cg_links);
+ ret = 0;
+ goto out_unlock;
+ }
- BUG_ON(!list_empty(&root_cgrp->sibling));
- BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- cred = override_creds(&init_cred);
- cgroup_populate_dir(root_cgrp);
- revert_creds(cred);
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- /*
- * We re-used an existing hierarchy - the new root (if
- * any) is not needed
- */
- cgroup_drop_root(opts.new_root);
- /* no subsys rebinding, so refcounts don't change */
- drop_parsed_module_refcounts(opts.subsys_bits);
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- kfree(opts.release_agent);
- kfree(opts.name);
- return dget(sb->s_root);
+ init_cgroup_root(root, &opts);
+
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
- unlock_drop:
- mutex_unlock(&cgroup_root_mutex);
+out_unlock:
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- drop_new_super:
- deactivate_locked_super(sb);
- drop_modules:
- drop_parsed_module_refcounts(opts.subsys_bits);
- out_err:
+out_free:
kfree(opts.release_agent);
kfree(opts.name);
- return ERR_PTR(ret);
-}
-
-static void cgroup_kill_sb(struct super_block *sb) {
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
- int ret;
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
- BUG_ON(!root);
-
- BUG_ON(root->number_of_cgroups != 1);
- BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
-
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
+ if (ret)
+ return ERR_PTR(ret);
- /* Rebind all subsystems back to the default hierarchy */
- ret = rebind_subsystems(root, 0);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
+ cgroup_put(&root->cgrp);
/*
- * Release all the links from css_sets to this hierarchy's
- * root cgroup
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
*/
- write_lock(&css_set_lock);
-
- list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
- cgrp_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- kfree(link);
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
}
- write_unlock(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- root_count--;
- }
+ return dentry;
+}
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+
+ /*
+ * If @root doesn't have any mounts or children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
+ */
+ if (css_has_online_children(&root->cgrp.self) ||
+ root == &cgrp_dfl_root)
+ cgroup_put(&root->cgrp);
+ else
+ percpu_ref_kill(&root->cgrp.self.refcnt);
- kill_litter_super(sb);
- cgroup_drop_root(root);
+ kernfs_kill_sb(sb);
}
static struct file_system_type cgroup_fs_type = {
@@ -1696,81 +1844,66 @@ static struct file_system_type cgroup_fs_type = {
static struct kobject *cgroup_kobj;
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
+ * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
+ * @task: target task
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Determine @task's cgroup on the first (the one with the lowest non-zero
+ * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
+ * function grabs cgroup_mutex and shouldn't be used inside locks used by
+ * cgroup controller callbacks.
+ *
+ * Return value is the same as kernfs_path().
*/
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
- char *start;
- struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
-
- if (!dentry || cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
- return 0;
- }
-
- start = buf + buflen;
+ struct cgroup_root *root;
+ struct cgroup *cgrp;
+ int hierarchy_id = 1;
+ char *path = NULL;
- *--start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
+ root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
- dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
- if (!cgrp->parent)
- continue;
- if (--start < buf)
- return -ENAMETOOLONG;
- *start = '/';
+ if (root) {
+ cgrp = task_cgroup_from_root(task, root);
+ path = cgroup_path(cgrp, buf, buflen);
+ } else {
+ /* if no hierarchy exists, everyone is in "/" */
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
}
- memmove(buf, start, buf + buflen - start);
- return 0;
-}
-EXPORT_SYMBOL_GPL(cgroup_path);
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
- struct task_struct *task;
- struct cgroup *cgrp;
-};
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return path;
+}
+EXPORT_SYMBOL_GPL(task_cgroup_path);
+/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
- struct task_and_cgroup single;
- struct flex_array *tc_array;
- int tc_array_len;
- int idx;
- struct cgroup *cur_cgrp;
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
/**
@@ -1781,15 +1914,11 @@ struct cgroup_taskset {
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- if (tset->tc_array) {
- tset->idx = 0;
- return cgroup_taskset_next(tset);
- } else {
- tset->cur_cgrp = tset->single.cgrp;
- return tset->single.task;
- }
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_first);
/**
* cgroup_taskset_next - iterate to the next task in taskset
@@ -1800,54 +1929,45 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- struct task_and_cgroup *tc;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
- if (!tset->tc_array || tset->idx >= tset->tc_array_len)
- return NULL;
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
- tc = flex_array_get(tset->tc_array, tset->idx++);
- tset->cur_cgrp = tc->cgrp;
- return tc->task;
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
-/**
- * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
- * @tset: taskset of interest
- *
- * Return the cgroup for the current (last returned) task of @tset. This
- * function must be preceded by either cgroup_taskset_first() or
- * cgroup_taskset_next().
- */
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
-{
- return tset->cur_cgrp;
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
-/**
- * cgroup_taskset_size - return the number of tasks in taskset
- * @tset: taskset of interest
- */
-int cgroup_taskset_size(struct cgroup_taskset *tset)
-{
- return tset->tc_array ? tset->tc_array_len : 1;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_size);
-
-/*
+/**
* cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
*
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
*/
-static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
- struct task_struct *tsk, bool guarantee)
+static void cgroup_task_migrate(struct cgroup *old_cgrp,
+ struct task_struct *tsk,
+ struct css_set *new_cset)
{
- struct css_set *oldcg;
- struct css_set *newcg;
+ struct css_set *old_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
/*
* We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1855,1182 +1975,1584 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
* css_set to init_css_set and dropping the old one.
*/
WARN_ON_ONCE(tsk->flags & PF_EXITING);
- oldcg = tsk->cgroups;
-
- /* locate or allocate a new css_set for this task. */
- if (guarantee) {
- /* we know the css_set we want already exists. */
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
- read_lock(&css_set_lock);
- newcg = find_existing_css_set(oldcg, cgrp, template);
- BUG_ON(!newcg);
- get_css_set(newcg);
- read_unlock(&css_set_lock);
- } else {
- might_sleep();
- /* find_css_set will give us newcg already referenced. */
- newcg = find_css_set(oldcg, cgrp);
- if (!newcg)
- return -ENOMEM;
- }
-
- task_lock(tsk);
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ old_cset = task_css_set(tsk);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &newcg->tasks);
- write_unlock(&css_set_lock);
+ get_css_set(new_cset);
+ rcu_assign_pointer(tsk->cgroups, new_cset);
/*
- * We just gained a reference on oldcg by taking it from the task. As
- * trading it for newcg is protected by cgroup_mutex, we're safe to drop
- * it here; it will be freed under RCU.
+ * Use move_tail so that cgroup_taskset_first() still returns the
+ * leader after migration. This works because cgroup_migrate()
+ * ensures that the dst_cset of the leader is the first on the
+ * tset's dst_csets list.
*/
- put_css_set(oldcg);
+ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- return 0;
+ /*
+ * We just gained a reference on old_cset by taking it from the
+ * task. As trading it for new_cset is protected by cgroup_mutex,
+ * we're safe to drop it here; it will be freed under RCU.
+ */
+ set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
+ put_css_set_locked(old_cset, false);
}
/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
+ * cgroup_migrate_finish - cleanup after attach
+ * @preloaded_csets: list of preloaded css_sets
*
- * Call with cgroup_mutex and threadgroup locked. May take task_lock of
- * @tsk during call.
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
*/
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
- int retval;
- struct cgroup_subsys *ss, *failed_ss = NULL;
- struct cgroup *oldcgrp;
- struct cgroupfs_root *root = cgrp->root;
- struct cgroup_taskset tset = { };
-
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
- return -ESRCH;
-
- /* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- return 0;
-
- tset.single.task = tsk;
- tset.single.cgrp = oldcgrp;
-
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, &tset);
- if (retval) {
- /*
- * Remember on which subsystem the can_attach()
- * failed, so that we only call cancel_attach()
- * against the subsystems whose can_attach()
- * succeeded. (See below)
- */
- failed_ss = ss;
- goto out;
- }
- }
- }
+ struct css_set *cset, *tmp_cset;
- retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
- if (retval)
- goto out;
+ lockdep_assert_held(&cgroup_mutex);
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(ss, cgrp, &tset);
+ down_write(&css_set_rwsem);
+ list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_preload_node);
+ put_css_set_locked(cset, false);
}
-
- synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
-out:
- if (retval) {
- for_each_subsys(root, ss) {
- if (ss == failed_ss)
- /*
- * This subsystem was the one that failed the
- * can_attach() check earlier, so we don't need
- * to call cancel_attach() against it or any
- * remaining subsystems.
- */
- break;
- if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, &tset);
- }
- }
- return retval;
+ up_write(&css_set_rwsem);
}
/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
*/
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- struct cgroupfs_root *root;
- int retval = 0;
+ struct cgroup *src_cgrp;
- cgroup_lock();
- for_each_active_root(root) {
- struct cgroup *from_cg = task_cgroup_from_root(from, root);
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
- retval = cgroup_attach_task(from_cg, tsk);
- if (retval)
- break;
- }
- cgroup_unlock();
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
- return retval;
+ if (!list_empty(&src_cset->mg_preload_node))
+ return;
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
+
+ src_cset->mg_src_cgrp = src_cgrp;
+ get_css_set(src_cset);
+ list_add(&src_cset->mg_preload_node, preloaded_csets);
}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-/*
- * cgroup_attach_proc works in two stages, the first of which prefetches all
- * new css_sets needed (to make sure we have enough memory before committing
- * to the move) and stores them in a list of entries of the following type.
- * TODO: possible optimization: use css_set->rcu_head for chaining instead
+/**
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @dst_cgrp: the destination cgroup (may be %NULL)
+ * @preloaded_csets: list of preloaded source css_sets
+ *
+ * Tasks are about to be moved to @dst_cgrp and all the source css_sets
+ * have been preloaded to @preloaded_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @preloaded_csets.
*/
-struct cg_list_entry {
- struct css_set *cg;
- struct list_head links;
-};
-
-static bool css_set_check_fetched(struct cgroup *cgrp,
- struct task_struct *tsk, struct css_set *cg,
- struct list_head *newcg_list)
+static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- struct css_set *newcg;
- struct cg_list_entry *cg_entry;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ LIST_HEAD(csets);
+ struct css_set *src_cset, *tmp_cset;
- read_lock(&css_set_lock);
- newcg = find_existing_css_set(cg, cgrp, template);
- read_unlock(&css_set_lock);
+ lockdep_assert_held(&cgroup_mutex);
- /* doesn't exist at all? */
- if (!newcg)
- return false;
- /* see if it's already in the list */
- list_for_each_entry(cg_entry, newcg_list, links)
- if (cg_entry->cg == newcg)
- return true;
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
- /* not found */
- return false;
-}
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ struct css_set *dst_cset;
-/*
- * Find the new css_set and store it in the list in preparation for moving the
- * given task to the given cgroup. Returns 0 or -ENOMEM.
- */
-static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
- struct list_head *newcg_list)
-{
- struct css_set *newcg;
- struct cg_list_entry *cg_entry;
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
+ if (!dst_cset)
+ goto err;
- /* ensure a new css_set will exist for this thread */
- newcg = find_css_set(cg, cgrp);
- if (!newcg)
- return -ENOMEM;
- /* add it to the list */
- cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
- if (!cg_entry) {
- put_css_set(newcg);
- return -ENOMEM;
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
+
+ src_cset->mg_dst_cset = dst_cset;
+
+ if (list_empty(&dst_cset->mg_preload_node))
+ list_add(&dst_cset->mg_preload_node, &csets);
+ else
+ put_css_set(dst_cset, false);
}
- cg_entry->cg = newcg;
- list_add(&cg_entry->links, newcg_list);
+
+ list_splice_tail(&csets, preloaded_csets);
return 0;
+err:
+ cgroup_migrate_finish(&csets);
+ return -ENOMEM;
}
/**
- * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
- * @cgrp: the cgroup to attach to
- * @leader: the threadgroup leader task_struct of the group to be attached
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @cgrp: the destination cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
+ *
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
*
- * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of each thread in leader's threadgroup individually in turn.
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
*/
-static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
-{
- int retval, i, group_size;
- struct cgroup_subsys *ss, *failed_ss = NULL;
- /* guaranteed to be initialized later, but the compiler needs this */
- struct css_set *oldcg;
- struct cgroupfs_root *root = cgrp->root;
- /* threadgroup list cursor and array */
- struct task_struct *tsk;
- struct task_and_cgroup *tc;
- struct flex_array *group;
- struct cgroup_taskset tset = { };
- /*
- * we need to make sure we have css_sets for all the tasks we're
- * going to move -before- we actually start moving them, so that in
- * case we get an ENOMEM we can bail out before making any changes.
- */
- struct list_head newcg_list;
- struct cg_list_entry *cg_entry, *temp_nobe;
+static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
+ bool threadgroup)
+{
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
/*
- * step 0: in order to do expensive, possibly blocking operations for
- * every thread, we cannot iterate the thread group list, since it needs
- * rcu or tasklist locked. instead, build an array of all threads in the
- * group - group_rwsem prevents new threads from appearing, and if
- * threads exit, this will just be an over-estimate.
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
*/
- group_size = get_nr_threads(leader);
- /* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
- if (!group)
- return -ENOMEM;
- /* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
- if (retval)
- goto out_free_group_list;
-
- /* prevent changes to the threadgroup list while we take a snapshot. */
- read_lock(&tasklist_lock);
- if (!thread_group_leader(leader)) {
- /*
- * a race with de_thread from another thread's exec() may strip
- * us of our leadership, making while_each_thread unsafe to use
- * on this task. if this happens, there is no choice but to
- * throw this task away and try again (from cgroup_procs_write);
- * this is "double-double-toil-and-trouble-check locking".
- */
- read_unlock(&tasklist_lock);
- retval = -EAGAIN;
- goto out_free_group_list;
- }
-
- tsk = leader;
- i = 0;
+ down_write(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
do {
- struct task_and_cgroup ent;
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ goto next;
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
- continue;
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ goto next;
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ goto next;
- /* as per above, nr_threads may decrease, but not increase. */
- BUG_ON(i >= group_size);
/*
- * saying GFP_ATOMIC has no effect here because we did prealloc
- * earlier, but it's good form to communicate our expectations.
+ * cgroup_taskset_first() must always return the leader.
+ * Take care to avoid disturbing the ordering.
*/
- ent.task = tsk;
- ent.cgrp = task_cgroup_from_root(tsk, root);
- /* nothing to do if this task is already in the cgroup */
- if (ent.cgrp == cgrp)
- continue;
- retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(retval != 0);
- i++;
- } while_each_thread(leader, tsk);
- /* remember the number of threads in the array for later. */
- group_size = i;
- tset.tc_array = group;
- tset.tc_array_len = group_size;
- read_unlock(&tasklist_lock);
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset.dst_csets);
+ next:
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_write(&css_set_rwsem);
/* methods shouldn't be called if no task is actually migrating */
- retval = 0;
- if (!group_size)
- goto out_free_group_list;
+ if (list_empty(&tset.src_csets))
+ return 0;
- /*
- * step 1: check that we can legitimately attach to the cgroup.
- */
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, &tset);
- if (retval) {
- failed_ss = ss;
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
+ failed_css = css;
goto out_cancel_attach;
}
}
}
/*
- * step 2: make sure css_sets exist for all threads to be migrated.
- * we use find_css_set, which allocates a new one if necessary.
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- INIT_LIST_HEAD(&newcg_list);
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- oldcg = tc->task->cgroups;
-
- /* if we don't already have it in the list get a new one */
- if (!css_set_check_fetched(cgrp, tc->task, oldcg,
- &newcg_list)) {
- retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
- if (retval)
- goto out_list_teardown;
- }
+ down_write(&css_set_rwsem);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
+ up_write(&css_set_rwsem);
/*
- * step 3: now that we're guaranteed success wrt the css_sets,
- * proceed to move all tasks to the new cgroup. There are no
- * failure cases after here, so this is the commit point.
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
*/
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
- BUG_ON(retval);
- }
- /* nothing is sensitive to fork() after this point. */
+ tset.csets = &tset.dst_csets;
- /*
- * step 4: do subsystem attach callbacks.
- */
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(ss, cgrp, &tset);
- }
+ for_each_e_css(css, i, cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, &tset);
+
+ ret = 0;
+ goto out_release_tset;
- /*
- * step 5: success! and cleanup
- */
- synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
- retval = 0;
-out_list_teardown:
- /* clean up the list of prefetched css_sets. */
- list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
- list_del(&cg_entry->links);
- put_css_set(cg_entry->cg);
- kfree(cg_entry);
- }
out_cancel_attach:
- /* same deal as in cgroup_attach_task */
- if (retval) {
- for_each_subsys(root, ss) {
- if (ss == failed_ss)
- break;
- if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, &tset);
- }
+ for_each_e_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
-out_free_group_list:
- flex_array_free(group);
- return retval;
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ up_write(&css_set_rwsem);
+ return ret;
+}
+
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+{
+ LIST_HEAD(preloaded_csets);
+ struct task_struct *task;
+ int ret;
+
+ /* look up all src csets */
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+ &preloaded_csets);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ if (!ret)
+ ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup; may take task_lock of task.
+ * cgroup_mutex and threadgroup.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
+ struct cgroup *cgrp;
+ pid_t pid;
int ret;
- if (!cgroup_lock_live_group(cgrp))
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
+retry_find_task:
+ rcu_read_lock();
if (pid) {
- rcu_read_lock();
tsk = find_task_by_vpid(pid);
if (!tsk) {
rcu_read_unlock();
- cgroup_unlock();
- return -ESRCH;
- }
- if (threadgroup) {
- /*
- * RCU protects this access, since tsk was found in the
- * tid map. a race with de_thread may cause group_leader
- * to stop being the leader, but cgroup_attach_proc will
- * detect it later.
- */
- tsk = tsk->group_leader;
+ ret = -ESRCH;
+ goto out_unlock_cgroup;
}
/*
* even if we're attaching all tasks in the thread group, we
* only need to check permissions on one of them.
*/
tcred = __task_cred(tsk);
- if (cred->euid &&
- cred->euid != tcred->uid &&
- cred->euid != tcred->suid) {
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
rcu_read_unlock();
- cgroup_unlock();
- return -EACCES;
+ ret = -EACCES;
+ goto out_unlock_cgroup;
}
- get_task_struct(tsk);
+ } else
+ tsk = current;
+
+ if (threadgroup)
+ tsk = tsk->group_leader;
+
+ /*
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
+ * trapped in a cpuset, or RT worker may be born in a cgroup
+ * with no rt_runtime allocated. Just say no.
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
rcu_read_unlock();
- } else {
- if (threadgroup)
- tsk = current->group_leader;
- else
- tsk = current;
- get_task_struct(tsk);
+ goto out_unlock_cgroup;
}
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
- if (threadgroup)
- ret = cgroup_attach_proc(cgrp, tsk);
- else
- ret = cgroup_attach_task(cgrp, tsk);
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
threadgroup_unlock(tsk);
put_task_struct(tsk);
- cgroup_unlock();
- return ret;
+out_unlock_cgroup:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
}
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
- return attach_task_by_pid(cgrp, pid, false);
+ struct cgroup_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int ret;
- do {
- /*
- * attach_proc fails with -EAGAIN if threadgroup leadership
- * changes in the middle of the operation, in which case we need
- * to find the task_struct for the new leader and start over.
- */
- ret = attach_task_by_pid(cgrp, tgid, true);
- } while (ret == -EAGAIN);
- return ret;
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
}
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
- */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- mutex_lock(&cgroup_mutex);
- if (cgroup_is_removed(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
- }
- return true;
+ return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup *cgrp;
+
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- if (strlen(buffer) >= PATH_MAX)
- return -EINVAL;
- if (!cgroup_lock_live_group(cgrp))
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- mutex_lock(&cgroup_root_mutex);
- strcpy(cgrp->root->release_agent_path, buffer);
- mutex_unlock(&cgroup_root_mutex);
- cgroup_unlock();
- return 0;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
}
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
seq_putc(seq, '\n');
- cgroup_unlock();
return 0;
}
-/* A buffer size big enough for numbers or short strings */
-#define CGROUP_LOCAL_BUFFER_SIZE 64
-
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- char buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- char *end;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (!nbytes)
- return -EINVAL;
- if (nbytes >= sizeof(buffer))
- return -E2BIG;
- if (copy_from_user(buffer, userbuf, nbytes))
- return -EFAULT;
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ return 0;
+}
- buffer[nbytes] = 0; /* nul-terminate */
- if (cft->write_u64) {
- u64 val = simple_strtoull(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_u64(cgrp, cft, val);
- } else {
- s64 val = simple_strtoll(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_s64(cgrp, cft, val);
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
}
- if (!retval)
- retval = nbytes;
- return retval;
+ if (printed)
+ seq_putc(seq, '\n');
}
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
{
- char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- size_t max_bytes = cft->max_write_len;
- char *buffer = local_buffer;
-
- if (!max_bytes)
- max_bytes = sizeof(local_buffer) - 1;
- if (nbytes >= max_bytes)
- return -E2BIG;
- /* Allocate a dynamic buffer if we need one */
- if (nbytes >= sizeof(local_buffer)) {
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
- }
- if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out;
- }
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- buffer[nbytes] = 0; /* nul-terminate */
- retval = cft->write_string(cgrp, cft, strstrip(buffer));
- if (!retval)
- retval = nbytes;
-out:
- if (buffer != local_buffer)
- kfree(buffer);
- return retval;
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
+ ~cgrp_dfl_root_inhibit_ss_mask);
+ return 0;
}
-static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (cgroup_is_removed(cgrp))
- return -ENODEV;
- if (cft->write)
- return cft->write(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_u64 || cft->write_s64)
- return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_string)
- return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->trigger) {
- int ret = cft->trigger(cgrp, (unsigned int)cft->private);
- return ret ? ret : nbytes;
- }
- return -EINVAL;
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ return 0;
}
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- u64 val = cft->read_u64(cgrp, cft);
- int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ return 0;
}
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- s64 val = cft->read_s64(cgrp, cft);
- int len = sprintf(tmp, "%lld\n", (long long) val);
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
+ }
+ up_read(&css_set_rwsem);
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
+ }
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
-static ssize_t cgroup_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ unsigned int enable = 0, disable = 0;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
+
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name) ||
+ ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
- if (cgroup_is_removed(cgrp))
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- if (cft->read)
- return cft->read(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_u64)
- return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_s64)
- return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
- return -EINVAL;
-}
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->child_subsys_mask & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
-struct cgroup_seqfile_state {
- struct cftype *cft;
- struct cgroup *cgroup;
-};
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->child_subsys_mask & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
-static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+ /*
+ * Create csses for enables and update child_subsys_mask. This
+ * changes cgroup_e_css() results which in turn makes the
+ * subsequent cgroup_update_dfl_csses() associate all tasks in the
+ * subtree to the updated csses.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ ret = create_css(child, ss);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ cgrp->child_subsys_mask |= enable;
+ cgrp->child_subsys_mask &= ~disable;
+
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /* all tasks are now migrated away from the old csses, kill them */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp)
+ kill_css(cgroup_css(child, ss));
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+
+err_undo_css:
+ cgrp->child_subsys_mask &= ~enable;
+ cgrp->child_subsys_mask |= disable;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+ if (css)
+ kill_css(css);
+ }
+ }
+ goto out_unlock;
+}
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
{
- struct seq_file *sf = cb->state;
- return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
}
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- struct cgroup_seqfile_state *state = m->private;
- struct cftype *cft = state->cft;
- if (cft->read_map) {
- struct cgroup_map_cb cb = {
- .fill = cgroup_map_add,
- .state = m,
- };
- return cft->read_map(state->cgroup, cft, &cb);
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
}
- return cft->read_seq_string(state->cgroup, cft, m);
+
+ return ret ?: nbytes;
}
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- struct seq_file *seq = file->private_data;
- kfree(seq->private);
- return single_release(inode, file);
+ return seq_cft(seq)->seq_start(seq, ppos);
}
-static const struct file_operations cgroup_seqfile_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = seq_lseek,
- .release = cgroup_seqfile_release,
-};
-
-static int cgroup_file_open(struct inode *inode, struct file *file)
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- int err;
- struct cftype *cft;
-
- err = generic_file_open(inode, file);
- if (err)
- return err;
- cft = __d_cft(file->f_dentry);
-
- if (cft->read_map || cft->read_seq_string) {
- struct cgroup_seqfile_state *state =
- kzalloc(sizeof(*state), GFP_USER);
- if (!state)
- return -ENOMEM;
- state->cft = cft;
- state->cgroup = __d_cgrp(file->f_dentry->d_parent);
- file->f_op = &cgroup_seqfile_operations;
- err = single_open(file, cgroup_seqfile_show, state);
- if (err < 0)
- kfree(state);
- } else if (cft->open)
- err = cft->open(inode, file);
- else
- err = 0;
+ return seq_cft(seq)->seq_next(seq, v, ppos);
+}
- return err;
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
+{
+ seq_cft(seq)->seq_stop(seq, v);
}
-static int cgroup_file_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- if (cft->release)
- return cft->release(inode, file);
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
+
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
+
+ if (cft->read_u64)
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
return 0;
}
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
+
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
{
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
+ if (kn->parent != new_parent)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ /*
+ * This isn't a proper migration and its usefulness is very
+ * limited. Disallow if sane_behavior.
+ */
+ if (cgroup_sane_behavior(cgrp))
+ return -EPERM;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
}
-static const struct file_operations cgroup_file_operations = {
- .read = cgroup_file_read,
- .write = cgroup_file_write,
- .llseek = generic_file_llseek,
- .open = cgroup_file_open,
- .release = cgroup_file_release,
-};
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
-static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = cgroup_lookup,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
-};
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
- d_add(dentry, NULL);
- return NULL;
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+ int ret;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, false, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
}
-/*
- * Check if a file is a control file
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails. If addition fails, this
+ * function doesn't remove files already added. The caller is responsible
+ * for cleaning up.
*/
-static inline struct cftype *__file_cft(struct file *file)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
- return ERR_PTR(-EINVAL);
- return __d_cft(file->f_dentry);
+ struct cftype *cft;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
+ continue;
+
+ if (is_add) {
+ ret = cgroup_add_file(cgrp, cft);
+ if (ret) {
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
+ return ret;
+ }
+ } else {
+ cgroup_rm_file(cgrp, cft);
+ }
+ }
+ return 0;
}
-static int cgroup_create_file(struct dentry *dentry, umode_t mode,
- struct super_block *sb)
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- struct inode *inode;
+ LIST_HEAD(pending);
+ struct cgroup_subsys *ss = cfts[0].ss;
+ struct cgroup *root = &ss->root->cgrp;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
+ lockdep_assert_held(&cgroup_mutex);
- inode = cgroup_new_inode(mode, sb);
- if (!inode)
- return -ENOMEM;
+ /* add/rm files for all cgroups created before */
+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_is_dead(cgrp))
+ continue;
+
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ if (ret)
+ break;
+ }
- if (S_ISDIR(mode)) {
- inode->i_op = &cgroup_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
+}
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+static void cgroup_exit_cftypes(struct cftype *cfts)
+{
+ struct cftype *cft;
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cgroup_file_operations;
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
}
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
+ }
+
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
+ return 0;
+}
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
*/
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- umode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
- }
- dput(dentry);
+int cgroup_rm_cftypes(struct cftype *cfts)
+{
+ int ret;
- return error;
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
*
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
*/
-static umode_t cgroup_file_mode(const struct cftype *cft)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
- umode_t mode = 0;
+ int ret;
- if (cft->mode)
- return cft->mode;
+ if (ss->disabled)
+ return 0;
- if (cft->read || cft->read_u64 || cft->read_s64 ||
- cft->read_map || cft->read_seq_string)
- mode |= S_IRUGO;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
- if (cft->write || cft->write_u64 || cft->write_s64 ||
- cft->write_string || cft->trigger)
- mode |= S_IWUSR;
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
- return mode;
+ mutex_lock(&cgroup_mutex);
+
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
+
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
-int cgroup_add_file(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype *cft)
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
{
- struct dentry *dir = cgrp->dentry;
- struct dentry *dentry;
- int error;
- umode_t mode;
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ up_read(&css_set_rwsem);
+ return count;
+}
- char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
- if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
- strcpy(name, subsys->name);
- strcat(name, ".");
+/**
+ * css_next_child - find the next child of a given css
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
+ *
+ * This function returns the next child of @parent and should be called
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /*
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() inbetween iterations.
+ *
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
+ */
+ if (!pos) {
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
+ } else {
+ list_for_each_entry_rcu(next, &parent->children, sibling)
+ if (next->serial_nr > pos->serial_nr)
+ break;
}
- strcat(name, cft->name);
- BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
- dentry = lookup_one_len(name, dir, strlen(name));
- if (!IS_ERR(dentry)) {
- mode = cgroup_file_mode(cft);
- error = cgroup_create_file(dentry, mode | S_IFREG,
- cgrp->root->sb);
- if (!error)
- dentry->d_fsdata = (void *)cft;
- dput(dentry);
- } else
- error = PTR_ERR(dentry);
- return error;
+
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_add_file);
-int cgroup_add_files(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype cft[],
- int count)
+/**
+ * css_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_pre(). Find the next descendant
+ * to visit for pre-order traversal of @root's descendants. @root is
+ * included in the iteration and the first node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
{
- int i, err;
- for (i = 0; i < count; i++) {
- err = cgroup_add_file(cgrp, subsys, &cft[i]);
- if (err)
- return err;
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
}
- return 0;
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_add_files);
/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
*
- * Return the number of tasks in the cgroup.
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
*/
-int cgroup_task_count(const struct cgroup *cgrp)
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
- int count = 0;
- struct cg_cgroup_link *link;
+ struct cgroup_subsys_state *last, *tmp;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
- count += atomic_read(&link->cg->refcount);
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
}
- read_unlock(&css_set_lock);
- return count;
+ rcu_read_unlock();
+ return ret;
}
-/*
- * Advance a list_head iterator. The iterator should be positioned at
- * the start of a css_set
+/**
+ * css_advance_task_iter - advance a task itererator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
*/
-static void cgroup_advance_iter(struct cgroup *cgrp,
- struct cgroup_iter *it)
+static void css_advance_task_iter(struct css_task_iter *it)
{
- struct list_head *l = it->cg_link;
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct list_head *l = it->cset_pos;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &cgrp->css_sets) {
- it->cg_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
- cg = link->cg;
- } while (list_empty(&cg->tasks));
- it->cg_link = l;
- it->task = cg->tasks.next;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
-/*
- * To reduce the fork() overhead for systems that are not actually
- * using their cgroups capability, we don't maintain the lists running
- * through each css_set to its tasks until we see the list actually
- * used - in other words after the first call to cgroup_iter_start().
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @it: the task iterator to use
*
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
+ *
+ * Note that this function acquires a lock which is released when the
+ * iteration finishes. The caller can't sleep while iteration is in
+ * progress.
*/
-static void cgroup_enable_task_cg_lists(void)
+void css_task_iter_start(struct cgroup_subsys_state *css,
+ struct css_task_iter *it)
+ __acquires(css_set_rwsem)
{
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- do_each_thread(g, p) {
- task_lock(p);
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- */
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
- list_add(&p->cg_list, &p->cgroups->tasks);
- task_unlock(p);
- } while_each_thread(g, p);
- write_unlock(&css_set_lock);
-}
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
-void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
- __acquires(css_set_lock)
-{
- /*
- * The first time anyone tries to iterate across a cgroup,
- * we need to enable the list linking each css_set to its
- * tasks, and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ down_read(&css_set_rwsem);
+
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
- read_lock(&css_set_lock);
- it->cg_link = &cgrp->css_sets;
- cgroup_advance_iter(cgrp, it);
+ it->cset_head = it->cset_pos;
+
+ css_advance_task_iter(it);
}
-struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
- struct cgroup_iter *it)
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cg_cgroup_link *link;
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cg_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
- /* Advance iterator to find next entry */
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
l = l->next;
- link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
- if (l == &link->cg->tasks) {
- /* We reached the end of this task list - move on to
- * the next cg_cgroup_link */
- cgroup_advance_iter(cgrp, it);
- } else {
- it->task = l;
- }
- return res;
-}
-void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
- __releases(css_set_lock)
-{
- read_unlock(&css_set_lock);
-}
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
-{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively) simultaneously.
- */
- return t1 > t2;
- }
+ if (l == it->mg_tasks_head)
+ css_advance_task_iter(it);
+ else
+ it->task_pos = l;
+
+ return res;
}
-/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
*/
-static inline int started_after(void *p1, void *p2)
+void css_task_iter_end(struct css_task_iter *it)
+ __releases(css_set_rwsem)
{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
+ up_read(&css_set_rwsem);
}
/**
- * cgroup_scan_tasks - iterate though all the tasks in a cgroup
- * @scan: struct cgroup_scanner containing arguments for the scan
- *
- * Arguments include pointers to callback functions test_task() and
- * process_task().
- * Iterate through all the tasks in a cgroup, calling test_task() for each,
- * and if it returns true, call process_task() for it also.
- * The test_task pointer may be NULL, meaning always true (select all tasks).
- * Effectively duplicates cgroup_iter_{start,next,end}()
- * but does not lock css_set_lock for the call to process_task().
- * The struct cgroup_scanner may be embedded in any structure of the caller's
- * creation.
- * It is guaranteed that process_task() will act on every task that
- * is a member of the cgroup for the duration of this call. This
- * function may or may not call process_task() for tasks that exit
- * or move to a different cgroup during the call, or are forked or
- * move into the cgroup during the call.
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
*
- * Note that test_task() may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should
- * be cheap.
- * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
- * pre-allocated and will be used for heap operations (and its "gt" member will
- * be overwritten), else a temporary heap will be used (allocation of which
- * may cause this function to fail).
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
*/
-int cgroup_scan_tasks(struct cgroup_scanner *scan)
-{
- int retval, i;
- struct cgroup_iter it;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
- struct ptr_heap tmp_heap;
- struct ptr_heap *heap;
- struct timespec latest_time = { 0, 0 };
-
- if (scan->heap) {
- /* The caller supplied our heap and pre-allocated its memory */
- heap = scan->heap;
- heap->gt = &started_after;
- } else {
- /* We need to allocate our own heap memory */
- heap = &tmp_heap;
- retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- /* cannot allocate the heap */
- return retval;
- }
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* all tasks in @from are being moved, all csets are source */
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ up_read(&css_set_rwsem);
+
+ ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ if (ret)
+ goto out_err;
- again:
/*
- * Scan tasks in the cgroup, using the scanner's "test_task" callback
- * to determine which are of interest, and using the scanner's
- * "process_task" callback to process any of them that need an update.
- * Since we don't want to hold any locks during the task updates,
- * gather tasks to be processed in a heap structure.
- * The heap is sorted by descending task start time.
- * If the statically-sized heap fills up, we overflow tasks that
- * started later, and in future iterations only consider tasks that
- * started after the latest task in the previous pass. This
- * guarantees forward progress and that we don't miss any tasks.
+ * Migrate tasks one-by-one until @form is empty. This fails iff
+ * ->can_attach() fails.
*/
- heap->size = 0;
- cgroup_iter_start(scan->cg, &it);
- while ((p = cgroup_iter_next(scan->cg, &it))) {
- /*
- * Only affect tasks that qualify per the caller's callback,
- * if he provided one
- */
- if (scan->test_task && !scan->test_task(p, scan))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(heap, p);
- if (dropped == NULL) {
- /*
- * The new task was inserted; the heap wasn't
- * previously full
- */
- get_task_struct(p);
- } else if (dropped != p) {
- /*
- * The new task was inserted, and pushed out a
- * different task
- */
- get_task_struct(p);
- put_task_struct(dropped);
- }
- /*
- * Else the new task was newer than anything already in
- * the heap and wasn't inserted
- */
- }
- cgroup_iter_end(scan->cg, &it);
-
- if (heap->size) {
- for (i = 0; i < heap->size; i++) {
- struct task_struct *q = heap->ptrs[i];
- if (i == 0) {
- latest_time = q->start_time;
- latest_task = q;
- }
- /* Process the task per the caller's callback */
- scan->process_task(q, scan);
- put_task_struct(q);
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(to, task, false);
+ put_task_struct(task);
}
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't get processed.
- * Not the most efficient way to do it, but it avoids
- * having to take callback_mutex in the fork path
- */
- goto again;
- }
- if (heap == &tmp_heap)
- heap_free(&tmp_heap);
- return 0;
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&preloaded_csets);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
/*
@@ -3043,6 +3565,36 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*
*/
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3056,6 +3608,7 @@ static void *pidlist_allocate(int count)
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
+
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
@@ -3063,35 +3616,55 @@ static void pidlist_free(void *p)
else
kfree(p);
}
-static void *pidlist_resize(void *p, int newcount)
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
- void *newlist;
- /* note: if new alloc fails, old p will still be valid either way */
- if (is_vmalloc_addr(p)) {
- newlist = vmalloc(newcount * sizeof(pid_t));
- if (!newlist)
- return NULL;
- memcpy(newlist, p, newcount * sizeof(pid_t));
- vfree(p);
- } else {
- newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
}
- return newlist;
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
*/
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
- pid_t *list = *p;
- pid_t *newlist;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
@@ -3112,67 +3685,95 @@ static int pidlist_uniq(pid_t **p, int length)
dest++;
}
after:
- /*
- * if the length difference is large enough, we want to allocate a
- * smaller buffer to save memory. if this fails due to out of memory,
- * we'll just stay with what we've got.
- */
- if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = pidlist_resize(list, dest);
- if (newlist)
- *p = newlist;
- }
return dest;
}
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+ if (cgroup_sane_behavior(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+}
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
+static int fried_cmppid(const void *a, const void *b)
+{
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
- /*
- * We can't drop the pidlist_mutex before taking the l->mutex in case
- * the last ref-holder is trying to remove l from the list at the same
- * time. Holding the pidlist_mutex precludes somebody taking whichever
- * list we find out from under us - compare release_pid_array().
- */
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry(l, &cgrp->pidlists, links) {
- if (l->key.type == type && l->key.ns == ns) {
- /* make sure l doesn't vanish out from under us */
- down_write(&l->mutex);
- mutex_unlock(&cgrp->pidlist_mutex);
- return l;
- }
- }
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
/* entry not found; create a new one */
- l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
return l;
- }
- init_rwsem(&l->mutex);
- down_write(&l->mutex);
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
- l->key.ns = get_pid_ns(ns);
- l->use_count = 0; /* don't increment here */
- l->list = NULL;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
- mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
@@ -3185,10 +3786,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
struct cgroup_pidlist *l;
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
@@ -3200,8 +3803,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
@@ -3212,23 +3815,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (cgroup_sane_behavior(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(&array, length);
- l = cgroup_pidlist_find(cgrp, type);
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
- /* store array, freeing old if necessary - lock already held */
+
+ /* store array, freeing old if necessary */
pidlist_free(l->list);
l->list = array;
l->length = length;
- l->use_count++;
- up_write(&l->mutex);
*lp = l;
return 0;
}
@@ -3244,24 +3851,34 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
- int ret = -EINVAL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
/*
- * Validate dentry by checking the superblock operations,
- * and make sure it's a directory.
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
*/
- if (dentry->d_sb->s_op != &cgroup_ops ||
- !S_ISDIR(dentry->d_inode->i_mode))
- goto err;
-
- ret = 0;
- cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
@@ -3281,10 +3898,10 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
break;
}
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
-err:
- return ret;
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
@@ -3302,20 +3919,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup_pidlist *l = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
- int *iter;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
- down_read(&l->mutex);
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
- if (l->list[mid] == pid) {
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
@@ -3326,19 +3968,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
- *pos = *iter;
+ *pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup_pidlist *l = s->private;
- up_read(&l->mutex);
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup_pidlist *l = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -3349,7 +3997,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
- *pos = *p;
+ *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
@@ -3359,756 +4007,714 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return seq_printf(s, "%d\n", *(int *)v);
}
-/*
- * seq_operations functions for iterating on pidlists through seq_file -
- * independent of whether it's tasks or procs
- */
-static const struct seq_operations cgroup_pidlist_seq_operations = {
- .start = cgroup_pidlist_start,
- .stop = cgroup_pidlist_stop,
- .next = cgroup_pidlist_next,
- .show = cgroup_pidlist_show,
-};
-
-static void cgroup_release_pid_array(struct cgroup_pidlist *l)
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- /*
- * the case where we're the last user of this particular pidlist will
- * have us remove it from the cgroup's list, which entails taking the
- * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
- * pidlist_mutex, we have to take pidlist_mutex first.
- */
- mutex_lock(&l->owner->pidlist_mutex);
- down_write(&l->mutex);
- BUG_ON(!l->use_count);
- if (!--l->use_count) {
- /* we're the last user if refcount is 0; remove and free */
- list_del(&l->links);
- mutex_unlock(&l->owner->pidlist_mutex);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- up_write(&l->mutex);
- kfree(l);
- return;
- }
- mutex_unlock(&l->owner->pidlist_mutex);
- up_write(&l->mutex);
+ return notify_on_release(css->cgroup);
}
-static int cgroup_pidlist_release(struct inode *inode, struct file *file)
-{
- struct cgroup_pidlist *l;
- if (!(file->f_mode & FMODE_READ))
- return 0;
- /*
- * the seq_file will only be initialized if the file was opened for
- * reading; hence we check if it's not null only in that case.
- */
- l = ((struct seq_file *)file->private_data)->private;
- cgroup_release_pid_array(l);
- return seq_release(inode, file);
-}
-
-static const struct file_operations cgroup_pidlist_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .write = cgroup_file_write,
- .release = cgroup_pidlist_release,
-};
-
-/*
- * The following functions handle opens on a file that displays a pidlist
- * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
- * in the cgroup.
- */
-/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l;
- int retval;
-
- /* Nothing to do for write-only files */
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- /* have the array populated */
- retval = pidlist_array_load(cgrp, type, &l);
- if (retval)
- return retval;
- /* configure file information */
- file->f_op = &cgroup_pidlist_operations;
-
- retval = seq_open(file, &cgroup_pidlist_seq_operations);
- if (retval) {
- cgroup_release_pid_array(l);
- return retval;
- }
- ((struct seq_file *)file->private_data)->private = l;
+ clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
-{
- return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
-}
-static int cgroup_procs_open(struct inode *unused, struct file *file)
-{
- return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
-}
-static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft)
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return notify_on_release(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
-static int cgroup_write_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft,
- u64 val)
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
-/*
- * Unregister event and free resources.
+static struct cftype cgroup_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .flags = CFTYPE_INSANE,
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_ONLY_ON_DFL,
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
+ },
+
+ /*
+ * Historical crazy stuff. These don't have "cgroup." prefix and
+ * don't exist if sane_behavior. If you're depending on these, be
+ * prepared to be burned.
+ */
+ {
+ .name = "tasks",
+ .flags = CFTYPE_INSANE, /* use "procs" instead */
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "notify_on_release",
+ .flags = CFTYPE_INSANE,
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/**
+ * cgroup_populate_dir - create subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be added
*
- * Gets called from workqueue.
+ * On failure, no file is added.
*/
-static void cgroup_event_remove(struct work_struct *work)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- struct cgroup_event *event = container_of(work, struct cgroup_event,
- remove);
- struct cgroup *cgrp = event->cgrp;
+ struct cgroup_subsys *ss;
+ int i, ret = 0;
- event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ /* process cftsets of each subsystem */
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- eventfd_ctx_put(event->eventfd);
- kfree(event);
- dput(cgrp->dentry);
-}
+ if (!(subsys_mask & (1 << i)))
+ continue;
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
-{
- struct cgroup_event *event = container_of(wait,
- struct cgroup_event, wait);
- struct cgroup *cgrp = event->cgrp;
- unsigned long flags = (unsigned long)key;
-
- if (flags & POLLHUP) {
- __remove_wait_queue(event->wqh, &event->wait);
- spin_lock(&cgrp->event_list_lock);
- list_del(&event->list);
- spin_unlock(&cgrp->event_list_lock);
- /*
- * We are in atomic context, but cgroup_event_remove() may
- * sleep, so we have to call it in workqueue.
- */
- schedule_work(&event->remove);
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
+ if (ret < 0)
+ goto err;
+ }
}
-
return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
- wait_queue_head_t *wqh, poll_table *pt)
-{
- struct cgroup_event *event = container_of(pt,
- struct cgroup_event, pt);
-
- event->wqh = wqh;
- add_wait_queue(wqh, &event->wait);
+err:
+ cgroup_clear_dir(cgrp, subsys_mask);
+ return ret;
}
/*
- * Parse input and register new cgroup event handler.
+ * css destruction is four-stage process.
+ *
+ * 1. Destruction starts. Killing of the percpu_ref is initiated.
+ * Implemented in kill_css().
*
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
+ * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
+ *
+ * 3. When the percpu_ref reaches zero, the only possible remaining
+ * accessors are inside RCU read sections. css_release() schedules the
+ * RCU callback.
+ *
+ * 4. After the grace period, the css can be freed. Implemented in
+ * css_free_work_fn().
+ *
+ * It is actually hairier because both step 2 and 4 require process context
+ * and thus involve punting to css->destroy_work adding two additional
+ * steps to the already complex sequence.
*/
-static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
-{
- struct cgroup_event *event = NULL;
- unsigned int efd, cfd;
- struct file *efile = NULL;
- struct file *cfile = NULL;
- char *endp;
- int ret;
+static void css_free_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup *cgrp = css->cgroup;
- efd = simple_strtoul(buffer, &endp, 10);
- if (*endp != ' ')
- return -EINVAL;
- buffer = endp + 1;
+ if (css->ss) {
+ /* css free path */
+ if (css->parent)
+ css_put(css->parent);
- cfd = simple_strtoul(buffer, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
- return -EINVAL;
- buffer = endp + 1;
+ css->ss->css_free(css);
+ cgroup_put(cgrp);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ cgroup_pidlist_destroy_all(cgrp);
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
- event->cgrp = cgrp;
- INIT_LIST_HEAD(&event->list);
- init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
- init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
- INIT_WORK(&event->remove, cgroup_event_remove);
-
- efile = eventfd_fget(efd);
- if (IS_ERR(efile)) {
- ret = PTR_ERR(efile);
- goto fail;
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
+}
- event->eventfd = eventfd_ctx_fileget(efile);
- if (IS_ERR(event->eventfd)) {
- ret = PTR_ERR(event->eventfd);
- goto fail;
- }
+static void css_free_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct cgroup_subsys_state *css =
+ container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
- cfile = fget(cfd);
- if (!cfile) {
- ret = -EBADF;
- goto fail;
- }
+ INIT_WORK(&css->destroy_work, css_free_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
- /* the process need read permission on control file */
- /* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
- if (ret < 0)
- goto fail;
+static void css_release_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
- event->cft = __file_cft(cfile);
- if (IS_ERR(event->cft)) {
- ret = PTR_ERR(event->cft);
- goto fail;
- }
+ mutex_lock(&cgroup_mutex);
- if (!event->cft->register_event || !event->cft->unregister_event) {
- ret = -EINVAL;
- goto fail;
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (ss) {
+ /* css release path */
+ cgroup_idr_remove(&ss->css_idr, css->id);
+ } else {
+ /* cgroup release path */
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
}
- ret = event->cft->register_event(cgrp, event->cft,
- event->eventfd, buffer);
- if (ret)
- goto fail;
+ mutex_unlock(&cgroup_mutex);
- if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
- event->cft->unregister_event(cgrp, event->cft, event->eventfd);
- ret = 0;
- goto fail;
- }
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+}
- /*
- * Events should be removed after rmdir of cgroup directory, but before
- * destroying subsystem state objects. Let's take reference to cgroup
- * directory dentry to do that.
- */
- dget(cgrp->dentry);
+static void css_release(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
- spin_lock(&cgrp->event_list_lock);
- list_add(&event->list, &cgrp->event_list);
- spin_unlock(&cgrp->event_list_lock);
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
- fput(cfile);
- fput(efile);
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ lockdep_assert_held(&cgroup_mutex);
- return 0;
+ cgroup_get(cgrp);
-fail:
- if (cfile)
- fput(cfile);
+ memset(css, 0, sizeof(*css));
+ css->cgroup = cgrp;
+ css->ss = ss;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
+
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
+ }
- if (event && event->eventfd && !IS_ERR(event->eventfd))
- eventfd_ctx_put(event->eventfd);
+ BUG_ON(cgroup_css(cgrp, ss));
+}
- if (!IS_ERR_OR_NULL(efile))
- fput(efile);
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ int ret = 0;
- kfree(event);
+ lockdep_assert_held(&cgroup_mutex);
+ if (ss->css_online)
+ ret = ss->css_online(css);
+ if (!ret) {
+ css->flags |= CSS_ONLINE;
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+ }
return ret;
}
-static u64 cgroup_clone_children_read(struct cgroup *cgrp,
- struct cftype *cft)
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys_state *css)
{
- return clone_children(cgrp);
-}
+ struct cgroup_subsys *ss = css->ss;
-static int cgroup_clone_children_write(struct cgroup *cgrp,
- struct cftype *cft,
- u64 val)
-{
- if (val)
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
- else
- clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
- return 0;
-}
+ lockdep_assert_held(&cgroup_mutex);
-/*
- * for the common functions, 'private' gives the type of file
- */
-/* for hysterical raisins, we can't put this on the older files */
-#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
-static struct cftype files[] = {
- {
- .name = "tasks",
- .open = cgroup_tasks_open,
- .write_u64 = cgroup_tasks_write,
- .release = cgroup_pidlist_release,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = CGROUP_FILE_GENERIC_PREFIX "procs",
- .open = cgroup_procs_open,
- .write_u64 = cgroup_procs_write,
- .release = cgroup_pidlist_release,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = "notify_on_release",
- .read_u64 = cgroup_read_notify_on_release,
- .write_u64 = cgroup_write_notify_on_release,
- },
- {
- .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
- .write_string = cgroup_write_event_control,
- .mode = S_IWUGO,
- },
- {
- .name = "cgroup.clone_children",
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
-};
+ if (!(css->flags & CSS_ONLINE))
+ return;
-static struct cftype cft_release_agent = {
- .name = "release_agent",
- .read_seq_string = cgroup_release_agent_show,
- .write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
-};
+ if (ss->css_offline)
+ ss->css_offline(css);
+
+ css->flags &= ~CSS_ONLINE;
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
+}
-static int cgroup_populate_dir(struct cgroup *cgrp)
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ *
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
+ */
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
+ struct cgroup_subsys_state *css;
int err;
- struct cgroup_subsys *ss;
- /* First clear out any existing files */
- cgroup_clear_directory(cgrp->dentry);
+ lockdep_assert_held(&cgroup_mutex);
- err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
- if (err < 0)
- return err;
+ css = ss->css_alloc(parent_css);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
- if (cgrp == cgrp->top_cgroup) {
- if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
- return err;
- }
+ init_and_link_css(css, ss, cgrp);
- for_each_subsys(cgrp->root, ss) {
- if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
- return err;
- }
- /* This cgroup is ready now */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- /*
- * Update id->css pointer and make this css visible from
- * CSS ID functions. This pointer will be dereferened
- * from RCU-read-side without locks.
- */
- if (css->id)
- rcu_assign_pointer(css->id->css, css);
- }
+ err = percpu_ref_init(&css->refcnt, css_release);
+ if (err)
+ goto err_free_css;
- return 0;
-}
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
-static void init_cgroup_css(struct cgroup_subsys_state *css,
- struct cgroup_subsys *ss,
- struct cgroup *cgrp)
-{
- css->cgroup = cgrp;
- atomic_set(&css->refcnt, 1);
- css->flags = 0;
- css->id = NULL;
- if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
- BUG_ON(cgrp->subsys[ss->subsys_id]);
- cgrp->subsys[ss->subsys_id] = css;
-}
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
-{
- /* We need to take each hierarchy_mutex in a consistent order */
- int i;
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
- /*
- * No worry about a race with rebind_subsystems that might mess up the
- * locking order, since both parties are under cgroup_mutex.
- */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->root == root)
- mutex_lock(&ss->hierarchy_mutex);
+ err = online_css(css);
+ if (err)
+ goto err_list_del;
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ cgroup_parent(parent)) {
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
+ ss->warned_broken_hierarchy = true;
}
-}
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
-{
- int i;
+ return 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->root == root)
- mutex_unlock(&ss->hierarchy_mutex);
- }
+err_list_del:
+ list_del_rcu(&css->sibling);
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
+err_free_percpu_ref:
+ percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+ return err;
}
-/*
- * cgroup_create - create a cgroup
- * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
- */
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
{
- struct cgroup *cgrp;
- struct cgroupfs_root *root = parent->root;
- int err = 0;
+ struct cgroup *parent, *cgrp;
+ struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct super_block *sb = root->sb;
+ struct kernfs_node *kn;
+ int ssid, ret;
+
+ parent = cgroup_kn_lock_live(parent_kn);
+ if (!parent)
+ return -ENODEV;
+ root = parent->root;
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp)
- return -ENOMEM;
+ if (!cgrp) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- /* Grab a reference on the superblock so the hierarchy doesn't
- * get deleted on unmount if there are child cgroups. This
- * can be done outside cgroup_mutex, since the sb can't
- * disappear while someone has an open control file on the
- * fs */
- atomic_inc(&sb->s_active);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out_free_cgrp;
- mutex_lock(&cgroup_mutex);
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (cgrp->id < 0) {
+ ret = -ENOMEM;
+ goto out_cancel_ref;
+ }
init_cgroup_housekeeping(cgrp);
- cgrp->parent = parent;
- cgrp->root = parent->root;
- cgrp->top_cgroup = parent->top_cgroup;
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- if (clone_children(parent))
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-
- for_each_subsys(root, ss) {
- struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- if (IS_ERR(css)) {
- err = PTR_ERR(css);
- goto err_destroy;
- }
- init_cgroup_css(css, ss, cgrp);
- if (ss->use_id) {
- err = alloc_css_id(ss, parent, cgrp);
- if (err)
- goto err_destroy;
- }
- /* At error, ->destroy() callback has to free assigned ID. */
- if (clone_children(parent) && ss->post_clone)
- ss->post_clone(ss, cgrp);
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_id;
}
+ cgrp->kn = kn;
- cgroup_lock_hierarchy(root);
- list_add(&cgrp->sibling, &cgrp->parent->children);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups++;
+ /*
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
- err = cgroup_create_dir(cgrp, dentry, mode);
- if (err < 0)
- goto err_remove;
+ cgrp->self.serial_nr = css_serial_nr_next++;
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
+ /* allocation complete, commit to creation */
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
- err = cgroup_populate_dir(cgrp);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ /*
+ * @cgrp is now fully operational. If something fails after this
+ * point, it'll be released via the normal destruction path.
+ */
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
- return 0;
+ ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (ret)
+ goto out_destroy;
- err_remove:
+ /* let's create and online css's */
+ for_each_subsys(ss, ssid) {
+ if (parent->child_subsys_mask & (1 << ssid)) {
+ ret = create_css(cgrp, ss);
+ if (ret)
+ goto out_destroy;
+ }
+ }
- cgroup_lock_hierarchy(root);
- list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups--;
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * child_subsys_mask from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->child_subsys_mask = parent->child_subsys_mask;
- err_destroy:
+ kernfs_activate(kn);
- for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
- }
+ ret = 0;
+ goto out_unlock;
- mutex_unlock(&cgroup_mutex);
+out_free_id:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_cancel_ref:
+ percpu_ref_cancel_init(&cgrp->self.refcnt);
+out_free_cgrp:
+ kfree(cgrp);
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
+
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ goto out_unlock;
+}
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
+/*
+ * This is called when the refcnt of a css is confirmed to be killed.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initate destruction and put the css ref from kill_css().
+ */
+static void css_killed_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
- kfree(cgrp);
- return err;
+ mutex_lock(&cgroup_mutex);
+ offline_css(css);
+ mutex_unlock(&cgroup_mutex);
+
+ css_put(css);
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+/* css kill confirmation processing requires process context, bounce */
+static void css_killed_ref_fn(struct percpu_ref *ref)
{
- struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
- /* the vfs holds inode->i_mutex already */
- return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-static int cgroup_has_css_refs(struct cgroup *cgrp)
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
+ */
+static void kill_css(struct cgroup_subsys_state *css)
{
- /* Check the reference count on each subsystem. Since we
- * already established that there are no tasks in the
- * cgroup, if the css refcount is also 1, then there should
- * be no outstanding references, so the subsystem is safe to
- * destroy. We scan across all subsystems rather than using
- * the per-hierarchy linked list of mounted subsystems since
- * we can be called via check_for_release() with no
- * synchronization other than RCU, and the subsystem linked
- * list isn't RCU-safe */
- int i;
+ lockdep_assert_held(&cgroup_mutex);
+
/*
- * We won't need to lock the subsys array, because the subsystems
- * we're concerned about aren't going anywhere since our cgroup root
- * has a reference on them.
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
*/
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
- /* Skip subsystems not present or not in this hierarchy */
- if (ss == NULL || ss->root != cgrp->root)
- continue;
- css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway. */
- if (css && (atomic_read(&css->refcnt) > 1))
- return 1;
- }
- return 0;
-}
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- */
+ /*
+ * Killing would put the base ref, but we need to keep it alive
+ * until after ->css_offline().
+ */
+ css_get(css);
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
- local_irq_save(flags);
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- int refcnt;
- while (1) {
- /* We can only remove a CSS with a refcnt==1 */
- refcnt = atomic_read(&css->refcnt);
- if (refcnt > 1) {
- failed = true;
- goto done;
- }
- BUG_ON(!refcnt);
- /*
- * Drop the refcnt to 0 while we check other
- * subsystems. This will cause any racing
- * css_tryget() to spin until we set the
- * CSS_REMOVED bits or abort
- */
- if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
- break;
- cpu_relax();
- }
- }
- done:
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- if (failed) {
- /*
- * Restore old refcnt if we previously managed
- * to clear it from 1 to 0
- */
- if (!atomic_read(&css->refcnt))
- atomic_set(&css->refcnt, 1);
- } else {
- /* Commit the fact that the CSS is removed */
- set_bit(CSS_REMOVED, &css->flags);
- }
- }
- local_irq_restore(flags);
- return !failed;
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is
+ * invoked, no new css reference will be given out via
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
+ * proceed to offlining css's because percpu_ref_kill() doesn't
+ * guarantee that the ref is seen as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
- DEFINE_WAIT(wait);
- struct cgroup_event *event, *tmp;
- int ret;
+ struct cgroup_subsys_state *css;
+ bool empty;
+ int ssid;
- /* the vfs holds both inode->i_mutex already */
-again:
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
+ */
+ down_read(&css_set_rwsem);
+ empty = list_empty(&cgrp->cset_links);
+ up_read(&css_set_rwsem);
+ if (!empty)
return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
/*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
*/
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ if (css_has_online_children(&cgrp->self))
+ return -EBUSY;
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Mark @cgrp dead. This prevents further task migration and child
+ * creation by disabling cgroup_lock_live_group().
*/
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
+ cgrp->self.flags &= ~CSS_ONLINE;
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ /* initiate massacre of all css's */
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
+ /* CSS_ONLINE is clear, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- cgroup_lock_hierarchy(cgrp->root);
- /* delete this cgroup from parent->children */
- list_del_init(&cgrp->sibling);
- cgroup_unlock_hierarchy(cgrp->root);
+ /*
+ * Remove @cgrp directory along with the base files. @cgrp has an
+ * extra ref on its kn.
+ */
+ kernfs_remove(cgrp->kn);
+
+ set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
+ check_for_release(cgroup_parent(cgrp));
+
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
- d = dget(cgrp->dentry);
+ return 0;
+};
+
+static int cgroup_rmdir(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ cgrp = cgroup_kn_lock_live(kn);
+ if (!cgrp)
+ return 0;
+ cgroup_get(cgrp); /* for @kn->priv clearing */
- cgroup_d_remove_dir(d);
- dput(d);
+ ret = cgroup_destroy_locked(cgrp);
- set_bit(CGRP_RELEASABLE, &parent->flags);
- check_for_release(parent);
+ cgroup_kn_unlock(kn);
/*
- * Unregister events and notify userspace.
- * Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace
+ * There are two control paths which try to determine cgroup from
+ * dentry without going through kernfs - cgroupstats_build() and
+ * css_tryget_online_from_dir(). Those are supported by RCU
+ * protecting clearing of cgrp->kn->priv backpointer, which should
+ * happen after all files under it have been removed.
*/
- spin_lock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
- list_del(&event->list);
- remove_wait_queue(event->wqh, &event->wait);
- eventfd_signal(event->eventfd, 1);
- schedule_work(&event->remove);
- }
- spin_unlock(&cgrp->event_list_lock);
+ if (!ret)
+ RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
- mutex_unlock(&cgroup_mutex);
- return 0;
+ cgroup_put(cgrp);
+ return ret;
}
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
- /* Create the top cgroup state for this subsystem */
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
- css = ss->create(ss, dummytop);
+ mutex_lock(&cgroup_mutex);
+
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
+
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
- init_cgroup_css(css, ss, dummytop);
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
- * init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -4117,322 +4723,152 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
- ss->active = 1;
+ BUG_ON(online_css(css));
- /* this function shouldn't be used with modular subsystems, since they
- * need to register a subsys_id, among other things */
- BUG_ON(ss->module);
+ mutex_unlock(&cgroup_mutex);
}
/**
- * cgroup_load_subsys: load and register a modular subsystem at runtime
- * @ss: the subsystem to load
+ * cgroup_init_early - cgroup initialization at system boot
*
- * This function should be called in a modular subsystem's initcall. If the
- * subsystem is built as a module, it will be assigned a new subsys_id and set
- * up for use. If the subsystem is built-in anyway, work is delegated to the
- * simpler cgroup_init_subsys.
+ * Initialize cgroups at system boot, and initialize any
+ * subsystems that request early init.
*/
-int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+int __init cgroup_init_early(void)
{
+ static struct cgroup_sb_opts __initdata opts =
+ { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ struct cgroup_subsys *ss;
int i;
- struct cgroup_subsys_state *css;
-
- /* check name and function validity */
- if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->create == NULL || ss->destroy == NULL)
- return -EINVAL;
- /*
- * we don't support callbacks in modular subsystems. this check is
- * before the ss->module check for consistency; a subsystem that could
- * be a module should still have no callbacks even if the user isn't
- * compiling it as one.
- */
- if (ss->fork || ss->exit)
- return -EINVAL;
+ init_cgroup_root(&cgrp_dfl_root, &opts);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
- /*
- * an optionally modular subsystem is built-in: we want to do nothing,
- * since cgroup_init_subsys will have already taken care of it.
- */
- if (ss->module == NULL) {
- /* a few sanity checks */
- BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
- BUG_ON(subsys[ss->subsys_id] != ss);
- return 0;
- }
+ RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
- /*
- * need to register a subsys id before anything else - for example,
- * init_cgroup_css needs it.
- */
- mutex_lock(&cgroup_mutex);
- /* find the first empty slot in the array */
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- if (subsys[i] == NULL)
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT) {
- /* maximum number of subsystems already registered! */
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- /* assign ourselves the subsys_id */
- ss->subsys_id = i;
- subsys[i] = ss;
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
- /*
- * no ss->create seems to need anything important in the ss struct, so
- * this can happen first (i.e. before the rootnode attachment).
- */
- css = ss->create(ss, dummytop);
- if (IS_ERR(css)) {
- /* failure case - need to deassign the subsys[] slot. */
- subsys[i] = NULL;
- mutex_unlock(&cgroup_mutex);
- return PTR_ERR(css);
- }
-
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
-
- /* our new subsystem will be attached to the dummy hierarchy. */
- init_cgroup_css(css, ss, dummytop);
- /* init_idr must be after init_cgroup_css because it sets css->id. */
- if (ss->use_id) {
- int ret = cgroup_init_idr(ss, css);
- if (ret) {
- dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(ss, dummytop);
- subsys[i] = NULL;
- mutex_unlock(&cgroup_mutex);
- return ret;
- }
- }
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
- /*
- * Now we need to entangle the css into the existing css_sets. unlike
- * in cgroup_init_subsys, there are now multiple css_sets, so each one
- * will need a new pointer to it; done by iterating the css_set_table.
- * furthermore, modifying the existing css_sets will corrupt the hash
- * table state, so each changed css_set will need its hash recomputed.
- * this is all done under the css_set_lock.
- */
- write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct css_set *cg;
- struct hlist_node *node, *tmp;
- struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
- hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
- /* skip entries that we already rehashed */
- if (cg->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hlist_del(&cg->hlist);
- /* set new value */
- cg->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- new_bucket = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, new_bucket);
- }
+ if (ss->early_init)
+ cgroup_init_subsys(ss, true);
}
- write_unlock(&css_set_lock);
-
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
- ss->active = 1;
-
- /* success! */
- mutex_unlock(&cgroup_mutex);
return 0;
}
-EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
- * cgroup_unload_subsys: unload a modular subsystem
- * @ss: the subsystem to unload
+ * cgroup_init - cgroup initialization
*
- * This function should be called in a modular subsystem's exitcall. When this
- * function is invoked, the refcount on the subsystem's module will be 0, so
- * the subsystem will not be attached to any hierarchy.
+ * Register cgroup filesystem and /proc file, and initialize
+ * any subsystems that didn't request early init.
*/
-void cgroup_unload_subsys(struct cgroup_subsys *ss)
+int __init cgroup_init(void)
{
- struct cg_cgroup_link *link;
- struct hlist_head *hhead;
-
- BUG_ON(ss->module == NULL);
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid, err;
- /*
- * we shouldn't be called if the subsystem is in use, and the use of
- * try_module_get in parse_cgroupfs_options should ensure that it
- * doesn't start being used while we're killing it off.
- */
- BUG_ON(ss->root != &rootnode);
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
mutex_lock(&cgroup_mutex);
- /* deassign the subsys_id */
- BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
- subsys[ss->subsys_id] = NULL;
-
- /* remove subsystem from rootnode's list of subsystems */
- list_del_init(&ss->sibling);
- /*
- * disentangle the css from all css_sets attached to the dummytop. as
- * in loading, we need to pay our respects to the hashtable gods.
- */
- write_lock(&css_set_lock);
- list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
- struct css_set *cg = link->cg;
-
- hlist_del(&cg->hlist);
- BUG_ON(!cg->subsys[ss->subsys_id]);
- cg->subsys[ss->subsys_id] = NULL;
- hhead = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, hhead);
- }
- write_unlock(&css_set_lock);
+ /* Add init_css_set to the hash table */
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
- /*
- * remove subsystem's css from the dummytop and free it - need to free
- * before marking as null because ss->destroy needs the cgrp->subsys
- * pointer to find their state. note that this also takes care of
- * freeing the css_id.
- */
- ss->destroy(ss, dummytop);
- dummytop->subsys[ss->subsys_id] = NULL;
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
-
-/**
- * cgroup_init_early - cgroup initialization at system boot
- *
- * Initialize cgroups at system boot, and initialize any
- * subsystems that request early init.
- */
-int __init cgroup_init_early(void)
-{
- int i;
- atomic_set(&init_css_set.refcount, 1);
- INIT_LIST_HEAD(&init_css_set.cg_links);
- INIT_LIST_HEAD(&init_css_set.tasks);
- INIT_HLIST_NODE(&init_css_set.hlist);
- css_set_count = 1;
- init_cgroup_root(&rootnode);
- root_count = 1;
- init_task.cgroups = &init_css_set;
-
- init_css_set_link.cg = &init_css_set;
- init_css_set_link.cgrp = dummytop;
- list_add(&init_css_set_link.cgrp_link_list,
- &rootnode.top_cgroup.css_sets);
- list_add(&init_css_set_link.cg_link_list,
- &init_css_set.cg_links);
-
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
- /* at bootup time, we don't worry about modular subsystems */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- BUG_ON(!ss->name);
- BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
- if (ss->subsys_id != i) {
- printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
- ss->name, ss->subsys_id);
- BUG();
- }
- if (ss->early_init)
- cgroup_init_subsys(ss);
- }
- return 0;
-}
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
-/**
- * cgroup_init - cgroup initialization
- *
- * Register cgroup filesystem and /proc file, and initialize
- * any subsystems that didn't request early init.
- */
-int __init cgroup_init(void)
-{
- int err;
- int i;
- struct hlist_head *hhead;
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
- err = bdi_init(&cgroup_backing_dev_info);
- if (err)
- return err;
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
- /* at bootup time, we don't worry about modular subsystems */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (!ss->early_init)
- cgroup_init_subsys(ss);
- if (ss->use_id)
- cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (!ss->disabled) {
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ }
}
- /* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
- BUG_ON(!init_root_id(&rootnode));
-
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
- if (!cgroup_kobj) {
- err = -ENOMEM;
- goto out;
- }
+ if (!cgroup_kobj)
+ return -ENOMEM;
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
- goto out;
+ return err;
}
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+ return 0;
+}
-out:
- if (err)
- bdi_destroy(&cgroup_backing_dev_info);
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
- return err;
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+
+ return 0;
}
+core_initcall(cgroup_wq_init);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- * doesn't really matter if tsk->cgroup changes after we read it,
- * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- * anyway. No need to check that tsk->cgroup != NULL, thanks to
- * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- * cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
-static int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -4445,28 +4881,36 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- for_each_active_root(root) {
+ for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int count = 0;
+ int ssid, count = 0;
+
+ if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(root, ss)
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
out_unlock:
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
@@ -4475,22 +4919,10 @@ out:
return retval;
}
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-const struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
+ struct cgroup_subsys *ss;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4500,14 +4932,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
* subsys/hierarchy state.
*/
mutex_lock(&cgroup_mutex);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
+
+ for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
- }
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -4525,99 +4955,83 @@ static const struct file_operations proc_cgroupstats_operations = {
};
/**
- * cgroup_fork - attach newly forked task to its parents cgroup.
+ * cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
*
- * Description: A task inherits its parent's cgroup at fork().
- *
- * A pointer to the shared css_set was automatically copied in
- * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU, cgroup_mutex or
- * threadgroup_change_begin(), so it might no longer be a valid
- * cgroup pointer. cgroup_attach_task() might have already changed
- * current->cgroups, allowing the previously referenced cgroup
- * group to be removed and freed.
- *
- * Outside the pointer validity we also need to process the css_set
- * inheritance between threadgoup_change_begin() and
- * threadgoup_change_end(), this way there is no leak in any process
- * wide migration performed by cgroup_attach_proc() that could otherwise
- * miss a thread because it is too early or too late in the fork stage.
- *
- * At the point that cgroup_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
- /*
- * We don't need to task_lock() current because current->cgroups
- * can't be changed concurrently here. The parent obviously hasn't
- * exited and called cgroup_exit(), and we are synchronized against
- * cgroup migration through threadgroup_change_begin().
- */
- child->cgroups = current->cgroups;
- get_css_set(child->cgroups);
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- /*
- * forkexit callbacks are only supported for builtin
- * subsystems, and the builtin section of the subsys array is
- * immutable, so we don't need to lock the subsys array here.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(ss, child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ struct cgroup_subsys *ss;
+ int i;
+
+ /*
+ * This may race against cgroup_enable_task_cg_links(). As that
+ * function sets use_task_css_set_links before grabbing
+ * tasklist_lock and we just went through tasklist_lock to add
+ * @child, it's guaranteed that either we see the set
+ * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+ * @child during its iteration.
+ *
+ * If we won the race, @child is associated with %current's
+ * css_set. Grabbing css_set_rwsem guarantees both that the
+ * association is stable, and, on completion of the parent's
+ * migration, @child is visible in the source of migration or
+ * already in the destination cgroup. This guarantee is necessary
+ * when implementing operations which need to migrate all tasks of
+ * a cgroup to another.
+ *
+ * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * will remain in init_css_set. This is safe because all tasks are
+ * in the init_css_set before cg_links is enabled and there's no
+ * operation which transfers all tasks out of init_css_set.
+ */
if (use_task_css_set_links) {
- write_lock(&css_set_lock);
+ struct css_set *cset;
+
+ down_write(&css_set_rwsem);
+ cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
- /*
- * It's safe to use child->cgroups without task_lock()
- * here because we are protected through
- * threadgroup_change_begin() against concurrent
- * css_set change in cgroup_task_migrate(). Also
- * the task can't exit at that point until
- * wake_up_new_task() is called, so we are protected
- * against cgroup_exit() setting child->cgroup to
- * init_css_set.
- */
- list_add(&child->cg_list, &child->cgroups->tasks);
+ rcu_assign_pointer(child->cgroups, cset);
+ list_add(&child->cg_list, &cset->tasks);
+ get_css_set(cset);
}
- write_unlock(&css_set_lock);
+ up_write(&css_set_rwsem);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for_each_subsys(ss, i)
+ if (ss->fork)
+ ss->fork(child);
}
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -4627,111 +5041,64 @@ void cgroup_post_fork(struct task_struct *child)
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
- * the_top_cgroup_hack:
- *
- * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- * We call cgroup_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to the
- * root cgroup in each hierarchy for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cgroup, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cgroup function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cgroup reference count, to no avail.
- *
- * Normally, holding a reference to a cgroup without bumping its
- * count is unsafe. The cgroup could go away, or someone could
- * attach us to a different cgroup, decrementing the count on
- * the first cgroup that we never incremented. But in this case,
- * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any cgroup_attach_task() attempts, or task is a failed
- * fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
- struct css_set *cg;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ bool put_cset = false;
int i;
/*
- * Unlink from the css_set task list if necessary.
- * Optimistically check cg_list before taking
- * css_set_lock
+ * Unlink from @tsk from its css_set. As migration path can't race
+ * with us, we can check cg_list without grabbing css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_del_init(&tsk->cg_list);
- write_unlock(&css_set_lock);
+ down_write(&css_set_rwsem);
+ list_del_init(&tsk->cg_list);
+ up_write(&css_set_rwsem);
+ put_cset = true;
}
/* Reassign the task to the init_css_set. */
- task_lock(tsk);
- cg = tsk->cgroups;
- tsk->cgroups = &init_css_set;
+ cset = task_css_set(tsk);
+ RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
+ if (need_forkexit_callback) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
if (ss->exit) {
- struct cgroup *old_cgrp =
- rcu_dereference_raw(cg->subsys[i])->cgroup;
- struct cgroup *cgrp = task_cgroup(tsk, i);
- ss->exit(ss, cgrp, old_cgrp, tsk);
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
+
+ ss->exit(css, old_css, tsk);
}
}
}
- task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
- int ret;
- struct cgroup *target;
-
- if (cgrp == dummytop)
- return 1;
-
- target = task_cgroup_from_root(task, cgrp->root);
- while (cgrp != target && cgrp!= cgrp->top_cgroup)
- cgrp = cgrp->parent;
- ret = (cgrp == target);
- return ret;
+ if (put_cset)
+ put_css_set(cset, true);
}
static void check_for_release(struct cgroup *cgrp)
{
- /* All of these checks rely on RCU to keep the cgroup
- * structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
- /* Control Group is currently removeable. If it's not
+ if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
+ !css_has_online_children(&cgrp->self)) {
+ /*
+ * Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
- * it now */
+ * it now
+ */
int need_schedule_work = 0;
+
raw_spin_lock(&release_list_lock);
- if (!cgroup_is_removed(cgrp) &&
+ if (!cgroup_is_dead(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
@@ -4742,25 +5109,6 @@ static void check_for_release(struct cgroup *cgrp)
}
}
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
-{
- struct cgroup *cgrp = css->cgroup;
- int val;
- rcu_read_lock();
- val = atomic_sub_return(count, &css->refcnt);
- if (val == 1) {
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- cgroup_wakeup_rmdir_waiter(cgrp);
- }
- rcu_read_unlock();
- WARN_ON_ONCE(val < 1);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
@@ -4792,16 +5140,17 @@ static void cgroup_release_agent(struct work_struct *work)
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
@@ -4809,7 +5158,7 @@ static void cgroup_release_agent(struct work_struct *work)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
@@ -4835,19 +5184,15 @@ static void cgroup_release_agent(struct work_struct *work)
static int __init cgroup_disable(char *str)
{
- int i;
+ struct cgroup_subsys *ss;
char *token;
+ int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
- /*
- * cgroup_disable, being at boot time, can't know about module
- * subsystems, so we don't worry about them.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
+ for_each_subsys(ss, i) {
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
@@ -4860,285 +5205,62 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-/*
- * Functons for CSS ID.
- */
-
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
-unsigned short css_id(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid;
-
- /*
- * This css_id() can return correct value when somone has refcnt
- * on this or this is under rcu_read_lock(). Once css->id is allocated,
- * it's unchanged until freed.
- */
- cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
-
- if (cssid)
- return cssid->id;
- return 0;
-}
-EXPORT_SYMBOL_GPL(css_id);
-
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid;
-
- cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
-
- if (cssid)
- return cssid->depth;
- return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
-
/**
- * css_is_ancestor - test "root" css is an ancestor of "child"
- * @child: the css to be tested.
- * @root: the css supporsed to be an ancestor of the child.
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
+ * @dentry: directory dentry of interest
+ * @ss: subsystem of interest
*
- * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
- * But, considering usual usage, the csses should be valid objects after test.
- * Assuming that the caller will do some action to the child if this returns
- * returns true, the caller must take "child";s reference count.
- * If "child" is valid object and this returns true, "root" is valid, too.
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
*/
-
-bool css_is_ancestor(struct cgroup_subsys_state *child,
- const struct cgroup_subsys_state *root)
-{
- struct css_id *child_id;
- struct css_id *root_id;
- bool ret = true;
-
- rcu_read_lock();
- child_id = rcu_dereference(child->id);
- root_id = rcu_dereference(root->id);
- if (!child_id
- || !root_id
- || (child_id->depth < root_id->depth)
- || (child_id->stack[root_id->depth] != root_id->id))
- ret = false;
- rcu_read_unlock();
- return ret;
-}
-
-void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
-{
- struct css_id *id = css->id;
- /* When this is called before css_id initialization, id can be NULL */
- if (!id)
- return;
-
- BUG_ON(!ss->use_id);
-
- rcu_assign_pointer(id->css, NULL);
- rcu_assign_pointer(css->id, NULL);
- write_lock(&ss->id_lock);
- idr_remove(&ss->idr, id->id);
- write_unlock(&ss->id_lock);
- kfree_rcu(id, rcu_head);
-}
-EXPORT_SYMBOL_GPL(free_css_id);
-
-/*
- * This is called by init or create(). Then, calls to this function are
- * always serialized (By cgroup_mutex() at create()).
- */
-
-static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
-{
- struct css_id *newid;
- int myid, error, size;
-
- BUG_ON(!ss->use_id);
-
- size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
- newid = kzalloc(size, GFP_KERNEL);
- if (!newid)
- return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
- write_lock(&ss->id_lock);
- /* Don't use 0. allocates an ID of 1-65535 */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
- write_unlock(&ss->id_lock);
-
- /* Returns error when there are no free spaces for new ID.*/
- if (error) {
- error = -ENOSPC;
- goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
-
- newid->id = myid;
- newid->depth = depth;
- return newid;
-remove_idr:
- error = -ENOSPC;
- write_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- write_unlock(&ss->id_lock);
-err_out:
- kfree(newid);
- return ERR_PTR(error);
-
-}
-
-static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *rootcss)
-{
- struct css_id *newid;
-
- rwlock_init(&ss->id_lock);
- idr_init(&ss->idr);
-
- newid = get_new_cssid(ss, 0);
- if (IS_ERR(newid))
- return PTR_ERR(newid);
-
- newid->stack[0] = newid->id;
- newid->css = rootcss;
- rootcss->id = newid;
- return 0;
-}
-
-static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
- struct cgroup *child)
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
{
- int subsys_id, i, depth = 0;
- struct cgroup_subsys_state *parent_css, *child_css;
- struct css_id *child_id, *parent_id;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
+ struct cgroup *cgrp;
- subsys_id = ss->subsys_id;
- parent_css = parent->subsys[subsys_id];
- child_css = child->subsys[subsys_id];
- parent_id = parent_css->id;
- depth = parent_id->depth + 1;
+ /* is @dentry a cgroup dir? */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return ERR_PTR(-EBADF);
- child_id = get_new_cssid(ss, depth);
- if (IS_ERR(child_id))
- return PTR_ERR(child_id);
+ rcu_read_lock();
- for (i = 0; i < depth; i++)
- child_id->stack[i] = parent_id->stack[i];
- child_id->stack[depth] = child_id->id;
/*
- * child_id->css pointer will be set after this cgroup is available
- * see cgroup_populate_dir()
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See cgroup_rmdir() for details.
*/
- rcu_assign_pointer(child_css->id, child_id);
-
- return 0;
-}
-
-/**
- * css_lookup - lookup css by id
- * @ss: cgroup subsys to be looked into.
- * @id: the id
- *
- * Returns pointer to cgroup_subsys_state if there is valid one with id.
- * NULL if not. Should be called under rcu_read_lock()
- */
-struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
-{
- struct css_id *cssid = NULL;
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
- BUG_ON(!ss->use_id);
- cssid = idr_find(&ss->idr, id);
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
- if (unlikely(!cssid))
- return NULL;
-
- return rcu_dereference(cssid->css);
+ rcu_read_unlock();
+ return css;
}
-EXPORT_SYMBOL_GPL(css_lookup);
/**
- * css_get_next - lookup next cgroup under specified hierarchy.
- * @ss: pointer to subsystem
- * @id: current position of iteration.
- * @root: pointer to css. search tree under this.
- * @foundid: position of found object.
+ * css_from_id - lookup css by id
+ * @id: the cgroup id
+ * @ss: cgroup subsys to be looked into
*
- * Search next css under the specified hierarchy of rootid. Calling under
- * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
- */
-struct cgroup_subsys_state *
-css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid)
-{
- struct cgroup_subsys_state *ret = NULL;
- struct css_id *tmp;
- int tmpid;
- int rootid = css_id(root);
- int depth = css_depth(root);
-
- if (!rootid)
- return NULL;
-
- BUG_ON(!ss->use_id);
- /* fill start point for scan */
- tmpid = id;
- while (1) {
- /*
- * scan next entry from bitmap(tree), tmpid is updated after
- * idr_get_next().
- */
- read_lock(&ss->id_lock);
- tmp = idr_get_next(&ss->idr, &tmpid);
- read_unlock(&ss->id_lock);
-
- if (!tmp)
- break;
- if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
- ret = rcu_dereference(tmp->css);
- if (ret) {
- *foundid = tmpid;
- break;
- }
- }
- /* continue to scan from next id */
- tmpid = tmpid + 1;
- }
- return ret;
-}
-
-/*
- * get corresponding css from file open on cgroupfs directory
+ * Returns the css if there's valid one with @id, otherwise returns NULL.
+ * Should be called under rcu_read_lock().
*/
-struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
- struct cgroup *cgrp;
- struct inode *inode;
- struct cgroup_subsys_state *css;
-
- inode = f->f_dentry->d_inode;
- /* check in cgroup filesystem dir */
- if (inode->i_op != &cgroup_dir_inode_operations)
- return ERR_PTR(-EBADF);
-
- if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
- return ERR_PTR(-EINVAL);
-
- /* get cgroup */
- cgrp = __d_cgrp(f->f_dentry);
- css = cgrp->subsys[id];
- return css ? css : ERR_PTR(-ENOENT);
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
}
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
@@ -5148,101 +5270,100 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
return css;
}
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+static void debug_css_free(struct cgroup_subsys_state *css)
{
- return atomic_read(&cont->count);
+ kfree(css);
}
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return cgroup_task_count(cont);
+ return cgroup_task_count(css->cgroup);
}
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
return (u64)(unsigned long)current->cgroups;
}
-static u64 current_css_set_refcount_read(struct cgroup *cont,
- struct cftype *cft)
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
u64 count;
rcu_read_lock();
- count = atomic_read(&current->cgroups->refcount);
+ count = atomic_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
-static int current_css_set_cg_links_read(struct cgroup *cont,
- struct cftype *cft,
- struct seq_file *seq)
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
- read_lock(&css_set_lock);
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ down_read(&css_set_rwsem);
rcu_read_lock();
- cg = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
- const char *name;
- if (c->dentry)
- name = c->dentry->d_name.name;
- else
- name = "?";
+ cgroup_name(c, name_buf, NAME_MAX + 1);
seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name);
+ c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
return 0;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cont,
- struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
- struct cg_cgroup_link *link;
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
- struct css_set *cg = link->cg;
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cg);
- list_for_each_entry(task, &cg->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
- seq_puts(seq, " ...\n");
- break;
- } else {
- seq_printf(seq, " task %d\n",
- task_pid_vnr(task));
- }
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
}
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return 0;
}
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+ return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
}
static struct cftype debug_files[] = {
{
- .name = "cgroup_refcount",
- .read_u64 = cgroup_refcount_read,
- },
- {
.name = "taskcount",
.read_u64 = debug_taskcount_read,
},
@@ -5259,31 +5380,25 @@ static struct cftype debug_files[] = {
{
.name = "current_css_set_cg_links",
- .read_seq_string = current_css_set_cg_links_read,
+ .seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
- .read_seq_string = cgroup_css_links_read,
+ .seq_show = cgroup_css_links_read,
},
{
.name = "releasable",
.read_u64 = releasable_read,
},
-};
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, debug_files,
- ARRAY_SIZE(debug_files));
-}
+ { } /* terminate */
+};
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
- .populate = debug_populate,
- .subsys_id = debug_subsys_id,
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a6..a79e40f9d70 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,115 +21,69 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
-enum freezer_state {
- CGROUP_THAWED = 0,
- CGROUP_FREEZING,
- CGROUP_FROZEN,
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};
struct freezer {
- struct cgroup_subsys_state css;
- enum freezer_state state;
- spinlock_t lock; /* protects _writes_ to state */
+ struct cgroup_subsys_state css;
+ unsigned int state;
};
-static inline struct freezer *cgroup_freezer(
- struct cgroup *cgroup)
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
- return container_of(
- cgroup_subsys_state(cgroup, freezer_subsys_id),
- struct freezer, css);
+ return css ? container_of(css, struct freezer, css) : NULL;
}
static inline struct freezer *task_freezer(struct task_struct *task)
{
- return container_of(task_subsys_state(task, freezer_subsys_id),
- struct freezer, css);
+ return css_freezer(task_css(task, freezer_cgrp_id));
+}
+
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ return css_freezer(freezer->css.parent);
}
bool cgroup_freezing(struct task_struct *task)
{
- enum freezer_state state;
bool ret;
rcu_read_lock();
- state = task_freezer(task)->state;
- ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
}
-/*
- * cgroups_write_string() limits the size of freezer state strings to
- * CGROUP_LOCAL_BUFFER_SIZE
- */
-static const char *freezer_state_strs[] = {
- "THAWED",
- "FREEZING",
- "FROZEN",
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
};
-/*
- * State diagram
- * Transitions are caused by userspace writes to the freezer.state file.
- * The values in parenthesis are state labels. The rest are edge labels.
- *
- * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
- * ^ ^ | |
- * | \_______THAWED_______/ |
- * \__________________________THAWED____________/
- */
-
-struct cgroup_subsys freezer_subsys;
-
-/* Locks taken and their ordering
- * ------------------------------
- * cgroup_mutex (AKA cgroup_lock)
- * freezer->lock
- * css_set_lock
- * task->alloc_lock (AKA task_lock)
- * task->sighand->siglock
- *
- * cgroup code forces css_set_lock to be taken before task->alloc_lock
- *
- * freezer_create(), freezer_destroy():
- * cgroup_mutex [ by cgroup core ]
- *
- * freezer_can_attach():
- * cgroup_mutex (held by caller of can_attach)
- *
- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * freezer->lock
- * sighand->siglock (if the cgroup is freezing)
- *
- * freezer_read():
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- *
- * freezer_write() (freeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock (fake signal delivery inside freeze_task())
- *
- * freezer_write() (unfreeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
- * sighand->siglock
- */
-static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct freezer *freezer;
@@ -137,248 +91,394 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
- freezer->state = CGROUP_THAWED;
return &freezer->css;
}
-static void freezer_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
{
- struct freezer *freezer = cgroup_freezer(cgroup);
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
- if (freezer->state != CGROUP_THAWED)
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
- kfree(freezer);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
}
-/* task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
+static void freezer_css_free(struct cgroup_subsys_state *css)
{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
+ kfree(css_freezer(css));
}
/*
- * The call to cgroup_lock() in the freezer.state write method prevents
- * a write to that file racing against an attach, and hence the
- * can_attach() result will remain valid until the attach completes.
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
*/
-static int freezer_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup,
- struct cgroup_taskset *tset)
+static void freezer_attach(struct cgroup_subsys_state *new_css,
+ struct cgroup_taskset *tset)
{
- struct freezer *freezer;
+ struct freezer *freezer = css_freezer(new_css);
struct task_struct *task;
+ bool clear_frozen = false;
+
+ mutex_lock(&freezer_mutex);
/*
- * Anything frozen can't move or be moved to/from.
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
- cgroup_taskset_for_each(task, new_cgroup, tset)
- if (cgroup_freezing(task))
- return -EBUSY;
+ cgroup_taskset_for_each(task, tset) {
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = true;
+ }
+ }
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state != CGROUP_THAWED)
- return -EBUSY;
+ /* propagate FROZEN clearing upwards */
+ while (clear_frozen && (freezer = parent_freezer(freezer))) {
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = freezer->state & CGROUP_FREEZING;
+ }
- return 0;
+ mutex_unlock(&freezer_mutex);
}
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
/*
- * No lock is needed, since the task isn't on tasklist yet,
- * so it can't be moved to another cgroup, which means the
- * freezer won't be removed and will be valid during this
- * function call. Nevertheless, apply RCU read-side critical
- * section to suppress RCU lockdep false positives.
- */
- rcu_read_lock();
- freezer = task_freezer(task);
- rcu_read_unlock();
-
- /*
- * The root cgroup is non-freezable, so we can skip the
- * following check.
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
*/
- if (!freezer->css.cgroup->parent)
+ if (task_css_is_root(task, freezer_cgrp_id))
return;
- spin_lock_irq(&freezer->lock);
- BUG_ON(freezer->state == CGROUP_FROZEN);
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
- /* Locking avoids race with FREEZING -> THAWED transitions. */
- if (freezer->state == CGROUP_FREEZING)
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
- spin_unlock_irq(&freezer->lock);
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-/*
- * caller must hold freezer->lock
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
*/
-static void update_if_frozen(struct cgroup *cgroup,
- struct freezer *freezer)
+static void update_if_frozen(struct cgroup_subsys_state *css)
{
- struct cgroup_iter it;
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int nfrozen = 0, ntotal = 0;
- enum freezer_state old_state = freezer->state;
-
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- ntotal++;
- if (freezing(task) && is_task_frozen_enough(task))
- nfrozen++;
+
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
}
+ rcu_read_unlock();
- if (old_state == CGROUP_THAWED) {
- BUG_ON(nfrozen > 0);
- } else if (old_state == CGROUP_FREEZING) {
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- } else { /* old_state == CGROUP_FROZEN */
- BUG_ON(nfrozen != ntotal);
+ /* are all tasks frozen? */
+ css_task_iter_start(css, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
}
- cgroup_iter_end(cgroup, &it);
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
}
-static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
+static int freezer_read(struct seq_file *m, void *v)
{
- struct freezer *freezer;
- enum freezer_state state;
-
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
-
- freezer = cgroup_freezer(cgroup);
- spin_lock_irq(&freezer->lock);
- state = freezer->state;
- if (state == CGROUP_FREEZING) {
- /* We change from FREEZING to FROZEN lazily if the cgroup was
- * only partially frozen when we exitted write. */
- update_if_frozen(cgroup, freezer);
- state = freezer->state;
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
}
- spin_unlock_irq(&freezer->lock);
- cgroup_unlock();
- seq_puts(m, freezer_state_strs[state]);
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
return 0;
}
-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int num_cant_freeze_now = 0;
-
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task))
- continue;
- if (is_task_frozen_enough(task))
- continue;
- if (!freezing(task) && !freezer_should_skip(task))
- num_cant_freeze_now++;
- }
- cgroup_iter_end(cgroup, &it);
- return num_cant_freeze_now ? -EBUSY : 0;
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
}
-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it)))
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
__thaw_task(task);
- cgroup_iter_end(cgroup, &it);
+ css_task_iter_end(&it);
}
-static int freezer_change_state(struct cgroup *cgroup,
- enum freezer_state goal_state)
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
{
- struct freezer *freezer;
- int retval = 0;
-
- freezer = cgroup_freezer(cgroup);
-
- spin_lock_irq(&freezer->lock);
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
- update_if_frozen(cgroup, freezer);
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
- switch (goal_state) {
- case CGROUP_THAWED:
- if (freezer->state != CGROUP_THAWED)
- atomic_dec(&system_freezing_cnt);
- freezer->state = CGROUP_THAWED;
- unfreeze_cgroup(cgroup, freezer);
- break;
- case CGROUP_FROZEN:
- if (freezer->state == CGROUP_THAWED)
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
atomic_inc(&system_freezing_cnt);
- freezer->state = CGROUP_FREEZING;
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- default:
- BUG();
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
}
+}
- spin_unlock_irq(&freezer->lock);
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup_subsys_state *pos;
+
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
- return retval;
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-static int freezer_write(struct cgroup *cgroup,
- struct cftype *cft,
- const char *buffer)
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval;
- enum freezer_state goal_state;
+ bool freeze;
+
+ buf = strstrip(buf);
- if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
- goal_state = CGROUP_THAWED;
- else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
- goal_state = CGROUP_FROZEN;
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
else
return -EINVAL;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
- retval = freezer_change_state(cgroup, goal_state);
- cgroup_unlock();
- return retval;
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
static struct cftype files[] = {
{
.name = "state",
- .read_seq_string = freezer_read,
- .write_string = freezer_write,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
},
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
};
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
- if (!cgroup->parent)
- return 0;
- return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys freezer_subsys = {
- .name = "freezer",
- .create = freezer_create,
- .destroy = freezer_destroy,
- .populate = freezer_populate,
- .subsys_id = freezer_subsys_id,
- .can_attach = freezer_can_attach,
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
.fork = freezer_fork,
+ .base_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index f346cedfe24..633394f442f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -30,29 +30,6 @@
#include <asm/uaccess.h>
-/*
- * Note that the native side is already converted to a timespec, because
- * that's what we want anyway.
- */
-static int compat_get_timeval(struct timespec *o,
- struct compat_timeval __user *i)
-{
- long usec;
-
- if (get_user(o->tv_sec, &i->tv_sec) ||
- get_user(usec, &i->tv_usec))
- return -EFAULT;
- o->tv_nsec = usec * 1000;
- return 0;
-}
-
-static int compat_put_timeval(struct compat_timeval __user *o,
- struct timeval *i)
-{
- return (put_user(i->tv_sec, &o->tv_sec) ||
- put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
-}
-
static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
memset(txc, 0, sizeof(struct timex));
@@ -111,13 +88,13 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
return 0;
}
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
- if (compat_put_timeval(tv, &ktv))
+ if (compat_put_timeval(&ktv, tv))
return -EFAULT;
}
if (tz) {
@@ -128,38 +105,113 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
return 0;
}
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
- struct timespec kts;
- struct timezone ktz;
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
if (tv) {
- if (compat_get_timeval(&kts, tv))
+ if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ __get_user(tv->tv_sec, &ctv->tv_sec) ||
+ __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ __put_user(tv->tv_sec, &ctv->tv_sec) ||
+ __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-EXPORT_SYMBOL_GPL(put_compat_timespec);
+
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_get_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_put_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
+int compat_convert_timespec(struct timespec __user **kts,
+ const void __user *cts)
+{
+ struct timespec ts;
+ struct timespec __user *uts;
+
+ if (!cts || COMPAT_USE_64BIT_TIME) {
+ *kts = (struct timespec __user *)cts;
+ return 0;
+ }
+
+ uts = compat_alloc_user_space(sizeof(ts));
+ if (!uts)
+ return -EFAULT;
+ if (compat_get_timespec(&ts, cts))
+ return -EFAULT;
+ if (copy_to_user(uts, &ts, sizeof(ts)))
+ return -EFAULT;
+
+ *kts = uts;
+ return 0;
+}
static long compat_nanosleep_restart(struct restart_block *restart)
{
@@ -177,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (ret) {
rmtp = restart->nanosleep.compat_rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
return ret;
}
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
mm_segment_t oldfs;
long ret;
- if (get_compat_timespec(&tu, rqtp))
+ if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
if (!timespec_valid(&tu))
@@ -211,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
@@ -238,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -250,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
@@ -276,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
}
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
if (tbuf) {
struct tms tms;
@@ -302,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
* types that can be passed to put_user()/get_user().
*/
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
{
old_sigset_t s;
long ret;
@@ -320,31 +372,60 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ memcpy(blocked->sig, &set, sizeof(set));
+}
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
#endif
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
@@ -362,15 +443,15 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
#ifdef COMPAT_RLIM_OLD_INFINITY
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, &r);
+ ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
set_fs(old_fs);
if (!ret) {
@@ -389,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
#endif
-asmlinkage long compat_sys_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
@@ -435,28 +516,11 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
-{
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
-}
-
-asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
- struct compat_rusage __user *ru)
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
@@ -483,9 +547,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
}
}
-asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
- struct compat_siginfo __user *uinfo, int options,
- struct compat_rusage __user *uru)
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
{
siginfo_t info;
struct rusage ru;
@@ -503,9 +568,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
return ret;
if (uru) {
- ret = put_compat_rusage(&ru, uru);
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -527,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
return compat_get_bitmap(k, user_mask_ptr, len * 8);
}
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
- unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+ unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
@@ -547,8 +616,8 @@ out:
return retval;
}
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
@@ -578,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
int get_compat_itimerspec(struct itimerspec *dst,
const struct compat_itimerspec __user *src)
{
- if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
- get_compat_timespec(&dst->it_value, &src->it_value))
+ if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
+ __compat_get_timespec(&dst->it_value, &src->it_value))
return -EFAULT;
return 0;
}
@@ -587,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst,
int put_compat_itimerspec(struct compat_itimerspec __user *dst,
const struct itimerspec *src)
{
- if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
- put_compat_timespec(&src->it_value, &dst->it_value))
+ if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
+ __compat_put_timespec(&src->it_value, &dst->it_value))
return -EFAULT;
return 0;
}
-long compat_sys_timer_create(clockid_t which_clock,
- struct compat_sigevent __user *timer_event_spec,
- timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+ struct compat_sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
{
struct sigevent __user *event = NULL;
@@ -611,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
return sys_timer_create(which_clock, event, created_timer_id);
}
-long compat_sys_timer_settime(timer_t timer_id, int flags,
- struct compat_itimerspec __user *new,
- struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ struct compat_itimerspec __user *, new,
+ struct compat_itimerspec __user *, old)
{
long err;
mm_segment_t oldfs;
@@ -634,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
return err;
}
-long compat_sys_timer_gettime(timer_t timer_id,
- struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct compat_itimerspec __user *, setting)
{
long err;
mm_segment_t oldfs;
@@ -651,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id,
return err;
}
-long compat_sys_clock_settime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
- if (get_compat_timespec(&ts, tp))
+ if (compat_get_timespec(&ts, tp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
@@ -668,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
return err;
}
-long compat_sys_clock_gettime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -680,13 +749,13 @@ long compat_sys_clock_gettime(clockid_t which_clock,
err = sys_clock_gettime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && put_compat_timespec(&ts, tp))
+ if (!err && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
-long compat_sys_clock_adjtime(clockid_t which_clock,
- struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+ struct compat_timex __user *, utp)
{
struct timex txc;
mm_segment_t oldfs;
@@ -708,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock,
return ret;
}
-long compat_sys_clock_getres(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -720,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock,
err = sys_clock_getres(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && tp && put_compat_timespec(&ts, tp))
+ if (!err && tp && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
@@ -730,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
long err;
mm_segment_t oldfs;
struct timespec tu;
- struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+ struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
restart->nanosleep.rmtp = (struct timespec __user *) &tu;
oldfs = get_fs();
@@ -739,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&tu, rmtp))
+ compat_put_timespec(&tu, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -749,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
return err;
}
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
- struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+ struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
long err;
mm_segment_t oldfs;
struct timespec in, out;
struct restart_block *restart;
- if (get_compat_timespec(&in, rqtp))
+ if (compat_get_timespec(&in, rqtp))
return -EFAULT;
oldfs = get_fs();
@@ -769,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&out, rmtp))
+ compat_put_timespec(&out, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -883,7 +952,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -894,10 +963,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
}
EXPORT_SYMBOL_GPL(sigset_from_compat);
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
@@ -913,7 +992,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
sigset_from_compat(&s, &s32);
if (uts) {
- if (get_compat_timespec(&t, uts))
+ if (compat_get_timespec(&t, uts))
return -EFAULT;
}
@@ -925,25 +1004,13 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
return ret;
-
-}
-
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
- struct compat_siginfo __user *uinfo)
-{
- siginfo_t info;
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
{
compat_time_t i;
struct timeval tv;
@@ -959,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
return i;
}
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
struct timespec tv;
int err;
@@ -979,32 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
- current->saved_sigmask = current->blocked;
- set_current_blocked(&newset);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
{
struct timex txc;
int err, ret;
@@ -1023,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
}
#ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
- compat_uptr_t __user *pages32,
- const int __user *nodes,
- int __user *status,
- int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
{
const void __user * __user *pages;
int i;
@@ -1043,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
- compat_ulong_t maxnode,
- const compat_ulong_t __user *old_nodes,
- const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
@@ -1077,69 +1119,20 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
{
- struct sysinfo s;
-
- do_sysinfo(&s);
-
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
-
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
-
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs();
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
+ set_fs(KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
+ set_fs(old_fs);
+ if (compat_put_timespec(&t, interval))
return -EFAULT;
-
- return 0;
+ return ret;
}
/*
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075ee..c18b1f1ae51 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- entry->size = kernel_config_data_size;
+ proc_set_size(entry, kernel_config_data_size);
return 0;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 00000000000..5664985c46a
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,216 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+}
+
+/**
+ * context_tracking_user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
+void context_tracking_user_enter(void)
+{
+ unsigned long flags;
+
+ /*
+ * Repeat the user_enter() check here because some archs may be calling
+ * this from asm and if no CPU needs context tracking, they shouldn't
+ * go further. Repeat the check here until they support the inline static
+ * key check.
+ */
+ if (!context_tracking_is_enabled())
+ return;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ /* Kernel threads aren't supposed to go to userspace */
+ WARN_ON_ONCE(!current->mm);
+
+ local_irq_save(flags);
+ if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ trace_user_enter(0);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
+ rcu_user_enter();
+ }
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ __this_cpu_write(context_tracking.state, IN_USER);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
+
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ /*
+ * Need to disable preemption in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ preempt_disable_notrace();
+ prev_ctx = exception_enter();
+ preempt_enable_no_resched_notrace();
+
+ preempt_schedule();
+
+ preempt_disable_notrace();
+ exception_exit(prev_ctx);
+ preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
+
+/**
+ * context_tracking_user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially include
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
+void context_tracking_user_exit(void)
+{
+ unsigned long flags;
+
+ if (!context_tracking_is_enabled())
+ return;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
+ rcu_user_exit();
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
+
+/**
+ * __context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
+void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ context_tracking_cpu_set(cpu);
+}
+#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e5702..a343bde710b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,30 +10,42 @@
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <trace/events/power.h>
+
+#include "smpboot.h"
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_done);
static RAW_NOTIFIER_HEAD(cpu_chain);
@@ -52,17 +64,30 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
};
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
+ cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
@@ -75,9 +100,14 @@ void put_online_cpus(void)
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
+
+ if (WARN_ON(!cpu_hotplug.refcount))
+ cpu_hotplug.refcount++; /* try to fix things up */
+
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -104,10 +134,11 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
* get_online_cpus() not an api which is called all that often.
*
*/
-static void cpu_hotplug_begin(void)
+void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
@@ -118,16 +149,35 @@ static void cpu_hotplug_begin(void)
}
}
-static void cpu_hotplug_done(void)
+void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
-#else /* #if CONFIG_HOTPLUG_CPU */
-static void cpu_hotplug_begin(void) {}
-static void cpu_hotplug_done(void) {}
-#endif /* #else #if CONFIG_HOTPLUG_CPU */
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -139,6 +189,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
int *nr_calls)
{
@@ -162,6 +217,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
@@ -171,16 +227,64 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+ struct task_struct *p;
+
+ /*
+ * This function is called after the cpu is taken down and marked
+ * offline, so its not like new tasks will ever get this cpu set in
+ * their mm mask. -- Peter Zijlstra
+ * Thus, we may use rcu_read_lock() here, instead of grabbing
+ * full-fledged tasklist_lock.
+ */
+ WARN_ON(cpu_online(cpu));
+ rcu_read_lock();
+ for_each_process(p) {
+ struct task_struct *t;
+
+ /*
+ * Main thread might exit, but other threads may still have
+ * a valid mm. Find one.
+ */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+}
+
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
+ task_cputime(p, &utime, &stime);
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (p->utime || p->stime))
- printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
- "(state = %ld, flags = %x)\n",
+ (utime || stime))
+ pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
p->state, p->flags);
}
@@ -204,6 +308,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
@@ -230,16 +336,37 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (err) {
nr_calls--;
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
- printk("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
goto out_release;
}
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so explicitly call both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+#ifdef CONFIG_PREEMPT
+ synchronize_sched();
+#endif
+ synchronize_rcu();
+
+ smpboot_park_threads(cpu);
+
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
+
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
+ smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -290,81 +417,75 @@ EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-
- if (cpu_online(cpu) || !cpu_present(cpu))
- return -EINVAL;
+ struct task_struct *idle;
cpu_hotplug_begin();
+
+ if (cpu_online(cpu) || !cpu_present(cpu)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
/* Arch-specific enabling code. */
- ret = __cpu_up(cpu);
+ ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
+
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
cpu_hotplug_done();
return ret;
}
-int __cpuinit cpu_up(unsigned int cpu)
+int cpu_up(unsigned int cpu)
{
int err = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG
- int nid;
- pg_data_t *pgdat;
-#endif
-
if (!cpu_possible(cpu)) {
- printk(KERN_ERR "can't online cpu %d because it is not "
- "configured as may-hotadd at boot time\n", cpu);
+ pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+ cpu);
#if defined(CONFIG_IA64)
- printk(KERN_ERR "please check additional_cpus= boot "
- "parameter\n");
+ pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
- nid = cpu_to_node(cpu);
- if (!node_online(nid)) {
- err = mem_online_node(nid);
- if (err)
- return err;
- }
-
- pgdat = NODE_DATA(nid);
- if (!pgdat) {
- printk(KERN_ERR
- "Can't online cpu %d due to NULL pgdat\n", cpu);
- return -ENOMEM;
- }
-
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
- mutex_unlock(&zonelists_mutex);
- }
-#endif
+ err = try_online_node(cpu_to_node(cpu));
+ if (err)
+ return err;
cpu_maps_update_begin();
@@ -384,14 +505,6 @@ EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-void __weak arch_disable_nonboot_cpus_begin(void)
-{
-}
-
-void __weak arch_disable_nonboot_cpus_end(void)
-{
-}
-
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
@@ -403,30 +516,28 @@ int disable_nonboot_cpus(void)
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
- arch_disable_nonboot_cpus_begin();
- printk("Disabling non-boot CPUs ...\n");
+ pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
- printk(KERN_ERR "Error taking CPU%d down: %d\n",
- cpu, error);
+ pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
- arch_disable_nonboot_cpus_end();
-
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
- printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+ pr_err("Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
return error;
@@ -450,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk(KERN_INFO "Enabling non-boot CPUs ...\n");
+ pr_info("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
- printk(KERN_INFO "CPU%d is up\n", cpu);
+ pr_info("CPU%d is up\n", cpu);
continue;
}
- printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+ pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
arch_enable_nonboot_cpus_end();
@@ -479,36 +592,6 @@ static int __init alloc_frozen_cpus(void)
core_initcall(alloc_frozen_cpus);
/*
- * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
- * hotplug when tasks are about to be frozen. Also, don't allow the freezer
- * to continue until any currently running CPU hotplug operation gets
- * completed.
- * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
- * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
- * CPU hotplug path and released only after it is complete. Thus, we
- * (and hence the freezer) will block here until any currently running CPU
- * hotplug operation gets completed.
- */
-void cpu_hotplug_disable_before_freeze(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
- cpu_maps_update_done();
-}
-
-
-/*
- * When tasks have been thawed, re-enable regular CPU hotplug (which had been
- * disabled while beginning to freeze tasks).
- */
-void cpu_hotplug_enable_after_thaw(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
- cpu_maps_update_done();
-}
-
-/*
* When callbacks for CPU hotplug notifications are being executed, we must
* ensure that the state of the system with respect to the tasks being frozen
* or not, as reported by the notification, remains unchanged *throughout the
@@ -527,12 +610,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
- cpu_hotplug_disable_before_freeze();
+ cpu_hotplug_disable();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
- cpu_hotplug_enable_after_thaw();
+ cpu_hotplug_enable();
break;
default:
@@ -545,6 +628,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
static int __init cpu_hotplug_pm_sync_init(void)
{
+ /*
+ * cpu_hotplug_pm_callback has higher priority than x86
+ * bsp_pm_callback which depends on cpu_hotplug_pm_callback
+ * to disable cpu hotplug to avoid cpu hotplug race.
+ */
pm_notifier(cpu_hotplug_pm_callback, 0);
return 0;
}
@@ -560,7 +648,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-void __cpuinit notify_cpu_starting(unsigned int cpu)
+void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
@@ -640,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 249152e1530..9656a3c3650 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
/**
- * cpm_pm_enter - CPU low power entry notifier
+ * cpu_pm_enter - CPU low power entry notifier
*
* Notifies listeners that a single CPU is entering a low power state that may
* cause some blocks in the same power domain as the cpu to reset.
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
* Must be called on the affected CPU with interrupts disabled. Platform is
* responsible for ensuring that cpu_pm_enter is not called twice on the same
* CPU before cpu_pm_exit is called. Notified drivers can include VFP
- * co-processor, interrupt controller and it's PM extensions, local CPU
+ * co-processor, interrupt controller and its PM extensions, local CPU
* timers context save/restore which shouldn't be interrupted. Hence it
* must be called with interrupts disabled.
*
@@ -115,13 +115,13 @@ int cpu_pm_enter(void)
EXPORT_SYMBOL_GPL(cpu_pm_enter);
/**
- * cpm_pm_exit - CPU low power exit notifier
+ * cpu_pm_exit - CPU low power exit notifier
*
* Notifies listeners that a single CPU is exiting a low power state that may
* have caused some blocks in the same power domain as the cpu to reset.
*
* Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
@@ -139,7 +139,7 @@ int cpu_pm_exit(void)
EXPORT_SYMBOL_GPL(cpu_pm_exit);
/**
- * cpm_cluster_pm_enter - CPU cluster low power entry notifier
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
*
* Notifies listeners that all cpus in a power domain are entering a low power
* state that may cause some blocks in the same power domain to reset.
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
* Must be called after cpu_pm_enter has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Must be called with interrupts disabled.
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void)
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
/**
- * cpm_cluster_pm_exit - CPU cluster low power exit notifier
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
*
* Notifies listeners that all cpus in a power domain are exiting form a
* low power state that may have caused some blocks in the same power domain
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
* Must be called after cpu_pm_exit has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b9a66..116a4164720 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,25 +59,9 @@
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/wait.h>
-/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
-
-/* Forward declare cgroup structures */
-struct cgroup_subsys cpuset_subsys;
-struct cpuset;
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -95,32 +79,47 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
-
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
};
-/* Retrieve the cpuset for a cgroup */
-static inline struct cpuset *cgroup_cs(struct cgroup *cont)
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
- struct cpuset, css);
+ return css ? container_of(css, struct cpuset, css) : NULL;
}
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
- return container_of(task_subsys_state(task, cpuset_subsys_id),
- struct cpuset, css);
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
}
#ifdef CONFIG_NUMA
@@ -138,6 +137,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -148,6 +148,11 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -184,27 +189,53 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -226,18 +257,16 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers. They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
+ * CPU / memory hotplug is handled asynchronously.
*/
-#define CPUSET_NAME_LEN (128)
-#define CPUSET_NODELIST_LEN (256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
* This is ugly, but preserves the userspace API for existing cpuset
@@ -268,59 +297,43 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. If we get
- * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_cpus(const struct cpuset *cs,
- struct cpumask *pmask)
+static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
- if (cs)
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
- else
- cpumask_copy(pmask, cpu_online_mask);
- BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
+ while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ cs = parent_cs(cs);
+ cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
}
/*
* Return in *pmask the portion of a cpusets's mems_allowed that
* are online, with memory. If none are online with memory, walk
* up the cpuset hierarchy until we find one that does have some
- * online mems. If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_HIGH_MEMORY].
+ * online mems. The top cpuset always has some mems online.
*
* One way or another, we guarantee to return some non-empty subset
- * of node_states[N_HIGH_MEMORY].
+ * of node_states[N_MEMORY].
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (cs && !nodes_intersects(cs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- cs = cs->parent;
- if (cs)
- nodes_and(*pmask, cs->mems_allowed,
- node_states[N_HIGH_MEMORY]);
- else
- *pmask = node_states[N_HIGH_MEMORY];
- BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+ while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
}
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -340,7 +353,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -355,7 +368,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
-static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
+static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
@@ -389,7 +402,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -402,52 +415,66 @@ static void free_trial_cpuset(struct cpuset *trial)
* Return 0 if valid, -errno if not.
*/
-static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
- struct cgroup *cont;
+ struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
}
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -468,31 +495,27 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs)
+ continue;
- if (cpumask_empty(cp->cpus_allowed))
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -501,7 +524,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* This function builds a partial partition of the systems CPUs
* A 'partial partition' is a set of non-overlapping subsets whose
* union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched.c
+ * The output of this function needs to be passed to kernel/sched/core.c
* partition_sched_domains() routine, which will rebuild the scheduler's
* load balancing domains (sched domains) as specified by that partial
* partition.
@@ -514,7 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -530,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* is a subset of one of these domains, while there are as
* many such domains as possible, each as small as possible.
* doms - Conversion of 'csa' to an array of cpumasks, for passing to
- * the kernel/sched.c routine partition_sched_domains() in a
+ * the kernel/sched/core.c routine partition_sched_domains() in a
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
@@ -552,7 +575,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -561,6 +583,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
doms = NULL;
dattr = NULL;
@@ -583,38 +606,34 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
}
- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
continue;
-
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -672,11 +691,8 @@ restart:
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
+ pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
+ nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
@@ -719,155 +735,163 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
+ /*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+ if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ goto out;
+
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
+out:
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
+#endif /* CONFIG_SMP */
-static int generate_sched_domains(cpumask_var_t **domains,
- struct sched_domain_attr **attributes)
+void rebuild_sched_domains(void)
{
- *domains = NULL;
- return 1;
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
-#endif /* CONFIG_SMP */
-
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
+ * @cs: the cpuset in interest
*
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
+ * A cpuset's effective cpumask is the cpumask of the nearest ancestor
+ * with non-empty cpus. We use effective cpumask whenever:
+ * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
+ * if the cpuset they reside in has no cpus)
+ * - we want to retrieve task_cs(tsk)'s cpus_allowed.
*
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
+ * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
+ * exception. See comments there.
*/
-static void async_rebuild_sched_domains(void)
+static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
+ while (cpumask_empty(cs->cpus_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
+ * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
+ * @cs: the cpuset in interest
*
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
-void rebuild_sched_domains(void)
-{
- do_rebuild_sched_domains(NULL);
-}
-
-/**
- * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Called for each task in a cgroup by cgroup_scan_tasks().
- * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
- * words, if its mask is not equal to its cpuset's mask).
+ * A cpuset's effective nodemask is the nodemask of the nearest ancestor
+ * with non-empty memss. We use effective nodemask whenever:
+ * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
+ * if the cpuset they reside in has no mems)
+ * - we want to retrieve task_cs(tsk)'s mems_allowed.
+ *
+ * Called with cpuset_mutex held.
*/
-static int cpuset_test_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
{
- return !cpumask_equal(&tsk->cpus_allowed,
- (cgroup_cs(scan->cg))->cpus_allowed);
+ while (nodes_empty(cs->mems_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/**
- * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner containing the cgroup of the task
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup whose
- * cpus_allowed mask needs to be changed.
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void cpuset_change_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static void update_tasks_cpumask(struct cpuset *cs)
{
- set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ css_task_iter_end(&it);
}
-/**
- * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
- * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
- *
- * Called with cgroup_mutex held
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
*
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Called with cpuset_mutex held
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
{
- struct cgroup_scanner scan;
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
- scan.cg = cs->css.cgroup;
- scan.test_task = cpuset_test_cpumask;
- scan.process_task = cpuset_change_cpumask;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ update_tasks_cpumask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
}
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
+ * @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- struct ptr_heap heap;
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -887,16 +911,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
return -EINVAL;
}
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
return retval;
is_load_balanced = is_sched_load_balance(trialcs);
@@ -905,16 +926,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- /*
- * Scan tasks in the cpuset, and update the cpumasks of any
- * that need an update.
- */
- update_tasks_cpumask(cs, &heap);
-
- heap_free(&heap);
+ update_tasks_cpumask_hier(cs, true);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -926,12 +941,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -942,12 +951,16 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
+ struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ rcu_read_unlock();
}
/*
@@ -964,7 +977,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
-repeat:
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
@@ -977,83 +989,30 @@ repeat:
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
- * get_mems_allowed(). If at least one node remains unchanged and
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
* tsk does not have a mempolicy, then an empty nodemask will not be
* possible when mems_allowed is larger than a word.
*/
need_loop = task_has_mempolicy(tsk) ||
!nodes_intersects(*newmems, tsk->mems_allowed);
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- /*
- * ensure checking ->mems_allowed_change_disable after setting all new
- * allowed nodes.
- *
- * the read-side task can see an nodemask with new allowed nodes and
- * old allowed nodes. and if it allocates page when cpuset clears newly
- * disallowed ones continuous, it can see the new allowed bits.
- *
- * And if setting all new allowed nodes is after the checking, setting
- * all new allowed nodes and clearing newly disallowed ones will be done
- * continuous, and the read-side task may find no node to alloc page.
- */
- smp_mb();
- /*
- * Allocation of memory is very fast, we needn't sleep when waiting
- * for the read-side.
- */
- while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
- task_unlock(tsk);
- if (!task_curr(tsk))
- yield();
- goto repeat;
+ if (need_loop) {
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
}
- /*
- * ensure checking ->mems_allowed_change_disable before clearing all new
- * disallowed nodes.
- *
- * if clearing newly disallowed bits before the checking, the read-side
- * task may find no node to alloc page.
- */
- smp_mb();
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
- task_unlock(tsk);
-}
-
-/*
- * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
- * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
- */
-static void cpuset_change_nodemask(struct task_struct *p,
- struct cgroup_scanner *scan)
-{
- struct mm_struct *mm;
- struct cpuset *cs;
- int migrate;
- const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
- cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, &newmems);
-
- cpuset_change_task_nodemask(p, &newmems);
-
- mm = get_task_mm(p);
- if (!mm)
- return;
-
- migrate = is_memory_migrate(cs);
+ if (need_loop) {
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
- mmput(mm);
+ task_unlock(tsk);
}
static void *cpuset_being_rebound;
@@ -1061,43 +1020,102 @@ static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @oldmem: old mems_allowed of cpuset cs
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
- struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- scan.cg = cs->css.cgroup;
- scan.test_task = NULL;
- scan.process_task = cpuset_change_nodemask;
- scan.heap = heap;
- scan.data = (nodemask_t *)oldmem;
+ guarantee_online_mems(mems_cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- cgroup_scan_tasks(&scan);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
cpuset_being_rebound = NULL;
}
/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!nodes_empty(cp->mems_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
* cpusets mems_allowed, and for each task in the cpuset,
@@ -1105,7 +1123,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1113,15 +1131,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
int retval;
- struct ptr_heap heap;
-
- if (!oldmem)
- return -ENOMEM;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+ * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
* it's read-only
*/
if (cs == &top_cpuset) {
@@ -1143,13 +1156,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_HIGH_MEMORY])) {
+ node_states[N_MEMORY])) {
retval = -EINVAL;
goto done;
}
}
- *oldmem = cs->mems_allowed;
- if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
@@ -1157,25 +1170,24 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval < 0)
- goto done;
-
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, oldmem, &heap);
-
- heap_free(&heap);
+ update_tasks_nodemask_hier(cs, true);
done:
- NODEMASK_FREE(oldmem);
return retval;
}
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1189,50 +1201,29 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
}
-/*
- * cpuset_change_flag - make a task's spread flags the same as its cpuset's
- * @tsk: task to be updated
- * @scan: struct cgroup_scanner containing the cgroup of the task
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
- */
-static void cpuset_change_flag(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
-}
-
-/*
+/**
* update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
- *
- * Called with cgroup_mutex held
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- *
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
*/
-static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_flags(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ struct css_task_iter it;
+ struct task_struct *task;
- scan.cg = cs->css.cgroup;
- scan.test_task = NULL;
- scan.process_task = cpuset_change_flag;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
}
/*
@@ -1241,7 +1232,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1250,7 +1241,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
- struct ptr_heap heap;
int err;
trialcs = alloc_trial_cpuset(cs);
@@ -1266,10 +1256,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
- err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (err < 0)
- goto out;
-
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
@@ -1281,11 +1267,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
- update_tasks_flags(cs, &heap);
- heap_free(&heap);
+ update_tasks_flags(cs);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1389,64 +1374,98 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
+static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
struct task_struct *task;
int ret;
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+
+ mutex_lock(&cpuset_mutex);
+
+ /*
+ * We allow to move tasks into an empty cpuset if sane_behavior
+ * flag is set.
+ */
+ ret = -ENOSPC;
+ if (!cgroup_sane_behavior(css->cgroup) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ goto out_unlock;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, tset) {
/*
- * Kthreads bound to specific cpus cannot be moved to a new
- * cpuset; we cannot change their cpu affinity and
- * isolating such threads by their set of allowed nodes is
- * unnecessary. Thus, cpusets are not applicable for such
- * threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before
- * cpus_allowed may be changed.
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
*/
- if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ ret = -EINVAL;
+ if (task->flags & PF_NO_SETAFFINITY)
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
- return 0;
+static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ css_cs(css)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
+static void cpuset_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
+ /* static buf protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
- struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
- struct cpuset *cs = cgroup_cs(cgrp);
- struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
- cgroup_taskset_for_each(task, cgrp, tset) {
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cpus_cs, cpus_attach);
+
+ guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+
+ cgroup_taskset_for_each(task, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -1461,16 +1480,34 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_from = oldcs->mems_allowed;
cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(leader);
if (mm) {
+ struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
+
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
- if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+
+ /*
+ * old_mems_allowed is the same with mems_allowed here, except
+ * if this task is being moved automatically due to hotplug.
+ * In that case @mems_allowed has been updated and is empty,
+ * so @old_mems_allowed is the right nodesets that we migrate
+ * mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
+ }
mmput(mm);
}
+
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1490,14 +1527,18 @@ typedef enum {
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
-static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = 0;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1531,18 +1572,21 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
-static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1552,30 +1596,57 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ buf = strstrip(buf);
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
+ */
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ flush_work(&cpuset_hotplug_work);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
- switch (cft->private) {
+ switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
@@ -1588,9 +1659,11 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
- return retval;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ return retval ?: nbytes;
}
/*
@@ -1600,72 +1673,46 @@ out:
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
- * A single large read to a buffer that crosses a page boundary is
- * ok, because the result being copied to user land is not recomputed
- * across a page fault.
*/
-
-static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
- size_t count;
-
- mutex_lock(&callback_mutex);
- count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
-
- return count;
-}
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ ssize_t count;
+ char *buf, *s;
+ int ret = 0;
-static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
-{
- size_t count;
+ count = seq_get_buf(sf, &buf);
+ s = buf;
mutex_lock(&callback_mutex);
- count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
- mutex_unlock(&callback_mutex);
-
- return count;
-}
-
-static ssize_t cpuset_common_file_read(struct cgroup *cont,
- struct cftype *cft,
- struct file *file,
- char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct cpuset *cs = cgroup_cs(cont);
- cpuset_filetype_t type = cft->private;
- char *page;
- ssize_t retval = 0;
- char *s;
-
- if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
- return -ENOMEM;
-
- s = page;
switch (type) {
case FILE_CPULIST:
- s += cpuset_sprintf_cpulist(s, cs);
+ s += cpulist_scnprintf(s, count, cs->cpus_allowed);
break;
case FILE_MEMLIST:
- s += cpuset_sprintf_memlist(s, cs);
+ s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
default:
- retval = -EINVAL;
- goto out;
+ ret = -EINVAL;
+ goto out_unlock;
}
- *s++ = '\n';
- retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
- free_page((unsigned long)page);
- return retval;
+ if (s < buf + count - 1) {
+ *s++ = '\n';
+ seq_commit(sf, s - buf);
+ } else {
+ seq_commit(sf, -1);
+ }
+out_unlock:
+ mutex_unlock(&callback_mutex);
+ return ret;
}
-static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1694,9 +1741,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
return 0;
}
-static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1717,16 +1764,16 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
static struct cftype files[] = {
{
.name = "cpus",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
@@ -1794,85 +1841,32 @@ static struct cftype files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
- .name = "memory_pressure_enabled",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
-};
-
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- int err;
-
- err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
- if (err)
- return err;
- /* memory_pressure_enabled is in root cpuset only */
- if (!cont->parent)
- err = cgroup_add_file(cont, ss,
- &cft_memory_pressure_enabled);
- return err;
-}
-
-/*
- * post_clone() is called during cgroup_create() when the
- * clone_children mount argument was specified. The cgroup
- * can not yet have any tasks.
- *
- * Currently we refuse to set up the cgroup - thereby
- * refusing the task to be entered, and as a result refusing
- * the sys_unshare() or clone() which initiated it - if any
- * sibling cpusets have exclusive cpus or mem.
- *
- * If this becomes a problem for some users who wish to
- * allow that scenario, then cpuset_post_clone() could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
- * held.
- */
-static void cpuset_post_clone(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct cgroup *parent, *child;
- struct cpuset *cs, *parent_cs;
- parent = cgroup->parent;
- list_for_each_entry(child, &parent->children, sibling) {
- cs = cgroup_cs(child);
- if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
- return;
- }
- cs = cgroup_cs(cgroup);
- parent_cs = cgroup_cs(parent);
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
- mutex_lock(&callback_mutex);
- cs->mems_allowed = parent_cs->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
- return;
-}
+ { } /* terminate */
+};
/*
- * cpuset_create - create a cpuset
- * ss: cpuset cgroup subsystem
- * cont: control group that the new cpuset will be part of
+ * cpuset_css_alloc - allocate a cpuset css
+ * cgrp: control group that the new cpuset will be part of
*/
-static struct cgroup_subsys_state *cpuset_create(
- struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
- struct cpuset *parent;
- if (!cont->parent) {
+ if (!parent_css)
return &top_cpuset.css;
- }
- parent = cgroup_cs(cont->parent);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1880,49 +1874,107 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
- cs->parent = parent;
- number_of_cpusets++;
- return &cs->css ;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ cpuset_inc();
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * histrical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ mutex_unlock(&callback_mutex);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
-static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- number_of_cpusets--;
+ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
-struct cgroup_subsys cpuset_subsys = {
- .name = "cpuset",
- .create = cpuset_create,
- .destroy = cpuset_destroy,
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .populate = cpuset_populate,
- .post_clone = cpuset_post_clone,
- .subsys_id = cpuset_subsys_id,
+ .base_cftypes = files,
.early_init = 1,
};
@@ -1953,226 +2005,230 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
- number_of_cpusets = 1;
return 0;
}
-/**
- * cpuset_do_move_task - move a given task to another cpuset
- * @tsk: pointer to task_struct the task to move
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- * Return nonzero to stop the walk through the tasks.
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
*/
-static void cpuset_do_move_task(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
- struct cgroup *new_cgroup = scan->data;
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
- cgroup_attach_task(new_cgroup, tsk);
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
}
/**
- * move_member_tasks_to_cpuset - move tasks from one cpuset to another
- * @from: cpuset in which the tasks currently reside
- * @to: cpuset to which the tasks will be moved
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset in interest
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems;
+ bool is_empty;
+ bool sane = cgroup_sane_behavior(cs->css.cgroup);
- scan.cg = from->css.cgroup;
- scan.test_task = NULL; /* select all tasks in cgroup */
- scan.process_task = cpuset_do_move_task;
- scan.heap = NULL;
- scan.data = to->css.cgroup;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- if (cgroup_scan_tasks(&scan))
- printk(KERN_ERR "move_member_tasks_to_cpuset: "
- "cgroup_scan_tasks failed\n");
-}
+ mutex_lock(&cpuset_mutex);
-/*
- * If CPU and/or memory hotplug handlers, below, unplug any CPUs
- * or memory nodes, we need to walk over the cpuset hierarchy,
- * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then move the tasks in the empty
- * cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
- */
-static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
-{
- struct cpuset *parent;
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
+ }
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
+ * If sane_behavior flag is set, we need to update tasks' cpumask
+ * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
+ * call update_tasks_cpumask() if the cpuset becomes empty, as
+ * the tasks in it will be migrated to an ancestor.
*/
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
+ if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+ (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+ update_tasks_cpumask(cs);
+
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
/*
- * Find its next-highest non-empty parent, (top cpuset
- * has online cpus, so can't be empty).
+ * If sane_behavior flag is set, we need to update tasks' nodemask
+ * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
+ * call update_tasks_nodemask() if the cpuset becomes empty, as
+ * the tasks in it will be migratd to an ancestor.
*/
- parent = cs->parent;
- while (cpumask_empty(parent->cpus_allowed) ||
- nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ if ((sane && nodes_empty(cs->mems_allowed)) ||
+ (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
- move_member_tasks_to_cpuset(cs, parent);
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+ *
+ * Otherwise move tasks to the nearest ancestor with execution
+ * resources. This is full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (!sane && is_empty)
+ remove_tasks_in_empty_cpuset(cs);
}
-/*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
*
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static void scan_for_empty_cpusets(struct cpuset *root)
-{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- list_add_tail((struct list_head *)&root->stack_list, &queue);
-
- while (!list_empty(&queue)) {
- cp = list_first_entry(&queue, struct cpuset, stack_list);
- list_del(queue.next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &queue);
- }
+static void cpuset_hotplug_workfn(struct work_struct *work)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
- /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
- nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
- continue;
+ mutex_lock(&cpuset_mutex);
+
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
- oldmems = cp->mems_allowed;
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
- /* Remove offline cpus and mems from this cpuset. */
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_HIGH_MEMORY]);
+ top_cpuset.mems_allowed = new_mems;
mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset);
+ }
+
+ mutex_unlock(&cpuset_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed) ||
- nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else {
- update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems, NULL);
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
}
+ rcu_read_unlock();
}
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
- scan_for_empty_cpusets(&top_cpuset);
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- oldmems = top_cpuset.mems_allowed;
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
- break;
- case MEM_OFFLINE:
- /*
- * needn't update top_cpuset.mems_allowed explicitly because
- * scan_for_empty_cpusets() will update it.
- */
- scan_for_empty_cpusets(&top_cpuset);
- break;
- default:
- break;
- }
- cgroup_unlock();
-
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+ top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
/**
@@ -2182,28 +2238,29 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
+ struct cpuset *cpus_cs;
+
mutex_lock(&callback_mutex);
- task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
- task_unlock(tsk);
+ rcu_read_lock();
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ guarantee_online_cpus(cpus_cs, pmask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
-int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- const struct cpuset *cs;
- int cpu;
+ struct cpuset *cpus_cs;
rcu_read_lock();
- cs = task_cs(tsk);
- if (cs)
- do_set_cpus_allowed(tsk, cs->cpus_allowed);
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2219,22 +2276,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
* set any mask even if it is not right from task_cs() pov,
* the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
*/
-
- cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
- if (cpu >= nr_cpu_ids) {
- /*
- * Either tsk->cpus_allowed is wrong (see above) or it
- * is actually empty. The latter case is only possible
- * if we are racing with remove_tasks_in_empty_cpuset().
- * Like above we can temporary set any mask and rely on
- * set_cpus_allowed_ptr() as synchronization point.
- */
- do_set_cpus_allowed(tsk, cpu_possible_mask);
- cpu = cpumask_any(cpu_active_mask);
- }
-
- return cpu;
}
void cpuset_init_current_mems_allowed(void)
@@ -2248,18 +2293,20 @@ void cpuset_init_current_mems_allowed(void)
*
* Description: Returns the nodemask_t mems_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * subset of node_states[N_MEMORY], even if this means going outside the
* tasks cpuset.
**/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
+ struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
- task_lock(tsk);
- guarantee_online_mems(task_cs(tsk), &mask);
- task_unlock(tsk);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &mask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
return mask;
@@ -2282,10 +2329,10 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
* callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
-static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2352,7 +2399,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
*/
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- const struct cpuset *cs; /* current cpuset ancestors */
+ struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2375,11 +2422,11 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
mutex_lock(&callback_mutex);
- task_lock(current);
+ rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
- task_unlock(current);
-
allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+
mutex_unlock(&callback_mutex);
return allowed;
}
@@ -2423,17 +2470,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
@@ -2508,26 +2544,33 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
+#define CPUSET_NODELIST_LEN (256)
+
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @task: pointer to task_struct of some task.
+ * @tsk: pointer to task_struct of some task.
*
* Description: Prints @task's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log. Must hold task_lock(task) to allow
- * dereferencing task_cs(task).
+ * mems_allowed to the kernel log.
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- struct dentry *dentry;
+ /* Statically allocated to prevent using excess stack. */
+ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+ static DEFINE_SPINLOCK(cpuset_buffer_lock);
+ struct cgroup *cgrp;
- dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+ rcu_read_lock();
+
+ cgrp = task_cs(tsk)->css.cgroup;
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
- printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cpuset_name, cpuset_nodelist);
+ pr_info("%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+
+ rcu_read_unlock();
spin_unlock(&cpuset_buffer_lock);
}
@@ -2559,9 +2602,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
void __cpuset_memory_pressure_bump(void)
{
- task_lock(current);
+ rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
- task_unlock(current);
+ rcu_read_unlock();
}
#ifdef CONFIG_PROC_PID_CPUSET
@@ -2571,19 +2614,19 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2593,44 +2636,32 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
- css = task_subsys_state(tsk, cpuset_subsys_id);
- retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
- if (retval < 0)
- goto out_unlock;
- seq_puts(m, buf);
+ retval = -ENAMETOOLONG;
+ rcu_read_lock();
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
+ rcu_read_unlock();
+ if (!p)
+ goto out_put_task;
+ seq_puts(m, p);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+ retval = 0;
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Mems_allowed:\t");
+ seq_puts(m, "Mems_allowed:\t");
seq_nodemask(m, &task->mems_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Mems_allowed_list:\t");
+ seq_puts(m, "\n");
+ seq_puts(m, "Mems_allowed_list:\t");
seq_nodemask_list(m, &task->mems_allowed);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a404..e0573a43c7d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#if 0
@@ -29,17 +30,6 @@
static struct kmem_cache *cred_jar;
/*
- * The common credentials for the initial task's thread group
- */
-#ifdef CONFIG_KEYS
-static struct thread_group_cred init_tgcred = {
- .usage = ATOMIC_INIT(2),
- .tgid = 0,
- .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
-};
-#endif
-
-/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
@@ -48,6 +38,14 @@ struct cred init_cred = {
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
@@ -56,9 +54,6 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
-#ifdef CONFIG_KEYS
- .tgcred = &init_tgcred,
-#endif
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -87,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
}
/*
- * Dispose of the shared task group credentials
- */
-#ifdef CONFIG_KEYS
-static void release_tgcred_rcu(struct rcu_head *rcu)
-{
- struct thread_group_cred *tgcred =
- container_of(rcu, struct thread_group_cred, rcu);
-
- BUG_ON(atomic_read(&tgcred->usage) != 0);
-
- key_put(tgcred->session_keyring);
- key_put(tgcred->process_keyring);
- kfree(tgcred);
-}
-#endif
-
-/*
- * Release a set of thread group credentials.
- */
-static void release_tgcred(struct cred *cred)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = cred->tgcred;
-
- if (atomic_dec_and_test(&tgcred->usage))
- call_rcu(&tgcred->rcu, release_tgcred_rcu);
-#endif
-}
-
-/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
@@ -141,12 +106,14 @@ static void put_cred_rcu(struct rcu_head *rcu)
#endif
security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
- release_tgcred(cred);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -197,13 +164,6 @@ void exit_creds(struct task_struct *tsk)
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
-
- cred = (struct cred *) tsk->replacement_session_keyring;
- if (cred) {
- tsk->replacement_session_keyring = NULL;
- validate_creds(cred);
- put_cred(cred);
- }
}
/**
@@ -243,15 +203,6 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
- if (!new->tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
- atomic_set(&new->tgcred->usage, 1);
-#endif
-
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
@@ -302,11 +253,13 @@ struct cred *prepare_creds(void)
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
+ get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
- atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
@@ -330,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
*/
struct cred *prepare_exec_creds(void)
{
- struct thread_group_cred *tgcred = NULL;
struct cred *new;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred)
- return NULL;
-#endif
-
new = prepare_creds();
- if (!new) {
- kfree(tgcred);
+ if (!new)
return new;
- }
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
- /* create a new per-thread-group creds for all this set of threads to
- * share */
- memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
-
/* inherit the session keyring; new process keyring */
- key_get(tgcred->session_keyring);
- tgcred->process_keyring = NULL;
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
#endif
return new;
@@ -379,9 +313,6 @@ struct cred *prepare_exec_creds(void)
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
struct cred *new;
int ret;
@@ -411,11 +342,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
goto error_put;
}
- /* cache user_ns in cred. Doesn't need a refcount because it will
- * stay pinned by cred->user
- */
- new->user_ns = new->user->user_ns;
-
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
@@ -426,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
install_thread_keyring_to_cred(new);
}
- /* we share the process and session keyrings between all the threads in
- * a process - this is slightly icky as we violate COW credentials a
- * bit */
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
if (!(clone_flags & CLONE_THREAD)) {
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- ret = -ENOMEM;
- goto error_put;
- }
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = key_get(new->tgcred->session_keyring);
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
}
#endif
@@ -456,6 +372,31 @@ error_put:
return ret;
}
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -490,11 +431,11 @@ int commit_creds(struct cred *new)
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (old->euid != new->euid ||
- old->egid != new->egid ||
- old->fsuid != new->fsuid ||
- old->fsgid != new->fsgid ||
- !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
@@ -502,9 +443,9 @@ int commit_creds(struct cred *new)
}
/* alter the thread keyring */
- if (new->fsuid != old->fsuid)
+ if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(task);
- if (new->fsgid != old->fsgid)
+ if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(task);
/* do it
@@ -521,16 +462,16 @@ int commit_creds(struct cred *new)
alter_cred_subscribers(old, -2);
/* send notifications */
- if (new->uid != old->uid ||
- new->euid != old->euid ||
- new->suid != old->suid ||
- new->fsuid != old->fsuid)
+ if (!uid_eq(new->uid, old->uid) ||
+ !uid_eq(new->euid, old->euid) ||
+ !uid_eq(new->suid, old->suid) ||
+ !uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
- if (new->gid != old->gid ||
- new->egid != old->egid ||
- new->sgid != old->sgid ||
- new->fsgid != old->fsgid)
+ if (!gid_eq(new->gid, old->gid) ||
+ !gid_eq(new->egid, old->egid) ||
+ !gid_eq(new->sgid, old->sgid) ||
+ !gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
@@ -644,9 +585,6 @@ void __init cred_init(void)
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
const struct cred *old;
struct cred *new;
@@ -654,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
-#endif
-
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
@@ -675,16 +605,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
+ get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = NULL;
- new->tgcred = tgcred;
- new->request_key_auth = NULL;
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
@@ -799,9 +727,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- cred->uid, cred->euid, cred->suid, cred->fsuid);
+ from_kuid_munged(&init_user_ns, cred->uid),
+ from_kuid_munged(&init_user_ns, cred->euid),
+ from_kuid_munged(&init_user_ns, cred->suid),
+ from_kuid_munged(&init_user_ns, cred->fsuid));
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kgid_munged(&init_user_ns, cred->gid),
+ from_kgid_munged(&init_user_ns, cred->egid),
+ from_kgid_munged(&init_user_ns, cred->sgid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784ef..1adf62b39b9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
*/
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
+#include <linux/serial_core.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/console.h>
@@ -41,18 +42,19 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
+#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include "debug_core.h"
@@ -75,6 +77,8 @@ static int exception_level;
struct kgdb_io *dbg_io_ops;
static DEFINE_SPINLOCK(kgdb_registration_lock);
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
/* kgdb console driver is loaded */
static int kgdb_con_registered;
/* determine if kgdb console output should be used */
@@ -96,6 +100,7 @@ static int __init opt_kgdb_con(char *str)
early_param("kgdbcon", opt_kgdb_con);
module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
/*
* Holds information about breakpoints in a kernel. These breakpoints are
@@ -157,37 +162,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
* Weak aliases for breakpoint management,
* can be overriden by architectures when needed:
*/
-int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
if (err)
return err;
-
- return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
- BREAK_INSTR_SIZE);
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ return err;
}
-int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- return probe_kernel_write((char *)addr,
- (char *)bundle, BREAK_INSTR_SIZE);
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
int __weak kgdb_validate_break_address(unsigned long addr)
{
- char tmp_variable[BREAK_INSTR_SIZE];
+ struct kgdb_bkpt tmp;
int err;
- /* Validate setting the breakpoint and then removing it. In the
+ /* Validate setting the breakpoint and then removing it. If the
* remove fails, the kernel needs to emit a bad message because we
* are deep trouble not being able to put things back the way we
* found them.
*/
- err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+ tmp.bpt_addr = addr;
+ err = kgdb_arch_set_breakpoint(&tmp);
if (err)
return err;
- err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+ err = kgdb_arch_remove_breakpoint(&tmp);
if (err)
printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
"memory destroyed at: %lx", addr);
@@ -218,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm && current->mm->mmap_cache) {
- flush_cache_range(current->mm->mmap_cache,
- addr, addr + BREAK_INSTR_SIZE);
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
}
+
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
@@ -231,7 +245,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
*/
int dbg_activate_sw_breakpoints(void)
{
- unsigned long addr;
int error;
int ret = 0;
int i;
@@ -240,16 +253,15 @@ int dbg_activate_sw_breakpoints(void)
if (kgdb_break[i].state != BP_SET)
continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_set_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
if (error) {
ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+ printk(KERN_INFO "KGDB: BP install failed: %lx",
+ kgdb_break[i].bpt_addr);
continue;
}
- kgdb_flush_swbreak_addr(addr);
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
kgdb_break[i].state = BP_ACTIVE;
}
return ret;
@@ -298,7 +310,6 @@ int dbg_set_sw_break(unsigned long addr)
int dbg_deactivate_sw_breakpoints(void)
{
- unsigned long addr;
int error;
int ret = 0;
int i;
@@ -306,15 +317,14 @@ int dbg_deactivate_sw_breakpoints(void)
for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
if (kgdb_break[i].state != BP_ACTIVE)
continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+ printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
ret = error;
}
- kgdb_flush_swbreak_addr(addr);
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
kgdb_break[i].state = BP_SET;
}
return ret;
@@ -348,7 +358,6 @@ int kgdb_isremovedbreak(unsigned long addr)
int dbg_remove_all_break(void)
{
- unsigned long addr;
int error;
int i;
@@ -356,12 +365,10 @@ int dbg_remove_all_break(void)
for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
if (kgdb_break[i].state != BP_ACTIVE)
goto setundefined;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error)
printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
- addr);
+ kgdb_break[i].bpt_addr);
setundefined:
kgdb_break[i].state = BP_UNDEFINED;
}
@@ -527,7 +534,7 @@ return_normal:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
@@ -576,8 +583,12 @@ return_normal:
raw_spin_lock(&dbg_slave_lock);
#ifdef CONFIG_SMP
+ /* If send_ready set, slaves are already waiting */
+ if (ks->send_ready)
+ atomic_set(ks->send_ready, 1);
+
/* Signal the other CPUs to enter kgdb_wait() */
- if ((!kgdb_single_step) && kgdb_do_roundup)
+ else if ((!kgdb_single_step) && kgdb_do_roundup)
kgdb_roundup_cpus(flags);
#endif
@@ -651,7 +662,7 @@ kgdb_restore:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&masters_in_kgdb);
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
@@ -674,22 +685,46 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
+
+ memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
ks->signo = signo;
ks->err_code = ecode;
- ks->kgdb_usethreadid = 0;
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
- return 0; /* Ouch, double exception ! */
+ goto out; /* Ouch, double exception ! */
if (kgdb_info[ks->cpu].enter_kgdb != 0)
- return 0;
+ goto out;
+
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
+}
- return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
}
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
int kgdb_nmicallback(int cpu, void *regs)
{
#ifdef CONFIG_SMP
@@ -709,6 +744,31 @@ int kgdb_nmicallback(int cpu, void *regs)
return 1;
}
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
+ atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+ if (!kgdb_io_ready(0) || !send_ready)
+ return 1;
+
+ if (kgdb_info[cpu].enter_kgdb == 0) {
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->ex_vector = trapnr;
+ ks->signo = SIGTRAP;
+ ks->err_code = err_code;
+ ks->linux_regs = regs;
+ ks->send_ready = send_ready;
+ kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
static void kgdb_console_write(struct console *co, const char *s,
unsigned count)
{
@@ -752,7 +812,7 @@ static void sysrq_handle_dbg(int key)
static struct sysrq_key_op sysrq_dbg_op = {
.handler = sysrq_handle_dbg,
- .help_msg = "debug(G)",
+ .help_msg = "debug(g)",
.action_msg = "DEBUG",
};
#endif
@@ -784,6 +844,33 @@ void __init dbg_late_init(void)
kdb_init(KDB_INIT_FULL);
}
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+ /*
+ * Take the following action on reboot notify depending on value:
+ * 1 == Enter debugger
+ * 0 == [the default] detatch debug client
+ * -1 == Do nothing... and use this until the board resets
+ */
+ switch (kgdbreboot) {
+ case 1:
+ kgdb_breakpoint();
+ case -1:
+ goto done;
+ }
+ if (!dbg_kdb_mode)
+ gdbstub_exit(code);
+done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+ .notifier_call = dbg_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX,
+};
+
static void kgdb_register_callbacks(void)
{
if (!kgdb_io_module_registered) {
@@ -791,6 +878,8 @@ static void kgdb_register_callbacks(void)
kgdb_arch_init();
if (!dbg_is_early)
kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
+ register_reboot_notifier(&dbg_reboot_notifier);
atomic_notifier_chain_register(&panic_notifier_list,
&kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +901,8 @@ static void kgdb_unregister_callbacks(void)
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
+ unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
atomic_notifier_chain_unregister(&panic_notifier_list,
&kgdb_panic_event_nb);
kgdb_arch_exit();
@@ -952,7 +1043,7 @@ int dbg_io_get_char(void)
* otherwise as a quick means to stop program execution and "break" into
* the debugger.
*/
-void kgdb_breakpoint(void)
+noinline void kgdb_breakpoint(void)
{
atomic_inc(&kgdb_setting_breakpoint);
wmb(); /* Sync point before breakpoint */
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7..127d9bc49fb 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
+ atomic_t *send_ready;
};
/* Exception state values */
@@ -72,6 +73,8 @@ extern int dbg_kdb_mode;
#ifdef CONFIG_KGDB_KDB
extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad8..19d9a578c75 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/serial_core.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
len = len / 2;
remcom_out_buffer[len++] = 0;
+ kdb_common_init_state(ks);
kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
strcpy(remcom_out_buffer, "OK");
}
break;
@@ -1111,6 +1115,13 @@ void gdbstub_exit(int status)
unsigned char checksum, ch, buffer[3];
int loop;
+ if (!kgdb_connected)
+ return;
+ kgdb_connected = 0;
+
+ if (!dbg_io_ops || dbg_kdb_mode)
+ return;
+
buffer[0] = 'W';
buffer[1] = hex_asc_hi(status);
buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1140,6 @@ void gdbstub_exit(int status)
dbg_io_ops->write_char(hex_asc_lo(checksum));
/* make sure the output is flushed, lest the bootloader clobber it */
- dbg_io_ops->flush();
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459..70a504601dc 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!bp->bp_type) {
+ kdb_printf("Software breakpoints are unavailable.\n"
+ " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " OR use hw breaks: help bph\n");
+ }
+#endif
return 1;
}
return 0;
@@ -479,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
/*
* kdb_ss
*
- * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
- * commands.
+ * Process the 'ss' (Single Step) command.
*
* ss
- * ssb
*
* Parameters:
* argc Argument count
@@ -491,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
* Outputs:
* None.
* Returns:
- * KDB_CMD_SS[B] for success, a kdb error if failure.
+ * KDB_CMD_SS for success, a kdb error if failure.
* Locking:
* None.
* Remarks:
*
* Set the arch specific option to trigger a debug trap after the next
* instruction.
- *
- * For 'ssb', set the trace flag in the debug trap handler
- * after printing the current insn and return directly without
- * invoking the kdb command processor, until a branch instruction
- * is encountered.
*/
static int kdb_ss(int argc, const char **argv)
{
- int ssb = 0;
-
- ssb = (strcmp(argv[0], "ssb") == 0);
if (argc != 0)
return KDB_ARGCOUNT;
/*
* Set trace flag and go.
*/
KDB_STATE_SET(DOING_SS);
- if (ssb) {
- KDB_STATE_SET(DOING_SSB);
- return KDB_CMD_SSB;
- }
return KDB_CMD_SS;
}
@@ -554,8 +547,6 @@ void __init kdb_initbptab(void)
kdb_register_repeat("ss", kdb_ss, "",
"Single Step", 1, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("ssb", kdb_ss, "",
- "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41..fe15fff5df5 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,14 +15,13 @@
#include <linux/sched.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
-#include <asm/system.h>
#include "kdb_private.h"
static void kdb_show_stack(struct task_struct *p, void *addr)
{
int old_lvl = console_loglevel;
- console_loglevel = 15;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
kdb_set_current_task(p);
if (addr) {
@@ -130,6 +129,8 @@ kdb_bt(int argc, const char **argv)
}
/* Now the inactive tasks */
kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
if (task_curr(p))
continue;
if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff1..8859ca34dcf 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
#include <linux/kdb.h>
#include <linux/kdebug.h>
#include <linux/export.h>
+#include <linux/hardirq.h>
#include "kdb_private.h"
#include "../debug_core.h"
@@ -33,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
static struct kgdb_state *kdb_ks;
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
@@ -52,6 +69,12 @@ int kdb_stub(struct kgdb_state *ks)
if (atomic_read(&kgdb_setting_breakpoint))
reason = KDB_REASON_KEYBOARD;
+ if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+ reason = KDB_REASON_SYSTEM_NMI;
+
+ else if (in_nmi())
+ reason = KDB_REASON_NMI;
+
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
reason = KDB_REASON_BREAK;
@@ -90,13 +113,10 @@ int kdb_stub(struct kgdb_state *ks)
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
- kdb_initial_cpu = atomic_read(&kgdb_active);
- kdb_current_task = kgdb_info[ks->cpu].task;
- kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ kdb_common_init_state(ks);
/* Remove any breakpoints as needed by kdb and clear single step */
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
- KDB_STATE_CLEAR(DOING_SSB);
KDB_STATE_SET(PAGER);
/* zero out any offline cpu data */
for_each_present_cpu(i) {
@@ -121,9 +141,7 @@ int kdb_stub(struct kgdb_state *ks)
* Upon exit from the kdb main loop setup break points and restart
* the system based on the requested continue state
*/
- kdb_initial_cpu = -1;
- kdb_current_task = NULL;
- kdb_current_regs = NULL;
+ kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e..7c70812caea 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
{
int diag;
int linecount;
+ int colcount;
int logging, saved_loglevel = 0;
int saved_trap_printk;
int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
if (diag || linecount <= 1)
linecount = 24;
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
diag = kdbgetintenv("LOGGING", &logging);
if (diag)
logging = 0;
@@ -689,8 +694,8 @@ kdb_printit:
if (!dbg_kdb_mode && kgdb_connected) {
gdbstub_msg_write(kdb_buffer, retlen);
} else {
- if (!dbg_io_ops->is_console) {
- len = strlen(kdb_buffer);
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = retlen;
cp = kdb_buffer;
while (len--) {
dbg_io_ops->write_char(*cp);
@@ -705,19 +710,34 @@ kdb_printit:
}
if (logging) {
saved_loglevel = console_loglevel;
- console_loglevel = 0;
+ console_loglevel = CONSOLE_LOGLEVEL_SILENT;
printk(KERN_INFO "%s", kdb_buffer);
}
- if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
- kdb_nextline++;
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
/* check for having reached the LINES number of printed lines */
- if (kdb_nextline == linecount) {
+ if (kdb_nextline >= linecount) {
char buf1[16] = "";
-#if defined(CONFIG_SMP)
- char buf2[32];
-#endif
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
@@ -732,18 +752,10 @@ kdb_printit:
if (moreprompt == NULL)
moreprompt = "more> ";
-#if defined(CONFIG_SMP)
- if (strchr(moreprompt, '%')) {
- sprintf(buf2, moreprompt, get_cpu());
- put_cpu();
- moreprompt = buf2;
- }
-#endif
-
kdb_input_flush();
c = console_drivers;
- if (!dbg_io_ops->is_console) {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(moreprompt);
cp = moreprompt;
while (len--) {
@@ -776,7 +788,7 @@ kdb_printit:
kdb_grepping_flag = 0;
kdb_printf("\n");
} else if (buf1[0] == ' ') {
- kdb_printf("\n");
+ kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
} else if (buf1[0] == '\n') {
kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c..118527aa60e 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
static int kbd_exists;
+static int kbd_last_ret;
/*
* Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
return -1;
}
- if ((scancode & 0x80) != 0)
+ if ((scancode & 0x80) != 0) {
+ if (scancode == 0x9c)
+ kbd_last_ret = 0;
return -1;
+ }
scancode &= 0x7f;
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
return -1; /* ignore unprintables */
}
- if ((scancode & 0x7f) == 0x1c) {
- /*
- * enter key. All done. Absorb the release scancode.
- */
+ if (scancode == 0x1c) {
+ kbd_last_ret = 1;
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+ int scancode, scanstatus;
+
+ /*
+ * Nothing to clean up, since either
+ * ENTER was never pressed, or has already
+ * gotten cleaned up.
+ */
+ if (!kbd_last_ret)
+ return;
+
+ kbd_last_ret = 0;
+ /*
+ * Enter key. Need to absorb the break code here, lest it gets
+ * leaked out if we exit KDB as the result of processing 'g'.
+ *
+ * This has several interesting implications:
+ * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+ * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+ * only get a break code at the end of the repeated
+ * sequence. This means we can't propagate the repeated key
+ * press, and must swallow it away.
+ * + Need to handle possible PS/2 mouse input.
+ * + Need to handle mashed keys.
+ */
+
+ while (1) {
while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
- ;
+ cpu_relax();
/*
- * Fetch the scancode
+ * Fetch the scancode.
*/
scancode = inb(KBD_DATA_REG);
scanstatus = inb(KBD_STATUS_REG);
- while (scanstatus & KBD_STAT_MOUSE_OBF) {
- scancode = inb(KBD_DATA_REG);
- scanstatus = inb(KBD_STATUS_REG);
- }
+ /*
+ * Skip mouse input.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ continue;
- if (scancode != 0x9c) {
- /*
- * Wasn't an enter-release, why not?
- */
- kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
- scancode, scanstatus);
- }
+ /*
+ * If we see 0xe0, this is either a break code for KP
+ * ENTER, or a repeat make for KP ENTER. Either way,
+ * since the second byte is equivalent to an ENTER,
+ * skip the 0xe0 and try again.
+ *
+ * If we see 0x1c, this must be a repeat ENTER or KP
+ * ENTER (and we swallowed 0xe0 before). Try again.
+ *
+ * We can also see make and break codes for other keys
+ * mashed before or after pressing ENTER. Thus, if we
+ * see anything other than 0x9c, we have to try again.
+ *
+ * Note, if you held some key as ENTER was depressed,
+ * that break code would get leaked out.
+ */
+ if (scancode != 0x9c)
+ continue;
- return 13;
+ return;
}
-
- return keychar & 0xff;
}
-EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437..2f7c760305c 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,12 +14,14 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/smp.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -122,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
};
#undef KDBMSG
-static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
@@ -138,11 +140,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
static char *__env[] = {
#if defined(CONFIG_SMP)
"PROMPT=[%d]kdb> ",
- "MOREPROMPT=[%d]more> ",
#else
"PROMPT=kdb> ",
- "MOREPROMPT=more> ",
#endif
+ "MOREPROMPT=more> ",
"RADIX=16",
"MDCOUNT=8", /* lines of md output */
KDB_PLATFORM_ENV,
@@ -174,7 +175,7 @@ static char *__env[] = {
(char *)0,
};
-static const int __nenv = (sizeof(__env) / sizeof(char *));
+static const int __nenv = ARRAY_SIZE(__env);
struct task_struct *kdb_curr_task(int cpu)
{
@@ -680,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
}
if (argc != 3)
return KDB_ARGCOUNT;
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set) {
- kdb_printf("Could not allocate new defcmd_set entry for %s\n",
- argv[1]);
- defcmd_set = save_defcmd_set;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
memcpy(defcmd_set, save_defcmd_set,
defcmd_set_count * sizeof(*defcmd_set));
- kfree(save_defcmd_set);
s = defcmd_set + defcmd_set_count;
memset(s, 0, sizeof(*s));
s->usable = 1;
s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
if (s->usage[0] == '"') {
- strcpy(s->usage, s->usage+1);
+ strcpy(s->usage, argv[2]+1);
s->usage[strlen(s->usage)-1] = '\0';
}
if (s->help[0] == '"') {
- strcpy(s->help, s->help+1);
+ strcpy(s->help, argv[3]+1);
s->help[strlen(s->help)-1] = '\0';
}
++defcmd_set_count;
defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
}
/*
@@ -1074,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
static void kdb_dumpregs(struct pt_regs *regs)
{
int old_lvl = console_loglevel;
- console_loglevel = 15;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
show_regs(regs);
kdb_trap_printk--;
@@ -1111,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
* KDB_CMD_GO User typed 'go'.
* KDB_CMD_CPU User switched to another cpu.
* KDB_CMD_SS Single step.
- * KDB_CMD_SSB Single step until branch.
*/
static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_dbtrap_t db_result)
@@ -1150,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
instruction_pointer(regs));
break;
- case KDB_DB_SSB:
- /*
- * In the midst of ssb command. Just return.
- */
- KDB_DEBUG_STATE("kdb_local 3", reason);
- return KDB_CMD_SSB; /* Continue with SSB command */
-
- break;
case KDB_DB_SS:
break;
case KDB_DB_SSBPT:
@@ -1192,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
instruction_pointer(regs));
kdb_dumpregs(regs);
break;
+ case KDB_REASON_SYSTEM_NMI:
+ kdb_printf("due to System NonMaskable Interrupt\n");
+ break;
case KDB_REASON_NMI:
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
@@ -1235,18 +1246,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*cmdbuf = '\0';
*(cmd_hist[cmd_head]) = '\0';
- if (KDB_FLAG(ONLY_DO_DUMP)) {
- /* kdb is off but a catastrophic error requires a dump.
- * Take the dump and reboot.
- * Turn on logging so the kdb output appears in the log
- * buffer in the dump.
- */
- const char *setargs[] = { "set", "LOGGING", "1" };
- kdb_set(2, setargs);
- kdb_reboot(0, NULL);
- /*NOTREACHED*/
- }
-
do_full_getstr:
#if defined(CONFIG_SMP)
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
@@ -1292,7 +1291,6 @@ do_full_getstr:
if (diag == KDB_CMD_GO
|| diag == KDB_CMD_CPU
|| diag == KDB_CMD_SS
- || diag == KDB_CMD_SSB
|| diag == KDB_CMD_KGDB)
break;
@@ -1379,12 +1377,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
break;
}
- if (result == KDB_CMD_SSB) {
- KDB_STATE_SET(DOING_SS);
- KDB_STATE_SET(DOING_SSB);
- break;
- }
-
if (result == KDB_CMD_KGDB) {
if (!KDB_STATE(DOING_KGDB))
kdb_printf("Entering please attach debugger "
@@ -1400,6 +1392,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
if (KDB_STATE(DOING_SS))
KDB_STATE_CLEAR(SSBPT);
+ /* Clean up any keyboard devices before leaving */
+ kdb_kbd_cleanup_state();
+
return result;
}
@@ -1978,6 +1973,8 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("Module Size modstruct Used by\n");
list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
@@ -2037,8 +2034,15 @@ static int kdb_env(int argc, const char **argv)
*/
static int kdb_dmesg(int argc, const char **argv)
{
- char *syslog_data[4], *start, *end, c = '\0', *p;
- int diag, logging, logsize, lines = 0, adjust = 0, n;
+ int diag;
+ int logging;
+ int lines = 0;
+ int adjust = 0;
+ int n = 0;
+ int skip = 0;
+ struct kmsg_dumper dumper = { .active = 1 };
+ size_t len;
+ char buf[201];
if (argc > 2)
return KDB_ARGCOUNT;
@@ -2061,22 +2065,10 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
- * logical start, end+1. */
- kdb_syslog_data(syslog_data);
- if (syslog_data[2] == syslog_data[3])
- return 0;
- logsize = syslog_data[1] - syslog_data[0];
- start = syslog_data[2];
- end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
- for (n = 0, p = start; p < end; ++p) {
- c = *KDB_WRAP(p);
- if (c == '\n')
- ++n;
- }
- if (c != '\n')
- ++n;
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ n++;
+
if (lines < 0) {
if (adjust >= n)
kdb_printf("buffer only contains %d lines, nothing "
@@ -2084,21 +2076,11 @@ static int kdb_dmesg(int argc, const char **argv)
else if (adjust - lines >= n)
kdb_printf("buffer only contains %d lines, last %d "
"lines printed\n", n, n - adjust);
- if (adjust) {
- for (; start < end && adjust; ++start) {
- if (*KDB_WRAP(start) == '\n')
- --adjust;
- }
- if (start < end)
- ++start;
- }
- for (p = start; p < end && lines; ++p) {
- if (*KDB_WRAP(p) == '\n')
- ++lines;
- }
- end = p;
+ skip = adjust;
+ lines = abs(lines);
} else if (lines > 0) {
- int skip = n - (adjust + lines);
+ skip = n - lines - adjust;
+ lines = abs(lines);
if (adjust >= n) {
kdb_printf("buffer only contains %d lines, "
"nothing printed\n", n);
@@ -2109,39 +2091,56 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_printf("buffer only contains %d lines, first "
"%d lines printed\n", n, lines);
}
- for (; start < end && skip; ++start) {
- if (*KDB_WRAP(start) == '\n')
- --skip;
- }
- for (p = start; p < end && lines; ++p) {
- if (*KDB_WRAP(p) == '\n')
- --lines;
- }
- end = p;
+ } else {
+ lines = n;
}
- /* Do a line at a time (max 200 chars) to reduce protocol overhead */
- c = '\n';
- while (start != end) {
- char buf[201];
- p = buf;
+
+ if (skip >= n || skip < 0)
+ return 0;
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ if (skip) {
+ skip--;
+ continue;
+ }
+ if (!lines--)
+ break;
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- while (start < end && (c = *KDB_WRAP(start)) &&
- (p - buf) < sizeof(buf)-1) {
- ++start;
- *p++ = c;
- if (c == '\n')
- break;
- }
- *p = '\0';
- kdb_printf("%s", buf);
+
+ kdb_printf("%.*s\n", (int)len - 1, buf);
}
- if (c != '\n')
- kdb_printf("\n");
return 0;
}
#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
/*
* kdb_cpu - This function implements the 'cpu' command.
* cpu [<cpunum>]
@@ -2354,69 +2353,6 @@ static int kdb_pid(int argc, const char **argv)
return 0;
}
-/*
- * kdb_ll - This function implements the 'll' command which follows a
- * linked list and executes an arbitrary command for each
- * element.
- */
-static int kdb_ll(int argc, const char **argv)
-{
- int diag = 0;
- unsigned long addr;
- long offset = 0;
- unsigned long va;
- unsigned long linkoffset;
- int nextarg;
- const char *command;
-
- if (argc != 3)
- return KDB_ARGCOUNT;
-
- nextarg = 1;
- diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
- if (diag)
- return diag;
-
- diag = kdbgetularg(argv[2], &linkoffset);
- if (diag)
- return diag;
-
- /*
- * Using the starting address as
- * the first element in the list, and assuming that
- * the list ends with a null pointer.
- */
-
- va = addr;
- command = kdb_strdup(argv[3], GFP_KDB);
- if (!command) {
- kdb_printf("%s: cannot duplicate command\n", __func__);
- return 0;
- }
- /* Recursive use of kdb_parse, do not use argv after this point */
- argv = NULL;
-
- while (va) {
- char buf[80];
-
- if (KDB_FLAG(CMD_INTERRUPT))
- goto out;
-
- sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
- diag = kdb_parse(buf);
- if (diag)
- goto out;
-
- addr = va + linkoffset;
- if (kdb_getword(&va, addr, sizeof(va)))
- goto out;
- }
-
-out:
- kfree(command);
- return diag;
-}
-
static int kdb_kgdb(int argc, const char **argv)
{
return KDB_CMD_KGDB;
@@ -2434,11 +2370,15 @@ static int kdb_help(int argc, const char **argv)
kdb_printf("-----------------------------"
"-----------------------------\n");
for_each_kdbcmd(kt, i) {
- if (kt->cmd_name)
- kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
- kt->cmd_usage, kt->cmd_help);
+ char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
}
return 0;
}
@@ -2743,7 +2683,7 @@ int kdb_register_repeat(char *cmd,
(kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
kfree(kdb_commands);
}
- memset(new + kdb_max_commands, 0,
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
kdb_command_extend * sizeof(*new));
kdb_commands = new;
kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
@@ -2847,15 +2787,13 @@ static void __init kdb_inittab(void)
"Stack traceback", 1, KDB_REPEAT_NONE);
kdb_register_repeat("btp", kdb_bt, "<pid>",
"Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
- "Display stack all processes", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btc", kdb_bt, "",
"Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
KDB_REPEAT_NONE);
- kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
- "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
kdb_register_repeat("env", kdb_env, "",
"Show environment variables", 0, KDB_REPEAT_NONE);
kdb_register_repeat("set", kdb_set, "",
@@ -2886,6 +2824,10 @@ static void __init kdb_inittab(void)
kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
"Display syslog buffer", 0, KDB_REPEAT_NONE);
#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
+ }
kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
"Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40..7afd3c8c41d 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
#define KDB_CMD_SS (-1003)
-#define KDB_CMD_SSB (-1004)
#define KDB_CMD_KGDB (-1005)
/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
* kdb control */
#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
-#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
- * DOING_SS is also set */
#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
* after one ss, independent of
* DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
typedef enum {
KDB_DB_BPT, /* Breakpoint */
KDB_DB_SS, /* Single-step trap */
- KDB_DB_SSB, /* Single step to branch */
KDB_DB_SSBPT, /* Single step over breakpoint */
KDB_DB_NOBPT /* Spurious breakpoint */
} kdb_dbtrap_t;
@@ -205,7 +201,6 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
extern unsigned long kdb_task_state(const struct task_struct *p,
@@ -246,6 +241,13 @@ extern void debug_kusage(void);
extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
#ifdef CONFIG_MODULES
extern struct list_head *kdb_modules;
#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d218..d35cc2d3a4c 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
if (!pfn_valid(pfn))
return 1;
page = pfn_to_page(pfn);
- vaddr = kmap_atomic(page, KM_KDB);
+ vaddr = kmap_atomic(page);
memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
- kunmap_atomic(vaddr, KM_KDB);
+ kunmap_atomic(vaddr);
return 0;
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053a..54996b71e66 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,20 +106,17 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
-
- /* Though tsk->delays accessed later, early exit avoids
- * unnecessary returning of other data
- */
- if (!tsk->delays)
- goto done;
+ cputime_t utime, stime, stimescaled, utimescaled;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -155,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
-done:
return 0;
}
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c..6c6262f86c1 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
-#include <asm/system.h>
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66..e556751d15d 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
-
-#include <asm/elf.h>
-
+#include <linux/binfmts.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
return 0;
}
-int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
- unsigned long limit)
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
return 1;
}
-int __weak elf_core_write_extra_data(struct file *file, size_t *size,
- unsigned long limit)
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
{
return 1;
}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf..103f5d147b2 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg
endif
obj-y := core.o ring_buffer.o callchain.o
+
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f39..97b67df8fbf 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
err = alloc_callchain_buffers();
exit:
+ if (err)
+ atomic_dec(&nr_callchain_events);
+
mutex_unlock(&callchain_mutex);
return err;
@@ -153,11 +156,17 @@ put_callchain_entry(int rctx)
put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
}
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
int rctx;
struct perf_callchain_entry *entry;
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
entry = get_callchain_entry(&rctx);
if (rctx == -1)
@@ -168,18 +177,29 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
entry->nr = 0;
- if (!user_mode(regs)) {
+ if (kernel && !user_mode(regs)) {
perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
}
- if (regs) {
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9..6b17ac1b0c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -36,6 +37,10 @@
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/mman.h>
#include "internal.h"
@@ -116,7 +121,15 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
- PERF_FLAG_PID_CGROUP)
+ PERF_FLAG_PID_CGROUP |\
+ PERF_FLAG_FD_CLOEXEC)
+
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+ (PERF_SAMPLE_BRANCH_KERNEL |\
+ PERF_SAMPLE_BRANCH_HV)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
@@ -128,12 +141,14 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
+static atomic_t nr_freq_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -154,25 +169,130 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
/*
* max perf event sample rate
*/
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
- DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+void update_perf_cpu_limits(void)
+{
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+ do_div(tmp, 100);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ update_perf_cpu_limits();
return 0;
}
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+static DEFINE_PER_CPU(u64, running_sample_length);
+
+static void perf_duration_warn(struct irq_work *w)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ local_samples_len = __get_cpu_var(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ if (allowed_ns == 0)
+ return;
+
+ /* decay the counter by 1 average sample */
+ local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+ local_samples_len += sample_len_ns;
+ __get_cpu_var(running_sample_length) = local_samples_len;
+
+ /*
+ * note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * from having to maintain a count.
+ */
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ if (avg_local_sample_len <= allowed_ns)
+ return;
+
+ if (max_samples_per_tick <= 1)
+ return;
+
+ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+ update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
+}
+
static atomic64_t perf_event_id;
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
@@ -185,9 +305,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
-static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
-
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -225,6 +342,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
* if there is no cgroup event for the current CPU context.
@@ -232,8 +363,8 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
- return container_of(task_subsys_state(task, perf_subsys_id),
- struct perf_cgroup, css);
+ return container_of(task_css(task, perf_event_cgrp_id),
+ struct perf_cgroup, css);
}
static inline bool
@@ -242,12 +373,22 @@ perf_cgroup_match(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- return !event->cgrp || event->cgrp == cpuctx->cgrp;
-}
+ /* @event doesn't care about cgroup */
+ if (!event->cgrp)
+ return true;
-static inline void perf_get_cgroup(struct perf_event *event)
-{
- css_get(&event->cgrp->css);
+ /* wants specific cgroup scope but @cpuctx isn't associated with any */
+ if (!cpuctx->cgrp)
+ return false;
+
+ /*
+ * Cgroup scoping is recursive. An event enabled for a cgroup is
+ * also enabled for all its descendant cgroups. If @cpuctx's
+ * cgroup is a descendant of @event's (the test covers identity
+ * case), it's a match.
+ */
+ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
}
static inline void perf_put_cgroup(struct perf_event *event)
@@ -363,6 +504,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
/*
* perf_cgroup_events says at least one
@@ -386,9 +529,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
- /* set cgrp before ctxsw in to
- * allow event_filter_match() to not
- * have to pass task around
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
*/
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -459,14 +603,14 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct file *file;
- int ret = 0, fput_needed;
+ struct fd f = fdget(fd);
+ int ret = 0;
- file = fget_light(fd, &fput_needed);
- if (!file)
+ if (!f.file)
return -EBADF;
- css = cgroup_css_from_dir(file, perf_subsys_id);
+ css = css_tryget_online_from_dir(f.file->f_dentry,
+ &perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -475,9 +619,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
- /* must be done before we fput() the file */
- perf_get_cgroup(event);
-
/*
* all events in a group must monitor
* the same cgroup because a task belongs
@@ -488,7 +629,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
ret = -EINVAL;
}
out:
- fput_light(file, fput_needed);
+ fdput(f);
return ret;
}
@@ -612,6 +753,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
#endif
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+ struct perf_cpu_context *cpuctx;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ int rotations = 0;
+
+ WARN_ON(!irqs_disabled());
+
+ cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+ rotations = perf_rotate_context(cpuctx);
+
+ /*
+ * arm timer if needed
+ */
+ if (rotations) {
+ hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ ret = HRTIMER_RESTART;
+ }
+
+ return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ continue;
+
+ hrtimer_cancel(&cpuctx->hrtimer);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+ int timer;
+
+ /* no multiplexing needed for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ /*
+ * check default is sane, if not set then force to
+ * default interval (1/tick)
+ */
+ timer = pmu->hrtimer_interval_ms;
+ if (timer < 1)
+ timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* not for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ if (hrtimer_active(hr))
+ return;
+
+ if (!hrtimer_callback_running(hr))
+ __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+ 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -666,6 +907,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
+ ctx->generation++;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -714,8 +956,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
- rcu_read_lock();
retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
@@ -731,6 +983,8 @@ retry:
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
goto retry;
}
@@ -740,6 +994,7 @@ retry:
}
}
rcu_read_unlock();
+ preempt_enable();
return ctx;
}
@@ -881,12 +1136,26 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+
+ ctx->generation++;
+}
+
+/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
}
/*
@@ -934,9 +1203,18 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
event->header_size = size;
}
@@ -952,6 +1230,9 @@ static void perf_event__id_header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ size += sizeof(data->id);
+
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
@@ -1020,6 +1301,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack--;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1040,6 +1324,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
@@ -1118,6 +1404,8 @@ event_sched_out(struct perf_event *event,
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
+ perf_pmu_disable(event->pmu);
+
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
@@ -1134,6 +1422,8 @@ event_sched_out(struct perf_event *event,
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+
+ perf_pmu_enable(event->pmu);
}
static void
@@ -1156,6 +1446,11 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
/*
* Cross CPU call to remove a performance event
*
@@ -1164,12 +1459,15 @@ group_sched_out(struct perf_event *group_event,
*/
static int __perf_remove_from_context(void *info)
{
- struct perf_event *event = info;
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
@@ -1194,10 +1492,14 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
lockdep_assert_held(&ctx->mutex);
@@ -1206,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event)
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, event);
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
- if (!task_function_call(task, __perf_remove_from_context, event))
+ if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -1228,6 +1530,8 @@ retry:
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
+ if (detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -1235,7 +1539,7 @@ retry:
/*
* Cross CPU call to disable a performance event
*/
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -1374,6 +1678,9 @@ event_sched_in(struct perf_event *event,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
+ int ret = 0;
+
+ lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -1396,10 +1703,13 @@ event_sched_in(struct perf_event *event,
*/
smp_wmb();
+ perf_pmu_disable(event->pmu);
+
if (event->pmu->add(event, PERF_EF_START)) {
event->state = PERF_EVENT_STATE_INACTIVE;
event->oncpu = -1;
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1415,7 +1725,10 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- return 0;
+out:
+ perf_pmu_enable(event->pmu);
+
+ return ret;
}
static int
@@ -1424,7 +1737,7 @@ group_sched_in(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = group_event->pmu;
+ struct pmu *pmu = ctx->pmu;
u64 now = ctx->time;
bool simulate = false;
@@ -1435,6 +1748,7 @@ group_sched_in(struct perf_event *group_event,
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -1481,6 +1795,8 @@ group_error:
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
+
return -EAGAIN;
}
@@ -1627,6 +1943,8 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
event->ctx = ctx;
+ if (event->cpu != -1)
+ event->cpu = cpu;
if (!task) {
/*
@@ -1691,7 +2009,16 @@ static int __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- if (WARN_ON_ONCE(!ctx->is_active))
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -1734,8 +2061,10 @@ static int __perf_event_enable(void *info)
* If this event can't go on and it's part of a
* group, then the whole group has to come off.
*/
- if (leader != event)
+ if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
+ perf_cpu_hrtimer_restart(cpuctx);
+ }
if (leader->attr.pinned) {
update_group_times(leader);
leader->state = PERF_EVENT_STATE_ERROR;
@@ -1860,22 +2189,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
}
/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
- return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
- && ctx1->parent_gen == ctx2->parent_gen
- && !ctx1->pin_count && !ctx2->pin_count;
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
}
static void __perf_event_sync_stat(struct perf_event *event,
@@ -1924,9 +2269,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
perf_event_update_userpage(next_event);
}
-#define list_next_entry(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
@@ -1958,7 +2300,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
+ struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
@@ -1970,10 +2312,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
return;
rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp[ctxn];
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent || !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
@@ -2001,6 +2351,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
+unlock:
rcu_read_unlock();
if (do_switch) {
@@ -2195,6 +2546,64 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
}
/*
+ * When sampling the branck stack in system-wide, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branch from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /* no need to flush branch stack if not changing task */
+ if (prev == task)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ /*
+ * check if the context has at least one
+ * event using PERF_SAMPLE_BRANCH_STACK
+ */
+ if (cpuctx->ctx.nr_branch_stack > 0
+ && pmu->flush_branch_stack) {
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->flush_branch_stack();
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
*
@@ -2225,6 +2634,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
*/
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
+
+ /* check for system-wide branch_stack events */
+ if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+ perf_branch_stack_sched_in(prev, task);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2363,16 +2776,18 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ perf_pmu_disable(event->pmu);
+
hwc = &event->hw;
- if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
+ if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
- continue;
+ goto next;
/*
* stop the event and update event->count
@@ -2394,6 +2809,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+ next:
+ perf_pmu_enable(event->pmu);
}
perf_pmu_enable(ctx->pmu);
@@ -2418,7 +2835,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
* because they're strictly cpu affine and rotate_start is called with IRQs
* disabled, while rotate_context is called from IRQ context.
*/
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
int rotate = 0, remove = 1;
@@ -2457,7 +2874,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
+
+ return rotate;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (atomic_read(&nr_freq_events) ||
+ __this_cpu_read(perf_throttled_count))
+ return false;
+ else
+ return true;
}
+#endif
void perf_event_task_tick(void)
{
@@ -2478,10 +2908,6 @@ void perf_event_task_tick(void)
ctx = cpuctx->task_ctx;
if (ctx)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- if (cpuctx->jiffies_interval == 1 ||
- !(jiffies % cpuctx->jiffies_interval))
- perf_rotate_context(cpuctx);
}
}
@@ -2549,6 +2975,22 @@ out:
local_irq_restore(flags);
}
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -2771,35 +3213,51 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
-static void free_event(struct perf_event *event)
+static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
- irq_work_sync(&event->pending);
+ if (event->parent)
+ return;
- if (!event->parent) {
- if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec_deferred(&perf_sched_events);
- if (event->attr.mmap || event->attr.mmap_data)
- atomic_dec(&nr_mmap_events);
- if (event->attr.comm)
- atomic_dec(&nr_comm_events);
- if (event->attr.task)
- atomic_dec(&nr_task_events);
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- if (is_cgroup_event(event)) {
- atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_dec_deferred(&perf_sched_events);
- }
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
}
+ if (is_cgroup_event(event))
+ atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+}
- if (event->rb) {
- ring_buffer_put(event->rb);
- event->rb = NULL;
- }
+static void unaccount_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_dec(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_dec(&nr_comm_events);
+ if (event->attr.task)
+ atomic_dec(&nr_task_events);
+ if (event->attr.freq)
+ atomic_dec(&nr_freq_events);
if (is_cgroup_event(event))
- perf_detach_cgroup(event);
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (has_branch_stack(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+
+ unaccount_event_cpu(event, event->cpu);
+}
+
+static void __free_event(struct perf_event *event)
+{
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
if (event->destroy)
event->destroy(event);
@@ -2807,48 +3265,62 @@ static void free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->pmu)
+ module_put(event->pmu->module);
+
call_rcu(&event->rcu_head, free_event_rcu);
}
-int perf_event_release_kernel(struct perf_event *event)
+static void _free_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
+ irq_work_sync(&event->pending);
- WARN_ON_ONCE(ctx->parent_ctx);
- /*
- * There are two ways this annotation is useful:
- *
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
- *
- * 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
- */
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- raw_spin_unlock_irq(&ctx->lock);
- perf_remove_from_context(event);
- mutex_unlock(&ctx->mutex);
+ unaccount_event(event);
- free_event(event);
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
- return 0;
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+ __free_event(event);
+}
+
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
+{
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
+
+ _free_event(event);
}
-EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
{
- struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
- file->private_data = NULL;
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
@@ -2883,7 +3355,37 @@ static int perf_release(struct inode *inode, struct file *file)
put_task_struct(owner);
}
- return perf_event_release_kernel(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ perf_remove_from_context(event, true);
+ mutex_unlock(&ctx->mutex);
+
+ _free_event(event);
+}
+
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3027,30 +3529,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
unsigned int events = POLL_HUP;
/*
- * Race between perf_event_set_output() and perf_poll(): perf_poll()
- * grabs the rb reference but perf_event_set_output() overrides it.
- * Here is the timeline for two threads T1, T2:
- * t0: T1, rb = rcu_dereference(event->rb)
- * t1: T2, old_rb = event->rb
- * t2: T2, event->rb = new rb
- * t3: T2, ring_buffer_detach(old_rb)
- * t4: T1, ring_buffer_attach(rb1)
- * t5: T1, poll_wait(event->waitq)
- *
- * To avoid this problem, we grab mmap_mutex in perf_poll()
- * thereby ensuring that the assignment of the new ring buffer
- * and the detachment of the old buffer appear atomic to perf_poll()
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (rb) {
- ring_buffer_attach(event, rb);
+ rb = event->rb;
+ if (rb)
events = atomic_xchg(&rb->poll, 0);
- }
- rcu_read_unlock();
-
mutex_unlock(&event->mmap_mutex);
poll_wait(file, &event->waitq, wait);
@@ -3095,16 +3580,15 @@ static void perf_event_for_each(struct perf_event *event,
event = event->group_leader;
perf_event_for_each_child(event, func);
- func(event);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
- perf_event_for_each_child(event, func);
+ perf_event_for_each_child(sibling, func);
mutex_unlock(&ctx->mutex);
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
struct perf_event_context *ctx = event->ctx;
- int ret = 0;
+ int ret = 0, active;
u64 value;
if (!is_sampling_event(event))
@@ -3128,6 +3612,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->attr.sample_period = value;
event->hw.sample_period = value;
}
+
+ active = (event->state == PERF_EVENT_STATE_ACTIVE);
+ if (active) {
+ perf_pmu_disable(ctx->pmu);
+ event->pmu->stop(event, PERF_EF_UPDATE);
+ }
+
+ local64_set(&event->hw.period_left, 0);
+
+ if (active) {
+ event->pmu->start(event, PERF_EF_RELOAD);
+ perf_pmu_enable(ctx->pmu);
+ }
+
unlock:
raw_spin_unlock_irq(&ctx->lock);
@@ -3136,21 +3634,18 @@ unlock:
static const struct file_operations perf_fops;
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static inline int perf_fget_light(int fd, struct fd *p)
{
- struct file *file;
-
- file = fget_light(fd, fput_needed);
- if (!file)
- return ERR_PTR(-EBADF);
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
- if (file->f_op != &perf_fops) {
- fput_light(file, *fput_needed);
- *fput_needed = 0;
- return ERR_PTR(-EBADF);
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return -EBADF;
}
-
- return file->private_data;
+ *p = f;
+ return 0;
}
static int perf_event_set_output(struct perf_event *event,
@@ -3180,22 +3675,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
+ case PERF_EVENT_IOC_ID:
+ {
+ u64 id = primary_event_id(event);
+
+ if (copy_to_user((void __user *)arg, &id, sizeof(id)))
+ return -EFAULT;
+ return 0;
+ }
+
case PERF_EVENT_IOC_SET_OUTPUT:
{
- struct perf_event *output_event = NULL;
- int fput_needed = 0;
int ret;
-
if (arg != -1) {
- output_event = perf_fget_light(arg, &fput_needed);
- if (IS_ERR(output_event))
- return PTR_ERR(output_event);
+ struct perf_event *output_event;
+ struct fd output;
+ ret = perf_fget_light(arg, &output);
+ if (ret)
+ return ret;
+ output_event = output.file->private_data;
+ ret = perf_event_set_output(event, output_event);
+ fdput(output);
+ } else {
+ ret = perf_event_set_output(event, NULL);
}
-
- ret = perf_event_set_output(event, output_event);
- if (output_event)
- fput_light(output_event->filp, fput_needed);
-
return ret;
}
@@ -3238,10 +3741,6 @@ int perf_event_task_disable(void)
return 0;
}
-#ifndef PERF_EVENT_INDEX_OFFSET
-# define PERF_EVENT_INDEX_OFFSET 0
-#endif
-
static int perf_event_index(struct perf_event *event)
{
if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3749,46 @@ static int perf_event_index(struct perf_event *event)
if (event->state != PERF_EVENT_STATE_ACTIVE)
return 0;
- return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+ return event->pmu->event_idx(event);
}
static void calc_timer_values(struct perf_event *event,
+ u64 *now,
u64 *enabled,
u64 *running)
{
- u64 now, ctx_time;
+ u64 ctx_time;
- now = perf_clock();
- ctx_time = event->shadow_ctx_time + now;
+ *now = perf_clock();
+ ctx_time = event->shadow_ctx_time + *now;
*enabled = ctx_time - event->tstamp_enabled;
*running = ctx_time - event->tstamp_running;
}
+static void perf_event_init_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ userpg = rb->user_page;
+
+ /* Allow new userspace to detect that bit 0 is deprecated */
+ userpg->cap_bit0_is_deprecated = 1;
+ userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+
+unlock:
+ rcu_read_unlock();
+}
+
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
/*
* Callers need to ensure there can be no nesting of this function, otherwise
* the seqlock logic goes bad. We can not serialize this because the arch
@@ -3274,9 +3798,13 @@ void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
struct ring_buffer *rb;
- u64 enabled, running;
+ u64 enabled, running, now;
rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
@@ -3286,13 +3814,9 @@ void perf_event_update_userpage(struct perf_event *event)
* because of locking issue as we can be called in
* NMI context
*/
- calc_timer_values(event, &enabled, &running);
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
+ calc_timer_values(event, &now, &enabled, &running);
userpg = rb->user_page;
-
/*
* Disable preemption so as to not let the corresponding user-space
* spin too long if we get preempted.
@@ -3302,7 +3826,7 @@ void perf_event_update_userpage(struct perf_event *event)
barrier();
userpg->index = perf_event_index(event);
userpg->offset = perf_event_count(event);
- if (event->state == PERF_EVENT_STATE_ACTIVE)
+ if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
userpg->time_enabled = enabled +
@@ -3311,6 +3835,8 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
+ arch_perf_update_userpage(userpg, now);
+
barrier();
++userpg->lock;
preempt_enable();
@@ -3356,32 +3882,47 @@ unlock:
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
+ struct ring_buffer *old_rb = NULL;
unsigned long flags;
- if (!list_empty(&event->rb_entry))
- return;
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
- spin_lock_irqsave(&rb->event_lock, flags);
- if (!list_empty(&event->rb_entry))
- goto unlock;
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
- list_add(&event->rb_entry, &rb->event_list);
-unlock:
- spin_unlock_irqrestore(&rb->event_lock, flags);
-}
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
-static void ring_buffer_detach(struct perf_event *event,
- struct ring_buffer *rb)
-{
- unsigned long flags;
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
- if (list_empty(&event->rb_entry))
- return;
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
+
+ rcu_assign_pointer(event->rb, rb);
- spin_lock_irqsave(&rb->event_lock, flags);
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
}
static void ring_buffer_wakeup(struct perf_event *event)
@@ -3390,13 +3931,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
- wake_up_all(&event->waitq);
-
-unlock:
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
rcu_read_unlock();
}
@@ -3425,18 +3963,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
- struct perf_event *event, *n;
- unsigned long flags;
-
if (!atomic_dec_and_test(&rb->refcount))
return;
- spin_lock_irqsave(&rb->event_lock, flags);
- list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- }
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3446,26 +3976,95 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
}
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->rb);
- struct user_struct *user = event->mmap_user;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = ring_buffer_get(event);
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ goto out_put;
+
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->pinned_vm -= event->mmap_locked;
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
mutex_unlock(&event->mmap_mutex);
+ put_event(event);
- ring_buffer_put(rb);
- free_uid(user);
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
}
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+out_put:
+ ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3515,12 +4114,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages == nr_pages)
- atomic_inc(&event->rb->refcount);
- else
+ if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
goto unlock;
}
@@ -3561,19 +4172,29 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->rb, rb);
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
atomic_long_add(user_extra, &user->locked_vm);
- event->mmap_locked = extra;
- event->mmap_user = get_current_user();
- vma->vm_mm->pinned_vm += event->mmap_locked;
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
unlock:
if (!ret)
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_RESERVED;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -3581,7 +4202,7 @@ unlock:
static int perf_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
@@ -3660,6 +4281,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask)
+{
+ int bit;
+
+ for_each_set_bit(bit, (const unsigned long *) &mask,
+ sizeof(mask) * BITS_PER_BYTE) {
+ u64 val;
+
+ val = perf_reg_value(regs, bit);
+ perf_output_put(handle, val);
+ }
+}
+
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+ struct pt_regs *regs)
+{
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ regs_user->regs = regs;
+ regs_user->abi = perf_reg_abi(current);
+ }
+}
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisly, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+ unsigned long addr = perf_user_stack_pointer(regs);
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+ struct pt_regs *regs)
+{
+ u64 task_size;
+
+ /* No regs, no stack pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ /*
+ * Check if we fit in with the requested stack size into the:
+ * - TASK_SIZE
+ * If we don't, we limit the size to the TASK_SIZE.
+ *
+ * - remaining sample size
+ * If we don't, we customize the stack size to
+ * fit in to the remaining sample size.
+ */
+
+ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+ stack_size = min(stack_size, (u16) task_size);
+
+ /* Current header size plus static size and dynamic size. */
+ header_size += 2 * sizeof(u64);
+
+ /* Do we fit in with the current stack dump size? */
+ if ((u16) (header_size + stack_size) < header_size) {
+ /*
+ * If we overflow the maximum size for the sample,
+ * we customize the stack dump size to fit in.
+ */
+ stack_size = USHRT_MAX - header_size - sizeof(u64);
+ stack_size = round_up(stack_size, sizeof(u64));
+ }
+
+ return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+ struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned long sp;
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user stack dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ sp = perf_user_stack_pointer(regs);
+ rem = __output_copy_user(handle, (void *) sp, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -3678,7 +4425,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_clock();
- if (sample_type & PERF_SAMPLE_ID)
+ if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -3717,6 +4464,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
}
void perf_event__output_id_sample(struct perf_event *event,
@@ -3782,7 +4532,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;
- if (sub != event)
+ if ((sub != event) &&
+ (sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
values[n++] = perf_event_count(sub);
@@ -3799,7 +4550,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
- u64 enabled = 0, running = 0;
+ u64 enabled = 0, running = 0, now;
u64 read_format = event->attr.read_format;
/*
@@ -3812,7 +4563,7 @@ static void perf_output_read(struct perf_output_handle *handle,
* NMI context
*/
if (read_format & PERF_FORMAT_TOTAL_TIMES)
- calc_timer_values(event, &enabled, &running);
+ calc_timer_values(event, &now, &enabled, &running);
if (event->attr.read_format & PERF_FORMAT_GROUP)
perf_output_read_group(handle, event, enabled, running);
@@ -3829,6 +4580,9 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, *header);
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+
if (sample_type & PERF_SAMPLE_IP)
perf_output_put(handle, data->ip);
@@ -3889,6 +4643,56 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
+ size_t size;
+
+ size = data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+
+ perf_output_put(handle, data->br_stack->nr);
+ perf_output_copy(handle, data->br_stack->entries, size);
+ } else {
+ /*
+ * we always store at least the value of nr
+ */
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ u64 abi = data->regs_user.abi;
+
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_user;
+ perf_output_sample_regs(handle,
+ data->regs_user.regs,
+ mask);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ perf_output_sample_ustack(handle,
+ data->stack_user_size,
+ data->regs_user.regs);
+ }
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -3925,7 +4729,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- data->callchain = perf_callchain(regs);
+ data->callchain = perf_callchain(event, regs);
if (data->callchain)
size += data->callchain->nr;
@@ -3944,6 +4748,58 @@ void perf_prepare_sample(struct perf_event_header *header,
WARN_ON_ONCE(size & (sizeof(u64)-1));
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ int size = sizeof(u64); /* nr */
+ if (data->br_stack) {
+ size += data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+ }
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_user(&data->regs_user, regs);
+
+ if (data->regs_user.regs) {
+ u64 mask = event->attr.sample_regs_user;
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ /*
+ * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * processed as the last one or have additional check added
+ * in case new sample type is added, because we could eat
+ * up the rest of the sample size.
+ */
+ struct perf_regs_user *uregs = &data->regs_user;
+ u16 stack_size = event->attr.sample_stack_user;
+ u16 size = sizeof(u64);
+
+ if (!uregs->abi)
+ perf_sample_regs_user(uregs, regs);
+
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ uregs->regs);
+
+ /*
+ * If there is something to dump, add space for the dump
+ * itself and for the field that tells the dynamic size,
+ * which is how many have been actually dumped.
+ */
+ if (stack_size)
+ size += sizeof(u64) + stack_size;
+
+ data->stack_user_size = stack_size;
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -4009,10 +4865,63 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+ perf_event_aux_output_cb output,
+ void *data)
+{
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
+}
+
+static void
+perf_event_aux(perf_event_aux_output_cb output, void *data,
+ struct perf_event_context *task_ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int ctxn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ goto next;
+ perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ if (task_ctx)
+ goto next;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto next;
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_aux_ctx(ctx, output, data);
+next:
+ put_cpu_ptr(pmu->pmu_cpu_context);
+ }
+
+ if (task_ctx) {
+ preempt_disable();
+ perf_event_aux_ctx(task_ctx, output, data);
+ preempt_enable();
+ }
+ rcu_read_unlock();
+}
+
/*
* task tracking -- fork/exit
*
- * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
+ * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
*/
struct perf_task_event {
@@ -4030,14 +4939,25 @@ struct perf_task_event {
} event_id;
};
+static int perf_event_task_match(struct perf_event *event)
+{
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap2 || event->attr.mmap_data ||
+ event->attr.task;
+}
+
static void perf_event_task_output(struct perf_event *event,
- struct perf_task_event *task_event)
+ void *data)
{
+ struct perf_task_event *task_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
struct task_struct *task = task_event->task;
int ret, size = task_event->event_id.header.size;
+ if (!perf_event_task_match(event))
+ return;
+
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
@@ -4060,61 +4980,6 @@ out:
task_event->event_id.header.size = size;
}
-static int perf_event_task_match(struct perf_event *event)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm || event->attr.mmap ||
- event->attr.mmap_data || event->attr.task)
- return 1;
-
- return 0;
-}
-
-static void perf_event_task_ctx(struct perf_event_context *ctx,
- struct perf_task_event *task_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_task_match(event))
- perf_event_task_output(event, task_event);
- }
-}
-
-static void perf_event_task_event(struct perf_task_event *task_event)
-{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
- struct pmu *pmu;
- int ctxn;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
- goto next;
- perf_event_task_ctx(&cpuctx->ctx, task_event);
-
- ctx = task_event->task_ctx;
- if (!ctx) {
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- }
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
-}
-
static void perf_event_task(struct task_struct *task,
struct perf_event_context *task_ctx,
int new)
@@ -4143,7 +5008,9 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_task_event(&task_event);
+ perf_event_aux(perf_event_task_output,
+ &task_event,
+ task_ctx);
}
void perf_event_fork(struct task_struct *task)
@@ -4168,14 +5035,23 @@ struct perf_comm_event {
} event_id;
};
+static int perf_event_comm_match(struct perf_event *event)
+{
+ return event->attr.comm;
+}
+
static void perf_event_comm_output(struct perf_event *event,
- struct perf_comm_event *comm_event)
+ void *data)
{
+ struct perf_comm_event *comm_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
int ret;
+ if (!perf_event_comm_match(event))
+ return;
+
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
comm_event->event_id.header.size);
@@ -4197,39 +5073,10 @@ out:
comm_event->event_id.header.size = size;
}
-static int perf_event_comm_match(struct perf_event *event)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm)
- return 1;
-
- return 0;
-}
-
-static void perf_event_comm_ctx(struct perf_event_context *ctx,
- struct perf_comm_event *comm_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_comm_match(event))
- perf_event_comm_output(event, comm_event);
- }
-}
-
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
char comm[TASK_COMM_LEN];
unsigned int size;
- struct pmu *pmu;
- int ctxn;
memset(comm, 0, sizeof(comm));
strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4239,39 +5086,15 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
- goto next;
- perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_event_comm_ctx(ctx, comm_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_comm_output,
+ comm_event,
+ NULL);
}
-void perf_event_comm(struct task_struct *task)
+void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
- struct perf_event_context *ctx;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- perf_event_enable_on_exec(ctx);
- }
if (!atomic_read(&nr_comm_events))
return;
@@ -4283,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
- .misc = 0,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
@@ -4303,6 +5126,10 @@ struct perf_mmap_event {
const char *file_name;
int file_size;
+ int maj, min;
+ u64 ino;
+ u64 ino_generation;
+ u32 prot, flags;
struct {
struct perf_event_header header;
@@ -4315,14 +5142,39 @@ struct perf_mmap_event {
} event_id;
};
+static int perf_event_mmap_match(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
+
+ return (!executable && event->attr.mmap_data) ||
+ (executable && (event->attr.mmap || event->attr.mmap2));
+}
+
static void perf_event_mmap_output(struct perf_event *event,
- struct perf_mmap_event *mmap_event)
+ void *data)
{
+ struct perf_mmap_event *mmap_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
int ret;
+ if (!perf_event_mmap_match(event, data))
+ return;
+
+ if (event->attr.mmap2) {
+ mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
+ mmap_event->event_id.header.size += sizeof(mmap_event->maj);
+ mmap_event->event_id.header.size += sizeof(mmap_event->min);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
+ }
+
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
mmap_event->event_id.header.size);
@@ -4333,6 +5185,16 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
+
+ if (event->attr.mmap2) {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
+ }
+
__output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
@@ -4343,119 +5205,116 @@ out:
mmap_event->event_id.header.size = size;
}
-static int perf_event_mmap_match(struct perf_event *event,
- struct perf_mmap_event *mmap_event,
- int executable)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if ((!executable && event->attr.mmap_data) ||
- (executable && event->attr.mmap))
- return 1;
-
- return 0;
-}
-
-static void perf_event_mmap_ctx(struct perf_event_context *ctx,
- struct perf_mmap_event *mmap_event,
- int executable)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_mmap_match(event, mmap_event, executable))
- perf_event_mmap_output(event, mmap_event);
- }
-}
-
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
struct vm_area_struct *vma = mmap_event->vma;
struct file *file = vma->vm_file;
+ int maj = 0, min = 0;
+ u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
- const char *name;
- struct pmu *pmu;
- int ctxn;
-
- memset(tmp, 0, sizeof(tmp));
+ char *name;
if (file) {
+ struct inode *inode;
+ dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
/*
- * d_path works from the end of the rb backwards, so we
+ * d_path() works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
- if (!buf) {
- name = strncpy(tmp, "//enomem", sizeof(tmp));
- goto got_name;
- }
- name = d_path(&file->f_path, buf, PATH_MAX);
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
- name = strncpy(tmp, "//toolong", sizeof(tmp));
- goto got_name;
+ name = "//toolong";
+ goto cpy_name;
}
+ inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ gen = inode->i_generation;
+ maj = MAJOR(dev);
+ min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
+ goto got_name;
} else {
- if (arch_vma_name(mmap_event->vma)) {
- name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp));
- goto got_name;
- }
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
- if (!vma->vm_mm) {
- name = strncpy(tmp, "[vdso]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
- name = strncpy(tmp, "[heap]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
- name = strncpy(tmp, "[stack]", sizeof(tmp));
- goto got_name;
+ name = "[stack]";
+ goto cpy_name;
}
- name = strncpy(tmp, "//anon", sizeof(tmp));
- goto got_name;
+ name = "//anon";
+ goto cpy_name;
}
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
got_name:
- size = ALIGN(strlen(name)+1, sizeof(u64));
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
mmap_event->file_name = name;
mmap_event->file_size = size;
+ mmap_event->maj = maj;
+ mmap_event->min = min;
+ mmap_event->ino = ino;
+ mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
- mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
- goto next;
- perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_event_mmap_ctx(ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
- }
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_mmap_output,
+ mmap_event,
+ NULL);
kfree(buf);
}
@@ -4483,6 +5342,12 @@ void perf_event_mmap(struct vm_area_struct *vma)
.len = vma->vm_end - vma->vm_start,
.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
},
+ /* .maj (attr_mmap2 only) */
+ /* .min (attr_mmap2 only) */
+ /* .ino (attr_mmap2 only) */
+ /* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
@@ -4560,6 +5425,7 @@ static int __perf_event_overflow(struct perf_event *event,
__this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
+ tick_nohz_full_kick();
ret = 1;
}
}
@@ -4618,6 +5484,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -4629,7 +5498,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
* sign as trigger.
*/
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 period = hwc->last_period;
@@ -4798,7 +5667,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct perf_event *event;
- struct hlist_node *node;
struct hlist_head *head;
rcu_read_lock();
@@ -4806,7 +5674,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (!head)
goto end;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
@@ -4839,7 +5707,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
if (rctx < 0)
return;
- perf_sample_data_init(&data, addr);
+ perf_sample_data_init(&data, addr, 0);
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
@@ -4865,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -4923,11 +5797,6 @@ static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
- if (event->cpu != -1) {
- swevent_hlist_put_cpu(event, event->cpu);
- return;
- }
-
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
@@ -4961,9 +5830,6 @@ static int swevent_hlist_get(struct perf_event *event)
int err;
int cpu, failed_cpu;
- if (event->cpu != -1)
- return swevent_hlist_get_cpu(event, event->cpu);
-
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
@@ -4986,7 +5852,7 @@ fail:
return err;
}
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
static void sw_perf_event_destroy(struct perf_event *event)
{
@@ -4994,17 +5860,23 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
- jump_label_dec(&perf_swevent_enabled[event_id]);
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
}
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5896,18 @@ static int perf_swevent_init(struct perf_event *event)
if (err)
return err;
- jump_label_inc(&perf_swevent_enabled[event_id]);
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
}
return 0;
}
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+ return 0;
+}
+
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
@@ -5040,6 +5917,8 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .event_idx = perf_swevent_event_idx,
};
#ifdef CONFIG_EVENT_TRACING
@@ -5073,25 +5952,50 @@ static int perf_tp_event_match(struct perf_event *event,
}
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head, int rctx)
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
{
struct perf_sample_data data;
struct perf_event *event;
- struct hlist_node *node;
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
};
- perf_sample_data_init(&data, addr);
+ perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
+ /*
+ * If we got specified a target task, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -5108,6 +6012,12 @@ static int perf_tp_event_init(struct perf_event *event)
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
err = perf_trace_init(event);
if (err)
return err;
@@ -5126,6 +6036,8 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .event_idx = perf_swevent_event_idx,
};
static inline void perf_tp_register(void)
@@ -5179,7 +6091,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
- perf_sample_data_init(&sample, bp->attr.bp_addr);
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
perf_swevent_event(bp, 1, &sample, regs);
@@ -5205,13 +6117,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event->pmu->read(event);
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
regs = get_irq_regs();
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (perf_event_overflow(event, &data, regs))
+ if (__perf_event_overflow(event, 1, &data, regs))
ret = HRTIMER_NORESTART;
}
@@ -5275,6 +6186,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5331,6 +6243,12 @@ static int cpu_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
perf_swevent_init_hrtimer(event);
return 0;
@@ -5345,6 +6263,8 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
+
+ .event_idx = perf_swevent_event_idx,
};
/*
@@ -5403,6 +6323,12 @@ static int task_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
perf_swevent_init_hrtimer(event);
return 0;
@@ -5417,6 +6343,8 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
+
+ .event_idx = perf_swevent_event_idx,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,11 +6372,16 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
perf_pmu_enable(pmu);
}
+static int perf_event_idx_default(struct perf_event *event)
+{
+ return event->hw.idx + 1;
+}
+
/*
* Ensures all contexts with the same task_ctx_nr have the same
* pmu_cpu_context too.
*/
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
struct pmu *pmu;
@@ -5472,8 +6405,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- if (cpuctx->active_pmu == old_pmu)
- cpuctx->active_pmu = pmu;
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
}
}
@@ -5505,16 +6438,64 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);
+
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
-static struct device_attribute pmu_dev_attrs[] = {
- __ATTR_RO(type),
- __ATTR_NULL,
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;
+
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;
+
+ if (timer < 1)
+ return -EINVAL;
+
+ /* same value, noting to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;
+
+ pmu->hrtimer_interval_ms = timer;
+
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }
+
+ return count;
+}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
- .dev_attrs = pmu_dev_attrs,
+ .dev_groups = pmu_dev_groups,
};
static void pmu_dev_release(struct device *dev)
@@ -5530,6 +6511,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (!pmu->dev)
goto out;
+ pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
ret = dev_set_name(pmu->dev, "%s", pmu->name);
if (ret)
@@ -5553,7 +6535,7 @@ free_dev:
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
int cpu, ret;
@@ -5569,13 +6551,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
pmu->name = name;
if (type < 0) {
- int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
- if (!err)
- goto free_pdc;
-
- err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
- if (err) {
- ret = err;
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
goto free_pdc;
}
}
@@ -5592,6 +6570,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
+ ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
@@ -5605,9 +6584,11 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
- cpuctx->jiffies_interval = 1;
+
+ __perf_cpu_hrtimer_init(cpuctx, cpu);
+
INIT_LIST_HEAD(&cpuctx->rotation_list);
- cpuctx->active_pmu = pmu;
+ cpuctx->unique_pmu = pmu;
}
got_cpu_context:
@@ -5633,6 +6614,9 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->event_idx)
+ pmu->event_idx = perf_event_idx_default;
+
list_add_rcu(&pmu->entry, &pmus);
ret = 0;
unlock:
@@ -5652,6 +6636,7 @@ free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
@@ -5673,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu)
put_device(pmu->dev);
free_pmu_context(pmu);
}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
struct pmu *perf_init_event(struct perf_event *event)
{
@@ -5686,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (ret)
@@ -5694,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event)
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (!ret)
@@ -5711,6 +6705,44 @@ unlock:
return pmu;
}
+static void account_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
+ }
+ if (is_cgroup_event(event))
+ atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void account_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_inc(&perf_sched_events.key);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_inc(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_inc(&nr_comm_events);
+ if (event->attr.task)
+ atomic_inc(&nr_task_events);
+ if (event->attr.freq) {
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_full_kick_all();
+ }
+ if (has_branch_stack(event))
+ static_key_slow_inc(&perf_sched_events.key);
+ if (is_cgroup_event(event))
+ static_key_slow_inc(&perf_sched_events.key);
+
+ account_event_cpu(event, event->cpu);
+}
+
/*
* Allocate and initialize a event structure
*/
@@ -5725,7 +6757,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
- long err;
+ long err = -EINVAL;
if ((unsigned)cpu >= nr_cpu_ids) {
if (!task || cpu != -1)
@@ -5750,12 +6782,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
+ INIT_LIST_HEAD(&event->active_entry);
+ INIT_HLIST_NODE(&event->hlist_entry);
+
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
+ atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
@@ -5764,18 +6800,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->parent = parent_event;
- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
- if (attr->type == PERF_TYPE_BREAKPOINT)
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}
@@ -5788,8 +6827,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
- if (attr->disabled)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event__state_init(event);
pmu = NULL;
@@ -5805,43 +6843,36 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* we currently do not support PERF_FORMAT_GROUP on inherited events
*/
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
- goto done;
+ goto err_ns;
pmu = perf_init_event(event);
-
-done:
- err = 0;
if (!pmu)
- err = -EINVAL;
- else if (IS_ERR(pmu))
+ goto err_ns;
+ else if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
-
- if (err) {
- if (event->ns)
- put_pid_ns(event->ns);
- kfree(event);
- return ERR_PTR(err);
+ goto err_ns;
}
if (!event->parent) {
- if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_sched_events.key);
- if (event->attr.mmap || event->attr.mmap_data)
- atomic_inc(&nr_mmap_events);
- if (event->attr.comm)
- atomic_inc(&nr_comm_events);
- if (event->attr.task)
- atomic_inc(&nr_task_events);
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
- if (err) {
- free_event(event);
- return ERR_PTR(err);
- }
+ if (err)
+ goto err_pmu;
}
}
return event;
+
+err_pmu:
+ if (event->destroy)
+ event->destroy(event);
+ module_put(pmu->module);
+err_ns:
+ if (event->ns)
+ put_pid_ns(event->ns);
+ kfree(event);
+
+ return ERR_PTR(err);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -5908,6 +6939,61 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->read_format & ~(PERF_FORMAT_MAX-1))
return -EINVAL;
+ if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ u64 mask = attr->branch_sample_type;
+
+ /* only using defined bits */
+ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+ return -EINVAL;
+
+ /* at least one branch bit must be set */
+ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+ return -EINVAL;
+
+ /* propagate priv level, when not set for branch */
+ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+ /* exclude_kernel checked on syscall entry */
+ if (!attr->exclude_kernel)
+ mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ if (!attr->exclude_user)
+ mask |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!attr->exclude_hv)
+ mask |= PERF_SAMPLE_BRANCH_HV;
+ /*
+ * adjust user setting (for HW filter setup)
+ */
+ attr->branch_sample_type = mask;
+ }
+ /* privileged levels capture (kernel, hv): check permissions */
+ if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+ ret = perf_reg_validate(attr->sample_regs_user);
+ if (ret)
+ return ret;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+ if (!arch_perf_have_user_stack_dump())
+ return -ENOSYS;
+
+ /*
+ * We have __u32 type for the size, but so far
+ * we can only use __u16 as maximum due to the
+ * __u16 sample size limit.
+ */
+ if (attr->sample_stack_user >= USHRT_MAX)
+ ret = -EINVAL;
+ else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+ ret = -EINVAL;
+ }
+
out:
return ret;
@@ -5920,7 +7006,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL, *old_rb = NULL;
+ struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -5955,16 +7041,12 @@ set:
goto unlock;
}
- old_rb = event->rb;
- rcu_assign_pointer(event->rb, rb);
- if (old_rb)
- ring_buffer_detach(event, old_rb);
+ ring_buffer_attach(event, rb);
+
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_rb)
- ring_buffer_put(old_rb);
out:
return ret;
}
@@ -5986,13 +7068,13 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct file *group_file = NULL;
+ struct fd group = {NULL, 0};
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
int move_group = 0;
- int fput_needed = 0;
int err;
+ int f_flags = O_RDWR;
/* for future expandability... */
if (flags & ~PERF_FLAG_ALL)
@@ -6010,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -6021,17 +7106,18 @@ SYSCALL_DEFINE5(perf_event_open,
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
return -EINVAL;
- event_fd = get_unused_fd_flags(O_RDWR);
+ if (flags & PERF_FLAG_FD_CLOEXEC)
+ f_flags |= O_CLOEXEC;
+
+ event_fd = get_unused_fd_flags(f_flags);
if (event_fd < 0)
return event_fd;
if (group_fd != -1) {
- group_leader = perf_fget_light(group_fd, &fput_needed);
- if (IS_ERR(group_leader)) {
- err = PTR_ERR(group_leader);
+ err = perf_fget_light(group_fd, &group);
+ if (err)
goto err_fd;
- }
- group_file = group_leader->filp;
+ group_leader = group.file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6046,26 +7132,38 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
+ get_online_cpus();
+
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_task;
+ goto err_cpus;
}
if (flags & PERF_FLAG_PID_CGROUP) {
err = perf_cgroup_connect(pid, event, &attr, group_leader);
- if (err)
+ if (err) {
+ __free_event(event);
+ goto err_cpus;
+ }
+ }
+
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
goto err_alloc;
- /*
- * one more event:
- * - that has cgroup constraint on event->cpu
- * - that may need work on context switch
- */
- atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_inc(&perf_sched_events.key);
+ }
}
+ account_event(event);
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -6098,7 +7196,7 @@ SYSCALL_DEFINE5(perf_event_open,
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, cpu);
+ ctx = find_get_context(pmu, task, event->cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_alloc;
@@ -6146,7 +7244,8 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+ f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
goto err_context;
@@ -6156,35 +7255,44 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader, false);
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling);
+ perf_remove_from_context(sibling, false);
+ perf_event__state_init(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
put_ctx(gctx);
}
- event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
if (move_group) {
- perf_install_in_context(ctx, group_leader, cpu);
+ synchronize_rcu();
+ perf_install_in_context(ctx, group_leader, event->cpu);
get_ctx(ctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, cpu);
+ perf_install_in_context(ctx, sibling, event->cpu);
get_ctx(ctx);
}
}
- perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
+ perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
+ put_online_cpus();
+
event->owner = current;
mutex_lock(&current->perf_event_mutex);
@@ -6203,7 +7311,7 @@ SYSCALL_DEFINE5(perf_event_open,
* of the group leader will find the pointer to itself in
* perf_group_detach().
*/
- fput_light(group_file, fput_needed);
+ fdput(group);
fd_install(event_fd, event_file);
return event_fd;
@@ -6212,11 +7320,13 @@ err_context:
put_ctx(ctx);
err_alloc:
free_event(event);
+err_cpus:
+ put_online_cpus();
err_task:
if (task)
put_task_struct(task);
err_group_fd:
- fput_light(group_file, fput_needed);
+ fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -6250,17 +7360,17 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err;
}
+ account_event(event);
+
ctx = find_get_context(event->pmu, task, cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_free;
}
- event->filp = NULL;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -6273,6 +7383,41 @@ err:
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx;
+ struct perf_event_context *dst_ctx;
+ struct perf_event *event, *tmp;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+
+ mutex_lock(&src_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
+ event_entry) {
+ perf_remove_from_context(event, false);
+ unaccount_event_cpu(event, src_cpu);
+ put_ctx(src_ctx);
+ list_add(&event->migrate_entry, &events);
+ }
+ mutex_unlock(&src_ctx->mutex);
+
+ synchronize_rcu();
+
+ mutex_lock(&dst_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+ mutex_unlock(&dst_ctx->mutex);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
+
static void sync_child_event(struct perf_event *child_event,
struct task_struct *child)
{
@@ -6305,7 +7450,7 @@ static void sync_child_event(struct perf_event *child_event,
* Release the parent event, if this was the last
* reference to it.
*/
- fput(parent_event->filp);
+ put_event(parent_event);
}
static void
@@ -6313,13 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- if (child_event->parent) {
- raw_spin_lock_irq(&child_ctx->lock);
- perf_group_detach(child_event);
- raw_spin_unlock_irq(&child_ctx->lock);
- }
-
- perf_remove_from_context(child_event);
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -6334,8 +7485,8 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *tmp;
- struct perf_event_context *child_ctx;
+ struct perf_event *child_event, *next;
+ struct perf_event_context *child_ctx, *parent_ctx;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -6360,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -6370,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
@@ -6381,32 +7548,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*
* __perf_event_exit_task()
* sync_child_event()
- * fput(parent_event->filp)
- * perf_release()
- * mutex_lock(&ctx->mutex)
+ * put_event()
+ * mutex_lock(&ctx->mutex)
*
* But since its the parent context it won't be the same instance.
*/
mutex_lock(&child_ctx->mutex);
-again:
- list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
- group_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
-
- list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
- /*
- * If the last event was a group event, it will have appended all
- * its siblings to the list, but we obtained 'tmp' before that which
- * will still point to the list head terminating the iteration.
- */
- if (!list_empty(&child_ctx->pinned_groups) ||
- !list_empty(&child_ctx->flexible_groups))
- goto again;
-
mutex_unlock(&child_ctx->mutex);
put_ctx(child_ctx);
@@ -6451,7 +7602,7 @@ static void perf_free_event(struct perf_event *event,
list_del_init(&event->child_list);
mutex_unlock(&parent->child_mutex);
- fput(parent->filp);
+ put_event(parent);
perf_group_detach(event);
list_del_event(event, ctx);
@@ -6531,6 +7682,12 @@ inherit_event(struct perf_event *parent_event,
NULL, NULL);
if (IS_ERR(child_event))
return child_event;
+
+ if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
get_ctx(child_ctx);
/*
@@ -6572,14 +7729,6 @@ inherit_event(struct perf_event *parent_event,
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
- * Get a reference to the parent filp - we will fput it
- * when the child event exits. This is safe to do because
- * we are in the parent and we know that the filp still
- * exists and has a nonzero count:
- */
- atomic_long_inc(&parent_event->filp->f_count);
-
- /*
* Link this into the parent event's child list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
@@ -6636,7 +7785,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -6673,6 +7822,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
* swapped under us.
*/
parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
/*
* No need to check if parent_ctx != NULL here; since we saw
@@ -6779,11 +7930,12 @@ static void __init perf_event_init_all_cpus(void)
}
}
-static void __cpuinit perf_event_init_cpu(int cpu)
+static void perf_event_init_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -6806,15 +7958,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
+ struct remove_event re = { .detach_group = false };
struct perf_event_context *ctx = __info;
- struct perf_event *event, *tmp;
perf_pmu_rotate_stop(ctx->pmu);
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_remove_from_context(event);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
- __perf_remove_from_context(event);
+ rcu_read_lock();
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
+ rcu_read_unlock();
}
static void perf_event_exit_cpu_context(int cpu)
@@ -6838,11 +7990,12 @@ static void perf_event_exit_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ perf_event_exit_cpu_context(cpu);
+
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
-
- perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
@@ -6868,7 +8021,7 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int __cpuinit
+static int
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
@@ -6884,7 +8037,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
-
default:
break;
}
@@ -6912,6 +8064,13 @@ void __init perf_event_init(void)
/* do not patch jump label more than once per second */
jump_label_rate_limit(&perf_sched_events, HZ);
+
+ /*
+ * Build time assertion that we keep the data_head at the intended
+ * location. IOW, validation we got the __reserved[] size right.
+ */
+ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+ != 1024);
}
static int __init perf_event_sysfs_init(void)
@@ -6943,8 +8102,8 @@ unlock:
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(
- struct cgroup_subsys *ss, struct cgroup *cont)
+static struct cgroup_subsys_state *
+perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct perf_cgroup *jc;
@@ -6961,12 +8120,10 @@ static struct cgroup_subsys_state *perf_cgroup_create(
return &jc->css;
}
-static void perf_cgroup_destroy(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct perf_cgroup *jc;
- jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
- struct perf_cgroup, css);
+ struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
+
free_percpu(jc->info);
kfree(jc);
}
@@ -6978,17 +8135,18 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void perf_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, tset)
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task)
+static void perf_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -7001,11 +8159,9 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
task_function_call(task, __perf_cgroup_move, task);
}
-struct cgroup_subsys perf_subsys = {
- .name = "perf_event",
- .subsys_id = perf_subsys_id,
- .create = perf_cgroup_create,
- .destroy = perf_cgroup_destroy,
+struct cgroup_subsys perf_event_cgrp_subsys = {
+ .css_alloc = perf_cgroup_css_alloc,
+ .css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
};
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38b..1559fb0b929 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
#include <linux/smp.h>
#include <linux/hw_breakpoint.h>
-
-
/*
* Constraints data
*/
+struct bp_cpuinfo {
+ /* Number of pinned cpu breakpoints in a cpu */
+ unsigned int cpu_pinned;
+ /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+ unsigned int *tsk_pinned;
+ /* Number of non-pinned cpu/task breakpoints in a cpu */
+ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
-/* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
-
-/* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
-
-/* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
-
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
static int nr_slots[TYPE_MAX];
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+ return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
/* Keep track of the breakpoints attached to tasks */
static LIST_HEAD(bp_task_head);
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
int i;
- unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
for (i = nr_slots[type] - 1; i >= 0; i--) {
if (tsk_pinned[i] > 0)
@@ -111,20 +114,29 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
*/
-static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
+static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
struct task_struct *tsk = bp->hw.bp_target;
struct perf_event *iter;
int count = 0;
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
+ if (iter->hw.bp_target == tsk &&
+ find_slot_idx(iter) == type &&
+ (iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
return count;
}
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+ if (bp->cpu >= 0)
+ return cpumask_of(bp->cpu);
+ return cpu_possible_mask;
+}
+
/*
* Report the number of pinned/un-pinned breakpoints we have in
* a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -133,34 +145,23 @@ static void
fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
enum bp_type_idx type)
{
- int cpu = bp->cpu;
- struct task_struct *tsk = bp->hw.bp_target;
-
- if (cpu >= 0) {
- slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
- if (!tsk)
- slots->pinned += max_task_bp_pinned(cpu, type);
- else
- slots->pinned += task_bp_pinned(bp, type);
- slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
-
- return;
- }
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
- for_each_online_cpu(cpu) {
- unsigned int nr;
+ for_each_cpu(cpu, cpumask) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+ int nr;
- nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
- if (!tsk)
+ nr = info->cpu_pinned;
+ if (!bp->hw.bp_target)
nr += max_task_bp_pinned(cpu, type);
else
- nr += task_bp_pinned(bp, type);
+ nr += task_bp_pinned(cpu, bp, type);
if (nr > slots->pinned)
slots->pinned = nr;
- nr = per_cpu(nr_bp_flexible[type], cpu);
-
+ nr = info->flexible;
if (nr > slots->flexible)
slots->flexible = nr;
}
@@ -180,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
/*
* Add a pinned breakpoint for the given task in our constraint table
*/
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
enum bp_type_idx type, int weight)
{
- unsigned int *tsk_pinned;
- int old_count = 0;
- int old_idx = 0;
- int idx = 0;
-
- old_count = task_bp_pinned(bp, type);
- old_idx = old_count - 1;
- idx = old_idx + weight;
-
- /* tsk_pinned[n] is the number of tasks having n breakpoints */
- tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
- if (enable) {
- tsk_pinned[idx]++;
- if (old_count > 0)
- tsk_pinned[old_idx]--;
- } else {
- tsk_pinned[idx]--;
- if (old_count > 0)
- tsk_pinned[old_idx]++;
- }
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int old_idx, new_idx;
+
+ old_idx = task_bp_pinned(cpu, bp, type) - 1;
+ new_idx = old_idx + weight;
+
+ if (old_idx >= 0)
+ tsk_pinned[old_idx]--;
+ if (new_idx >= 0)
+ tsk_pinned[new_idx]++;
}
/*
@@ -212,33 +203,26 @@ static void
toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int weight)
{
- int cpu = bp->cpu;
- struct task_struct *tsk = bp->hw.bp_target;
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
- /* Pinned counter cpu profiling */
- if (!tsk) {
+ if (!enable)
+ weight = -weight;
- if (enable)
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
- else
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+ /* Pinned counter cpu profiling */
+ if (!bp->hw.bp_target) {
+ get_bp_info(bp->cpu, type)->cpu_pinned += weight;
return;
}
/* Pinned counter task profiling */
-
- if (!enable)
- list_del(&bp->hw.bp_list);
-
- if (cpu >= 0) {
- toggle_bp_task_slot(bp, cpu, enable, type, weight);
- } else {
- for_each_online_cpu(cpu)
- toggle_bp_task_slot(bp, cpu, enable, type, weight);
- }
+ for_each_cpu(cpu, cpumask)
+ toggle_bp_task_slot(bp, cpu, type, weight);
if (enable)
list_add_tail(&bp->hw.bp_list, &bp_task_head);
+ else
+ list_del(&bp->hw.bp_list);
}
/*
@@ -259,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to a single cpu, check:
*
- * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
*
* -> If there are already non-pinned counters in this cpu, it means
* there is already a free slot for them.
@@ -270,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to every cpus, check:
*
- * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
*
* -> This is roughly the same, except we check the number of per cpu
* bp for every cpu and we keep the max one. Same for the per tasks
@@ -282,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to a single cpu, check:
*
- * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
*
- * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ * -> Same checks as before. But now the info->flexible, if any, must keep
* one register at least (or they will never be fed).
*
* - If attached to every cpus, check:
*
- * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
+ * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
static int __reserve_bp_slot(struct perf_event *bp)
{
@@ -453,7 +437,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
int old_type = bp->attr.bp_type;
int err = 0;
- perf_event_disable(bp);
+ /*
+ * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+ * will not be possible to raise IPIs that invoke __perf_event_disable.
+ * So call the function directly after making sure we are targeting the
+ * current task.
+ */
+ if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+ __perf_event_disable(bp);
+ else
+ perf_event_disable(bp);
bp->attr.bp_addr = attr->bp_addr;
bp->attr.bp_type = attr->bp_type;
@@ -507,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
void *context)
{
- struct perf_event * __percpu *cpu_events, **pevent, *bp;
- long err;
+ struct perf_event * __percpu *cpu_events, *bp;
+ long err = 0;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -517,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
get_online_cpus();
for_each_online_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
-
- *pevent = bp;
-
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
- goto fail;
+ break;
}
- }
- put_online_cpus();
-
- return cpu_events;
-fail:
- for_each_online_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
- if (IS_ERR(*pevent))
- break;
- unregister_hw_breakpoint(*pevent);
+ per_cpu(*cpu_events, cpu) = bp;
}
put_online_cpus();
- free_percpu(cpu_events);
+ if (likely(!err))
+ return cpu_events;
+
+ unregister_wide_hw_breakpoint(cpu_events);
return (void __percpu __force *)ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -553,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
{
int cpu;
- struct perf_event **pevent;
- for_each_possible_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
- unregister_hw_breakpoint(*pevent);
- }
+ for_each_possible_cpu(cpu)
+ unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
free_percpu(cpu_events);
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -581,6 +562,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
if (bp->attr.type != PERF_TYPE_BREAKPOINT)
return -ENOENT;
+ /*
+ * no branch sampling for breakpoint events
+ */
+ if (has_branch_stack(bp))
+ return -EOPNOTSUPP;
+
err = register_perf_hw_breakpoint(bp);
if (err)
return err;
@@ -595,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
if (!(flags & PERF_EF_START))
bp->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(bp)) {
+ bp->hw.last_period = bp->hw.sample_period;
+ perf_swevent_set_period(bp);
+ }
+
return arch_install_hw_breakpoint(bp);
}
@@ -613,6 +605,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
bp->hw.state = PERF_HES_STOPPED;
}
+static int hw_breakpoint_event_idx(struct perf_event *bp)
+{
+ return 0;
+}
+
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
@@ -622,11 +619,12 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
+
+ .event_idx = hw_breakpoint_event_idx,
};
int __init init_hw_breakpoint(void)
{
- unsigned int **task_bp_pinned;
int cpu, err_cpu;
int i;
@@ -635,10 +633,11 @@ int __init init_hw_breakpoint(void)
for_each_possible_cpu(cpu) {
for (i = 0; i < TYPE_MAX; i++) {
- task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
- *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
- GFP_KERNEL);
- if (!*task_bp_pinned)
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+ GFP_KERNEL);
+ if (!info->tsk_pinned)
goto err_alloc;
}
}
@@ -651,10 +650,10 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ kfree(get_bp_info(err_cpu, i)->tsk_pinned);
if (err_cpu == cpu)
break;
- for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], cpu));
}
return -ENOMEM;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90af..569b218782a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
/* Buffer handling */
@@ -15,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
- int writable; /* are we writable */
+ int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
@@ -30,6 +31,10 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
@@ -76,32 +81,73 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline void
-__output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+{ \
+ unsigned long size, written; \
+ \
+ do { \
+ size = min(handle->size, len); \
+ written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
+ \
+ len -= written; \
+ handle->addr += written; \
+ buf += written; \
+ handle->size -= written; \
+ if (!handle->size) { \
+ struct ring_buffer *rb = handle->rb; \
+ \
+ handle->page++; \
+ handle->page &= rb->nr_pages - 1; \
+ handle->addr = rb->data_pages[handle->page]; \
+ handle->size = PAGE_SIZE << page_order(rb); \
+ } \
+ } while (len && written == size); \
+ \
+ return len; \
+}
+
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
+{
+ memcpy(dst, src, n);
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
{
- do {
- unsigned long size = min_t(unsigned long, handle->size, len);
-
- memcpy(handle->addr, buf, size);
-
- len -= size;
- handle->addr += size;
- buf += size;
- handle->size -= size;
- if (!handle->size) {
- struct ring_buffer *rb = handle->rb;
-
- handle->page++;
- handle->page &= rb->nr_pages - 1;
- handle->addr = rb->data_pages[handle->page];
- handle->size = PAGE_SIZE << page_order(rb);
- }
- } while (len);
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
}
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
@@ -133,4 +179,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
recursion[rctx]--;
}
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7..146a5792b1d 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,28 +12,10 @@
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/circ_buf.h>
#include "internal.h"
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
- unsigned long offset, unsigned long head)
-{
- unsigned long mask;
-
- if (!rb->writable)
- return true;
-
- mask = perf_data_size(rb) - 1;
-
- offset = (offset - tail) & mask;
- head = (head - tail) & mask;
-
- if ((int)(head - offset) < 0)
- return false;
-
- return true;
-}
-
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
@@ -75,15 +57,37 @@ again:
goto out;
/*
- * Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the rb->head read and this
- * write.
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * if (LOAD ->data_tail) { LOAD ->data_head
+ * (A) smp_rmb() (C)
+ * STORE $data LOAD $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head STORE ->data_tail
+ * }
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * In our case (A) is a control dependency that separates the load of
+ * the ->data_tail and the stores of $data. In case ->data_tail
+ * indicates there is no room in the buffer to store $data we do not.
+ *
+ * D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
*/
+ smp_wmb(); /* B, matches C */
rb->user_page->data_head = head;
/*
- * Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read rb->head.
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
@@ -102,8 +106,7 @@ int perf_output_begin(struct perf_output_handle *handle,
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
- int have_lost;
- struct perf_sample_data sample_data;
+ int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
@@ -118,55 +121,72 @@ int perf_output_begin(struct perf_output_handle *handle,
event = event->parent;
rb = rcu_dereference(event->rb);
- if (!rb)
+ if (unlikely(!rb))
goto out;
- handle->rb = rb;
- handle->event = event;
-
- if (!rb->nr_pages)
+ if (unlikely(!rb->nr_pages))
goto out;
+ handle->rb = rb;
+ handle->event = event;
+
have_lost = local_read(&rb->lost);
- if (have_lost) {
- lost_event.header.size = sizeof(lost_event);
- perf_event_header__init_id(&lost_event.header, &sample_data,
- event);
- size += lost_event.header.size;
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
}
perf_output_get_handle(handle);
do {
- /*
- * Userspace could choose to issue a mb() before updating the
- * tail pointer. So that all reads will be completed before the
- * write is issued.
- */
tail = ACCESS_ONCE(rb->user_page->data_tail);
- smp_rmb();
offset = head = local_read(&rb->head);
- head += size;
- if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
+
+ /*
+ * The above forms a control dependency barrier separating the
+ * @tail load above from the data stores below. Since the @tail
+ * load is required to compute the branch to fail below.
+ *
+ * A, matches D; the full memory barrier userspace SHOULD issue
+ * after reading the data and before storing the new tail
+ * position.
+ *
+ * See perf_output_put_handle().
+ */
+
+ head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
- if (head - local_read(&rb->wakeup) > rb->watermark)
+ /*
+ * We rely on the implied barrier() by local_cmpxchg() to ensure
+ * none of the data stores below can be lifted up by the compiler.
+ */
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
- handle->page = offset >> (PAGE_SHIFT + page_order(rb));
- handle->page &= rb->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
- handle->addr = rb->data_pages[handle->page];
- handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+ page_shift = PAGE_SHIFT + page_order(rb);
+
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
- if (have_lost) {
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
@@ -182,10 +202,16 @@ out:
return -ENOSPC;
}
-void perf_output_copy(struct perf_output_handle *handle,
+unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
- __output_copy(handle, buf, len);
+ return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+ unsigned int len)
+{
+ return __output_skip(handle, NULL, len);
}
void perf_output_end(struct perf_output_handle *handle)
@@ -206,7 +232,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
- rb->writable = 1;
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
@@ -306,11 +334,16 @@ void rb_free(struct ring_buffer *rb)
}
#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(rb)))
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -330,10 +363,11 @@ static void rb_free_work(struct work_struct *work)
int i, nr;
rb = container_of(work, struct ring_buffer, work);
- nr = 1 << page_order(rb);
+ nr = data_page_nr(rb);
base = rb->user_page;
- for (i = 0; i < nr + 1; i++)
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
@@ -367,7 +401,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
- rb->nr_pages = 1;
+ rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 00000000000..6f3254e8c13
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1993 @@
+/*
+ * User-space Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2012
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h> /* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rmap.h> /* anon_vma_prepare */
+#include <linux/mmu_notifier.h> /* set_pte_at_notify */
+#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/ptrace.h> /* user_enable_single_step */
+#include <linux/kdebug.h> /* notifier mechanism */
+#include "../../mm/internal.h" /* munlock_vma_page */
+#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/uprobes.h>
+
+#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
+#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
+
+static struct rb_root uprobes_tree = RB_ROOT;
+/*
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
+ */
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
+
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+
+#define UPROBES_HASH_SZ 13
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+static struct percpu_rw_semaphore dup_mmap_sem;
+
+/* Have a copy of original instruction */
+#define UPROBE_COPY_INSN 0
+
+struct uprobe {
+ struct rb_node rb_node; /* node in the rb tree */
+ atomic_t ref;
+ struct rw_semaphore register_rwsem;
+ struct rw_semaphore consumer_rwsem;
+ struct list_head pending_list;
+ struct uprobe_consumer *consumers;
+ struct inode *inode; /* Also hold a ref to inode */
+ loff_t offset;
+ unsigned long flags;
+
+ /*
+ * The generic code assumes that it has two members of unknown type
+ * owned by the arch-specific code:
+ *
+ * insn - copy_insn() saves the original instruction here for
+ * arch_uprobe_analyze_insn().
+ *
+ * ixol - potentially modified instruction to execute out of
+ * line, copied to xol_area by xol_get_insn_slot().
+ */
+ struct arch_uprobe arch;
+};
+
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
+/*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
+ * On a breakpoint hit, thread contests for a slot. It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct page *page;
+
+ /*
+ * We keep the vma's vm_start rather than a pointer to the vma
+ * itself. The probed process or a naughty kernel module could make
+ * the vma go away, and we must handle that reasonably gracefully.
+ */
+ unsigned long vaddr; /* Page(s) of instruction slots */
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ * - is_register: indicates if we are in register context.
+ * - Return 1 if the specified virtual address is in an
+ * executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+
+ if (is_register)
+ flags |= VM_WRITE;
+
+ return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
+}
+
+static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
+{
+ return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+}
+
+static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+{
+ return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma: vma that holds the pte pointing to page
+ * @addr: address the old @page is mapped at
+ * @page: the cowed page we are replacing by kpage
+ * @kpage: the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int err;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = addr;
+ const unsigned long mmun_end = addr + PAGE_SIZE;
+
+ /* For try_to_free_swap() and munlock_vma_page() below */
+ lock_page(page);
+
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ err = -EAGAIN;
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto unlock;
+
+ get_page(kpage);
+ page_add_new_anon_rmap(kpage, vma, addr);
+
+ if (!PageAnon(page)) {
+ dec_mm_counter(mm, MM_FILEPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
+ }
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ pte_unmap_unlock(ptep, ptl);
+
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_page(page);
+ put_page(page);
+
+ err = 0;
+ unlock:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ unlock_page(page);
+ return err;
+}
+
+/**
+ * is_swbp_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_swbp_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
+{
+ return *insn == UPROBE_SWBP_INSN;
+}
+
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ kunmap_atomic(kaddr);
+}
+
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+{
+ uprobe_opcode_t old_opcode;
+ bool is_swbp;
+
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ is_swbp = is_swbp_insn(&old_opcode);
+
+ if (is_swbp_insn(new_opcode)) {
+ if (is_swbp) /* register: already installed? */
+ return 0;
+ } else {
+ if (!is_swbp) /* unregister: was it changed by us? */
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify is_trap_at_addr and
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
+ *
+ * uprobe_write_opcode - write the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held for write.
+ * Return 0 (success) or a negative errno.
+ */
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
+ uprobe_opcode_t opcode)
+{
+ struct page *old_page, *new_page;
+ struct vm_area_struct *vma;
+ int ret;
+
+retry:
+ /* Read the page with vaddr into memory */
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ if (ret <= 0)
+ return ret;
+
+ ret = verify_opcode(old_page, vaddr, &opcode);
+ if (ret <= 0)
+ goto put_old;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
+ ret = -ENOMEM;
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ if (!new_page)
+ goto put_old;
+
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ goto put_new;
+
+ __SetPageUptodate(new_page);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ ret = __replace_page(vma, vaddr, old_page, new_page);
+ if (ret)
+ mem_cgroup_uncharge_page(new_page);
+
+put_new:
+ page_cache_release(new_page);
+put_old:
+ put_page(old_page);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+
+/**
+ * set_swbp - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @auprobe: arch specific probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+ if (l->inode < r->inode)
+ return -1;
+
+ if (l->inode > r->inode)
+ return 1;
+
+ if (l->offset < r->offset)
+ return -1;
+
+ if (l->offset > r->offset)
+ return 1;
+
+ return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe u = { .inode = inode, .offset = offset };
+ struct rb_node *n = uprobes_tree.rb_node;
+ struct uprobe *uprobe;
+ int match;
+
+ while (n) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ match = match_uprobe(&u, uprobe);
+ if (!match) {
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+ }
+
+ if (match < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe;
+
+ spin_lock(&uprobes_treelock);
+ uprobe = __find_uprobe(inode, offset);
+ spin_unlock(&uprobes_treelock);
+
+ return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+ struct rb_node **p = &uprobes_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct uprobe *u;
+ int match;
+
+ while (*p) {
+ parent = *p;
+ u = rb_entry(parent, struct uprobe, rb_node);
+ match = match_uprobe(uprobe, u);
+ if (!match) {
+ atomic_inc(&u->ref);
+ return u;
+ }
+
+ if (match < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ }
+
+ u = NULL;
+ rb_link_node(&uprobe->rb_node, parent, p);
+ rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+ /* get access + creation ref */
+ atomic_set(&uprobe->ref, 2);
+
+ return u;
+}
+
+/*
+ * Acquire uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ * increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ * get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+ struct uprobe *u;
+
+ spin_lock(&uprobes_treelock);
+ u = __insert_uprobe(uprobe);
+ spin_unlock(&uprobes_treelock);
+
+ return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe, *cur_uprobe;
+
+ uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+ if (!uprobe)
+ return NULL;
+
+ uprobe->inode = igrab(inode);
+ uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
+ init_rwsem(&uprobe->consumer_rwsem);
+
+ /* add to uprobes_tree, sorted on inode:offset */
+ cur_uprobe = insert_uprobe(uprobe);
+ /* a uprobe exists for this inode:offset combination */
+ if (cur_uprobe) {
+ kfree(uprobe);
+ uprobe = cur_uprobe;
+ iput(inode);
+ }
+
+ return uprobe;
+}
+
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ down_write(&uprobe->consumer_rwsem);
+ uc->next = uprobe->consumers;
+ uprobe->consumers = uc;
+ up_write(&uprobe->consumer_rwsem);
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
+ * or return false.
+ */
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ struct uprobe_consumer **con;
+ bool ret = false;
+
+ down_write(&uprobe->consumer_rwsem);
+ for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+ if (*con == uc) {
+ *con = uc->next;
+ ret = true;
+ break;
+ }
+ }
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
+{
+ struct page *page;
+ /*
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
+ */
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ copy_from_page(page, offset, insn, nbytes);
+ page_cache_release(page);
+
+ return 0;
+}
+
+static int copy_insn(struct uprobe *uprobe, struct file *filp)
+{
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = &uprobe->arch.insn;
+ int size = sizeof(uprobe->arch.insn);
+ int len, err = -EIO;
+
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
+
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
+ if (err)
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
+}
+
+static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
+ struct mm_struct *mm, unsigned long vaddr)
+{
+ int ret = 0;
+
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ return ret;
+
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ goto out;
+
+ ret = copy_insn(uprobe, file);
+ if (ret)
+ goto out;
+
+ ret = -ENOTSUPP;
+ if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
+ goto out;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
+ if (ret)
+ goto out;
+
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
+ smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ set_bit(UPROBE_COPY_INSN, &uprobe->flags);
+
+ out:
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long vaddr)
+{
+ bool first_uprobe;
+ int ret;
+
+ ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
+ if (ret)
+ return ret;
+
+ /*
+ * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
+ * the task can hit this breakpoint right after __replace_page().
+ */
+ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ if (first_uprobe)
+ set_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ ret = set_swbp(&uprobe->arch, mm, vaddr);
+ if (!ret)
+ clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ else if (first_uprobe)
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ return ret;
+}
+
+static int
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ return set_orig_insn(&uprobe->arch, mm, vaddr);
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+/*
+ * There could be threads that have already hit the breakpoint. They
+ * will recheck the current insn and restart if find_uprobe() fails.
+ * See find_active_uprobe().
+ */
+static void delete_uprobe(struct uprobe *uprobe)
+{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
+ spin_lock(&uprobes_treelock);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
+ iput(uprobe->inode);
+ put_uprobe(uprobe);
+}
+
+struct map_info {
+ struct map_info *next;
+ struct mm_struct *mm;
+ unsigned long vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
+{
+ struct map_info *next = info->next;
+ kfree(info);
+ return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+{
+ unsigned long pgoff = offset >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct map_info *curr = NULL;
+ struct map_info *prev = NULL;
+ struct map_info *info;
+ int more = 0;
+
+ again:
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (!valid_vma(vma, is_register))
+ continue;
+
+ if (!prev && !more) {
+ /*
+ * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * reclaim. This is optimistic, no harm done if it fails.
+ */
+ prev = kmalloc(sizeof(struct map_info),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (prev)
+ prev->next = NULL;
+ }
+ if (!prev) {
+ more++;
+ continue;
+ }
+
+ if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ continue;
+
+ info = prev;
+ prev = prev->next;
+ info->next = curr;
+ curr = info;
+
+ info->mm = vma->vm_mm;
+ info->vaddr = offset_to_vaddr(vma, offset);
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (!more)
+ goto out;
+
+ prev = curr;
+ while (curr) {
+ mmput(curr->mm);
+ curr = curr->next;
+ }
+
+ do {
+ info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+ if (!info) {
+ curr = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ info->next = prev;
+ prev = info;
+ } while (--more);
+
+ goto again;
+ out:
+ while (prev)
+ prev = free_map_info(prev);
+ return curr;
+}
+
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
+{
+ bool is_register = !!new;
+ struct map_info *info;
+ int err = 0;
+
+ percpu_down_write(&dup_mmap_sem);
+ info = build_map_info(uprobe->inode->i_mapping,
+ uprobe->offset, is_register);
+ if (IS_ERR(info)) {
+ err = PTR_ERR(info);
+ goto out;
+ }
+
+ while (info) {
+ struct mm_struct *mm = info->mm;
+ struct vm_area_struct *vma;
+
+ if (err && is_register)
+ goto free;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, info->vaddr);
+ if (!vma || !valid_vma(vma, is_register) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ goto unlock;
+
+ if (vma->vm_start > info->vaddr ||
+ vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
+ goto unlock;
+
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
+
+ unlock:
+ up_write(&mm->mmap_sem);
+ free:
+ mmput(mm);
+ info = free_map_info(info);
+ }
+ out:
+ percpu_up_write(&dup_mmap_sem);
+ return err;
+}
+
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
+}
+
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ int err;
+
+ if (WARN_ON(!consumer_del(uprobe, uc)))
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
+ /* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
+}
+
+/*
+ * uprobe_register - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @uc: information on howto handle the probe..
+ *
+ * Apart from the access refcount, uprobe_register() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple). Creation refcount stops uprobe_unregister from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @uc for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+ int ret;
+
+ /* Uprobe must have at least one set consumer */
+ if (!uc->handler && !uc->ret_handler)
+ return -EINVAL;
+
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
+ /* Racy, just to catch the obvious mistakes */
+ if (offset > i_size_read(inode))
+ return -EINVAL;
+
+ retry:
+ uprobe = alloc_uprobe(inode, offset);
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
+ }
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ return ret;
+}
+
+/*
+ * uprobe_unregister - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return;
+
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
+
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
+ }
+ up_read(&mm->mmap_sem);
+
+ return err;
+}
+
+static struct rb_node *
+find_node_in_range(struct inode *inode, loff_t min, loff_t max)
+{
+ struct rb_node *n = uprobes_tree.rb_node;
+
+ while (n) {
+ struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
+
+ if (inode < u->inode) {
+ n = n->rb_left;
+ } else if (inode > u->inode) {
+ n = n->rb_right;
+ } else {
+ if (max < u->offset)
+ n = n->rb_left;
+ else if (min > u->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
+ }
+
+ return n;
+}
+
+/*
+ * For a given range in vma, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *head)
+{
+ loff_t min, max;
+ struct rb_node *n, *t;
+ struct uprobe *u;
+
+ INIT_LIST_HEAD(head);
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ if (n) {
+ for (t = n; t; t = rb_prev(t)) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset < min)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ for (t = n; (t = rb_next(t)); ) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset > max)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ }
+ spin_unlock(&uprobes_treelock);
+}
+
+/*
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ *
+ * Currently we ignore all errors and always return 0, the callers
+ * can't handle the failure anyway.
+ */
+int uprobe_mmap(struct vm_area_struct *vma)
+{
+ struct list_head tmp_list;
+ struct uprobe *uprobe, *u;
+ struct inode *inode;
+
+ if (no_uprobe_events() || !valid_vma(vma, true))
+ return 0;
+
+ inode = file_inode(vma->vm_file);
+ if (!inode)
+ return 0;
+
+ mutex_lock(uprobes_mmap_hash(inode));
+ build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
+ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
+ install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ }
+ put_uprobe(uprobe);
+ }
+ mutex_unlock(uprobes_mmap_hash(inode));
+
+ return 0;
+}
+
+static bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ loff_t min, max;
+ struct inode *inode;
+ struct rb_node *n;
+
+ inode = file_inode(vma->vm_file);
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ spin_unlock(&uprobes_treelock);
+
+ return !!n;
+}
+
+/*
+ * Called in context of a munmap of a vma.
+ */
+void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ if (no_uprobe_events() || !valid_vma(vma, false))
+ return;
+
+ if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
+ return;
+
+ if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
+ test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ return;
+
+ if (vma_has_uprobes(vma, start, end))
+ set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+}
+
+/* Slot allocation for XOL */
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
+{
+ int ret = -EALREADY;
+
+ down_write(&mm->mmap_sem);
+ if (mm->uprobes_state.xol_area)
+ goto fail;
+
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
+ }
+
+ ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
+ if (ret)
+ goto fail;
+
+ smp_wmb(); /* pairs with get_xol_area() */
+ mm->uprobes_state.xol_area = area;
+ fail:
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+static struct xol_area *__create_xol_area(unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
+
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
+ if (unlikely(!area))
+ goto out;
+
+ area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+ if (!area->bitmap)
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
+
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
+ set_bit(0, area->bitmap);
+ atomic_set(&area->slot_count, 1);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+
+ if (!xol_add_vma(mm, area))
+ return area;
+
+ __free_page(area->page);
+ free_bitmap:
+ kfree(area->bitmap);
+ free_area:
+ kfree(area);
+ out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
+ area = mm->uprobes_state.xol_area;
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
+}
+
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+ struct xol_area *area = mm->uprobes_state.xol_area;
+
+ if (!area)
+ return;
+
+ put_page(area->page);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
+void uprobe_start_dup_mmap(void)
+{
+ percpu_down_read(&dup_mmap_sem);
+}
+
+void uprobe_end_dup_mmap(void)
+{
+ percpu_up_read(&dup_mmap_sem);
+}
+
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ newmm->uprobes_state.xol_area = NULL;
+
+ if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
+ set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
+ set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ }
+}
+
+/*
+ * - search for a free slot.
+ */
+static unsigned long xol_take_insn_slot(struct xol_area *area)
+{
+ unsigned long slot_addr;
+ int slot_nr;
+
+ do {
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ break;
+
+ slot_nr = UINSNS_PER_PAGE;
+ continue;
+ }
+ wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+ } while (slot_nr >= UINSNS_PER_PAGE);
+
+ slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
+ atomic_inc(&area->slot_count);
+
+ return slot_addr;
+}
+
+/*
+ * xol_get_insn_slot - allocate a slot for xol.
+ * Returns the allocated slot address or 0.
+ */
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+{
+ struct xol_area *area;
+ unsigned long xol_vaddr;
+
+ area = get_xol_area();
+ if (!area)
+ return 0;
+
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
+ return 0;
+
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+
+ return xol_vaddr;
+}
+
+/*
+ * xol_free_insn_slot - If slot was earlier allocated by
+ * @xol_get_insn_slot(), make the slot available for
+ * subsequent requests.
+ */
+static void xol_free_insn_slot(struct task_struct *tsk)
+{
+ struct xol_area *area;
+ unsigned long vma_end;
+ unsigned long slot_addr;
+
+ if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ return;
+
+ slot_addr = tsk->utask->xol_vaddr;
+ if (unlikely(!slot_addr))
+ return;
+
+ area = tsk->mm->uprobes_state.xol_area;
+ vma_end = area->vaddr + PAGE_SIZE;
+ if (area->vaddr <= slot_addr && slot_addr < vma_end) {
+ unsigned long offset;
+ int slot_nr;
+
+ offset = slot_addr - area->vaddr;
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ if (slot_nr >= UINSNS_PER_PAGE)
+ return;
+
+ clear_bit(slot_nr, area->bitmap);
+ atomic_dec(&area->slot_count);
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
+
+ tsk->utask->xol_vaddr = 0;
+ }
+}
+
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
+
+ /*
+ * We probably need flush_icache_user_range() but it needs vma.
+ * This should work on most of architectures by default. If
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+ return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+ struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
+
+ if (!utask)
+ return;
+
+ if (utask->active_uprobe)
+ put_uprobe(utask->active_uprobe);
+
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
+ xol_free_insn_slot(t);
+ kfree(utask);
+ t->utask = NULL;
+}
+
+/*
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *get_utask(void)
+{
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
+}
+
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->dup_xol_addr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ t->utask->dup_xol_addr = area->vaddr;
+ init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+ task_work_add(t, &t->utask->dup_xol_work, true);
+}
+
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
+ struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
+
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
+
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep trampoline address in stack, rather keep the
+ * original return address of first caller thru all the consequent
+ * instances. This also makes breakpoint unwrapping easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
+
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep. When xol insn itself
+ * triggers the signal, restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip). This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+ struct task_struct *t = current;
+ struct uprobe_task *utask = t->utask;
+
+ if (likely(!utask || !utask->active_uprobe))
+ return false;
+
+ WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+ if (signal_pending(t)) {
+ spin_lock_irq(&t->sighand->siglock);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ spin_unlock_irq(&t->sighand->siglock);
+
+ if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+ utask->state = UTASK_SSTEP_TRAPPED;
+ set_tsk_thread_flag(t, TIF_UPROBE);
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ }
+ }
+
+ return true;
+}
+
+static void mmf_recalc_uprobes(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!valid_vma(vma, false))
+ continue;
+ /*
+ * This is not strictly accurate, we can race with
+ * uprobe_unregister() and see the already removed
+ * uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
+ */
+ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
+ return;
+ }
+
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+}
+
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct page *page;
+ uprobe_opcode_t opcode;
+ int result;
+
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+
+ result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ if (result < 0)
+ return result;
+
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ put_page(page);
+ out:
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
+}
+
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, bp_vaddr);
+ if (vma && vma->vm_start <= bp_vaddr) {
+ if (valid_vma(vma, false)) {
+ struct inode *inode = file_inode(vma->vm_file);
+ loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+
+ uprobe = find_uprobe(inode, offset);
+ }
+
+ if (!uprobe)
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
+ } else {
+ *is_swbp = -EFAULT;
+ }
+
+ if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ mmf_recalc_uprobes(mm);
+ up_read(&mm->mmap_sem);
+
+ return uprobe;
+}
+
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
+
+ remove &= rc;
+ }
+
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instance's invalidated by
+ * longjmp(), currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+ utask->depth--;
+
+ if (!chained)
+ break;
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ unsigned long bp_vaddr;
+ int uninitialized_var(is_swbp);
+
+ bp_vaddr = uprobe_get_swbp_addr(regs);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (!uprobe) {
+ if (is_swbp > 0) {
+ /* No matching uprobe; signal SIGTRAP. */
+ send_sig(SIGTRAP, current, 0);
+ } else {
+ /*
+ * Either we raced with uprobe_unregister() or we can't
+ * access this memory. The latter is only possible if
+ * another thread plays with our ->mm. In both cases
+ * we can simply restart. If this vma was unmapped we
+ * can pretend this insn was not executed yet and get
+ * the (correct) SIGSEGV after restart.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+ }
+ return;
+ }
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
+ /*
+ * TODO: move copy_insn/etc into _register and remove this hack.
+ * After we hit the bp, _unregister + _register can install the
+ * new and not-yet-analyzed uprobe at the same address, restart.
+ */
+ smp_rmb(); /* pairs with wmb() in install_breakpoint() */
+ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
+ goto out;
+
+ /* Tracing handlers use ->utask to communicate with fetch methods */
+ if (!get_utask())
+ goto out;
+
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ goto out;
+
+ handler_chain(uprobe, regs);
+
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ goto out;
+
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
+ return;
+
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+out:
+ put_uprobe(uprobe);
+}
+
+/*
+ * Perform required fix-ups and disable singlestep.
+ * Allow pending signals to take effect.
+ */
+static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ int err = 0;
+
+ uprobe = utask->active_uprobe;
+ if (utask->state == UTASK_SSTEP_ACK)
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
+ else if (utask->state == UTASK_SSTEP_TRAPPED)
+ arch_uprobe_abort_xol(&uprobe->arch, regs);
+ else
+ WARN_ON_ONCE(1);
+
+ put_uprobe(uprobe);
+ utask->active_uprobe = NULL;
+ utask->state = UTASK_RUNNING;
+ xol_free_insn_slot(current);
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* see uprobe_deny_signal() */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
+}
+
+/*
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
+ * allows the thread to return from interrupt. After that handle_swbp()
+ * sets utask->active_uprobe.
+ *
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
+ * and allows the thread to return from interrupt.
+ *
+ * While returning to userspace, thread notices the TIF_UPROBE flag and calls
+ * uprobe_notify_resume().
+ */
+void uprobe_notify_resume(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+
+ clear_thread_flag(TIF_UPROBE);
+
+ utask = current->utask;
+ if (utask && utask->active_uprobe)
+ handle_singlestep(utask, regs);
+ else
+ handle_swbp(regs);
+}
+
+/*
+ * uprobe_pre_sstep_notifier gets called from interrupt context as part of
+ * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
+ */
+int uprobe_pre_sstep_notifier(struct pt_regs *regs)
+{
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
+ return 0;
+
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+/*
+ * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
+ * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */
+int uprobe_post_sstep_notifier(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (!current->mm || !utask || !utask->active_uprobe)
+ /* task is currently not uprobed */
+ return 0;
+
+ utask->state = UTASK_SSTEP_ACK;
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+static struct notifier_block uprobe_exception_nb = {
+ .notifier_call = arch_uprobe_exception_notify,
+ .priority = INT_MAX-1, /* notified after kprobes, kgdb */
+};
+
+static int __init init_uprobes(void)
+{
+ int i;
+
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
+ mutex_init(&uprobes_mmap_mutex[i]);
+
+ if (percpu_init_rwsem(&dup_mmap_sem))
+ return -ENOMEM;
+
+ return register_die_notifier(&uprobe_exception_nb);
+}
+__initcall(init_uprobes);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae37422..83d4382f569 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
- .pers_low = 0, /* PER_LINUX personality. */
+ .pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
- return (ep);
+ return ep;
}
int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
out:
write_unlock(&exec_domains_lock);
- return (err);
+ return err;
}
+EXPORT_SYMBOL(register_exec_domain);
int
unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
write_unlock(&exec_domains_lock);
return 0;
}
+EXPORT_SYMBOL(unregister_exec_domain);
int __set_personality(unsigned int personality)
{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
return 0;
}
+EXPORT_SYMBOL(__set_personality);
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
return old;
}
-
-
-EXPORT_SYMBOL(register_exec_domain);
-EXPORT_SYMBOL(unregister_exec_domain);
-EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6..e5c4668f179 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -52,6 +52,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
+#include <linux/shm.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -73,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
__this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
+ list_del_rcu(&p->thread_node);
}
/*
@@ -84,6 +86,7 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -122,9 +125,10 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime += tsk->utime;
- sig->stime += tsk->stime;
- sig->gtime += tsk->gtime;
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -309,244 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
- write_lock_irq(&tasklist_lock);
-
- ptrace_unlink(current);
- /* Reparent to init */
- current->real_parent = current->parent = kthreadd_task;
- list_move_tail(&current->sibling, &current->real_parent->children);
-
- /* Set the exit signal to SIGCHLD so we signal init on exit */
- current->exit_signal = SIGCHLD;
-
- if (task_nice(current) < 0)
- set_user_nice(current, 0);
- /* cpus_allowed? */
- /* rt_priority? */
- /* signals? */
- memcpy(current->signal->rlim, init_task.signal->rlim,
- sizeof(current->signal->rlim));
-
- atomic_inc(&init_cred.usage);
- commit_creds(&init_cred);
- write_unlock_irq(&tasklist_lock);
-}
-
-void __set_special_pids(struct pid *pid)
-{
- struct task_struct *curr = current->group_leader;
-
- if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
-
- if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
-}
-
-static void set_special_pids(struct pid *pid)
-{
- write_lock_irq(&tasklist_lock);
- __set_special_pids(pid);
- write_unlock_irq(&tasklist_lock);
-}
-
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- /* This is only needed for daemonize()'ed kthreads */
- sigdelset(&current->blocked, sig);
- /*
- * Kernel threads handle their own signals. Let the signal code
- * know it'll be handled, so that they don't get converted to
- * SIGKILL or just silently dropped.
- */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
-/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
- */
-
-void daemonize(const char *name, ...)
-{
- va_list args;
- sigset_t blocked;
-
- va_start(args, name);
- vsnprintf(current->comm, sizeof(current->comm), name, args);
- va_end(args);
-
- /*
- * If we were started as result of loading a module, close all of the
- * user space pages. We don't need them, and if we didn't close them
- * they would be locked into memory.
- */
- exit_mm(current);
- /*
- * We don't want to have TIF_FREEZE set if the system-wide hibernation
- * or suspend transition begins right now.
- */
- current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
- if (current->nsproxy != &init_nsproxy) {
- get_nsproxy(&init_nsproxy);
- switch_task_namespaces(current, &init_nsproxy);
- }
- set_special_pids(&init_struct_pid);
- proc_clear_tty(current);
-
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* Become as one with the init task */
-
- daemonize_fs_struct();
- exit_files(current);
- current->files = init_task.files;
- atomic_inc(&current->files->count);
-
- reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
-static void close_files(struct files_struct * files)
-{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
-
- /*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock because this is the last reference to the
- * files structure. But use RCU to shut RCU-lockdep up.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- rcu_read_unlock();
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file) {
- filp_close(file, files);
- cond_resched();
- }
- }
- i++;
- set >>= 1;
- }
- }
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
- struct fdtable *fdt;
-
- if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /*
- * Free the fd and fdset arrays if we expanded them.
- * If the fdtable was embedded, pass files for freeing
- * at the end of the RCU grace period. Otherwise,
- * you can free files immediately.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- if (fdt != &files->fdtab)
- kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
- rcu_read_unlock();
- }
-}
-
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
- struct files_struct * files = tsk->files;
-
- if (files) {
- task_lock(tsk);
- tsk->files = NULL;
- task_unlock(tsk);
- put_files_struct(files);
- }
-}
-
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
@@ -589,14 +356,18 @@ retry:
}
/*
- * Search through everything else. We should not get
- * here often
+ * Search through everything else, we should not get here often.
*/
- do_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- } while_each_thread(g, c);
-
+ for_each_process(g) {
+ if (g->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ if (c->mm)
+ break;
+ }
+ }
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
@@ -628,7 +399,7 @@ assign_new_owner:
task_unlock(c);
put_task_struct(c);
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
@@ -642,6 +413,7 @@ static void exit_mm(struct task_struct * tsk)
mm_release(tsk, mm);
if (!mm)
return;
+ sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
@@ -668,7 +440,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
@@ -686,11 +458,11 @@ static void exit_mm(struct task_struct * tsk)
}
/*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ * child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
__releases(&tasklist_lock)
@@ -710,17 +482,37 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
if (unlikely(pid_ns->child_reaper == father)) {
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns))
- panic("Attempted to kill init!");
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?:
+ father->exit_code);
+ }
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
+ } else if (father->signal->has_child_subreaper) {
+ struct task_struct *reaper;
+
/*
- * We can not clear ->child_reaper or leave it alone.
- * There may by stealth EXIT_DEAD tasks on ->children,
- * forget_original_parent() must move them somewhere.
+ * Find the first ancestor marked as child_subreaper.
+ * Note that the code below checks same_thread_group(reaper,
+ * pid_ns->child_reaper). This is what we need to DTRT in a
+ * PID namespace. However we still need the check above, see
+ * http://marc.info/?l=linux-kernel&m=131385460420380
*/
- pid_ns->child_reaper = init_pid_ns.child_reaper;
+ for (reaper = father->real_parent;
+ reaper != &init_task;
+ reaper = reaper->real_parent) {
+ if (same_thread_group(reaper, pid_ns->child_reaper))
+ break;
+ if (!reaper->signal->is_child_subreaper)
+ continue;
+ thread = reaper;
+ do {
+ if (!(thread->flags & PF_EXITING))
+ return reaper;
+ } while_each_thread(reaper, thread);
+ }
}
return pid_ns->child_reaper;
@@ -743,7 +535,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (same_thread_group(p->real_parent, father))
return;
- /* We don't want people slaying init. */
+ /* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
@@ -812,31 +604,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
forget_original_parent(tsk);
- exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
- /* Let father know we died
- *
- * Thread signals are configurable, but you aren't going to use
- * that to send signals to arbitrary processes.
- * That stops right now.
- *
- * If the parent exec id doesn't match the exec id we saved
- * when we started then we know the parent has changed security
- * domain.
- *
- * If our self_exec id doesn't match our parent_exec_id then
- * we have changed execution domain as these two values started
- * the same after a fork.
- */
- if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
- (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id))
- tsk->exit_signal = SIGCHLD;
-
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -876,9 +648,9 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
- "left\n",
- current->comm, free);
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -935,8 +707,6 @@ void do_exit(long code)
schedule();
}
- exit_irq_thread();
-
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
@@ -953,7 +723,7 @@ void do_exit(long code)
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
- sync_mm_rss(tsk, tsk->mm);
+ sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
@@ -979,7 +749,10 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
- check_stack_usage();
+ if (group_dead)
+ disassociate_ctty(1);
+ exit_task_namespaces(tsk);
+ exit_task_work(tsk);
exit_thread();
/*
@@ -990,21 +763,17 @@ void do_exit(long code)
*/
perf_event_exit_task(tsk);
- cgroup_exit(tsk, 1);
-
- if (group_dead)
- disassociate_ctty(1);
+ cgroup_exit(tsk);
module_put(task_thread_info(tsk)->exec_domain->module);
- proc_exit_connector(tsk);
-
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
- ptrace_put_breakpoints(tsk);
+ flush_ptrace_hw_breakpoint(tsk);
exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
@@ -1018,7 +787,7 @@ void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held(tsk);
+ debug_check_no_locks_held();
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
@@ -1030,10 +799,14 @@ void do_exit(long code)
exit_io_context(tsk);
if (tsk->splice_pipe)
- __free_pipe_info(tsk->splice_pipe);
+ free_pipe_info(tsk->splice_pipe);
+
+ if (tsk->task_frag.page)
+ put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
@@ -1206,7 +979,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
- uid_t uid = __task_cred(p)->uid;
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
if (!likely(wo->wo_flags & WEXITED))
@@ -1228,17 +1001,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
+ traced = ptrace_reparented(p);
/*
- * Try to move the task's state to DEAD
- * only one thread is allowed to do this:
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = xchg(&p->exit_state, EXIT_DEAD);
- if (state != EXIT_ZOMBIE) {
- BUG_ON(state != EXIT_DEAD);
+ state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
- }
-
- traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
* thread_group_leader() to filter out sub-threads.
@@ -1264,17 +1033,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_times() to get times for the thread
+ * We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
- psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
@@ -1299,7 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to EXIT_DEAD.
+ * thread can reap it because we its state == DEAD/TRACE.
*/
read_unlock(&tasklist_lock);
@@ -1336,22 +1105,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (!retval)
retval = pid;
- if (traced) {
+ if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
- /*
- * If this is not a sub-thread, notify the parent.
- * If parent wants a zombie, don't release it now.
- */
- if (thread_group_leader(p) &&
- !do_notify_parent(p, p->exit_signal)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
- if (p != NULL)
+ if (state == EXIT_DEAD)
release_task(p);
return retval;
@@ -1419,7 +1185,7 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
@@ -1492,7 +1258,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
@@ -1528,7 +1294,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
- int ret = eligible_child(wo, p);
+ int ret;
+
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1546,33 +1317,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- /* dead body doesn't have much to contribute */
- if (unlikely(p->exit_state == EXIT_DEAD)) {
+ if (unlikely(p->exit_state == EXIT_TRACE)) {
/*
- * But do not ignore this task until the tracer does
- * wait_task_zombie()->do_notify_parent().
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
*/
- if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+ if (likely(!ptrace))
wo->notask_error = 0;
return 0;
}
- /* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
- * A zombie ptracee is only visible to its ptracer.
- * Notification and reaping will be cascaded to the real
- * parent when the ptracer detaches.
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
*/
- if (likely(!ptrace) && unlikely(p->ptrace)) {
- /* it will become visible, clear notask_error */
- wo->notask_error = 0;
- return 0;
- }
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
- if (!delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
/*
* Allow access to stopped/continued state via zombie by
@@ -1598,19 +1380,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
wo->notask_error = 0;
} else {
/*
- * If @p is ptraced by a task in its real parent's group,
- * hide group stop/continued state when looking at @p as
- * the real parent; otherwise, a single stop can be
- * reported twice as group and ptrace stops.
- *
- * If a ptracer wants to distinguish the two events for its
- * own children, it should create a separate process which
- * takes the role of real parent.
- */
- if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
- return 0;
-
- /*
* @p is alive and it's gonna stop, continue or exit, so
* there always is something to wait for.
*/
@@ -1809,9 +1578,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1849,8 +1615,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b824..d8a6446adbc 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
+/* Cleared by build time tools if the table is already sorted. */
+u32 __initdata __visible main_extable_sort_needed = 1;
+
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- sort_extable(__start___ex_table, __stop___ex_table);
+ if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ pr_notice("Sorting __ex_table...\n");
+ sort_extable(__start___ex_table, __stop___ex_table);
+ }
}
/* Given an address, look for it in the exception tables. */
@@ -55,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
- addr <= (unsigned long)_einittext)
+ addr < (unsigned long)_einittext)
return 1;
return 0;
}
@@ -63,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
- addr <= (unsigned long)_etext)
+ addr < (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
diff --git a/kernel/fork.c b/kernel/fork.c
index b77fd559c78..6a13c46cd87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,12 +28,15 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -47,6 +50,7 @@
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
@@ -66,6 +70,10 @@
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
+#include <linux/signalfd.h>
+#include <linux/uprobes.h>
+#include <linux/aio.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -110,32 +118,69 @@ int nr_processes(void)
return total;
}
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct_node(node) \
- kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
-# define free_task_struct(tsk) \
- kmem_cache_free(task_struct_cachep, (tsk))
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+ kmem_cache_free(task_struct_cachep, tsk);
+}
#endif
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
+
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
-#ifdef CONFIG_DEBUG_STACK_USAGE
- gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
- gfp_t mask = GFP_KERNEL;
-#endif
- struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+ kmem_cache_free(thread_info_cache, ti);
}
+
+void thread_info_cache_init(void)
+{
+ thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ THREAD_SIZE, 0, NULL);
+ BUG_ON(thread_info_cache == NULL);
+}
+# endif
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -166,9 +211,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
void free_task(struct task_struct *tsk)
{
account_kernel_stack(tsk->stack, -1);
+ arch_release_thread_info(tsk->stack);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
+ arch_release_task_struct(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -192,6 +240,8 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
+ security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -201,17 +251,11 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
void __init fork_init(unsigned long mempages)
{
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
@@ -243,7 +287,7 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC];
}
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
@@ -258,21 +302,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
int node = tsk_fork_get_node(orig);
int err;
- prepare_to_copy(orig);
-
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
ti = alloc_thread_info_node(tsk, node);
- if (!ti) {
- free_task_struct(tsk);
- return NULL;
- }
+ if (!ti)
+ goto free_tsk;
err = arch_dup_task_struct(tsk, orig);
if (err)
- goto out;
+ goto free_ti;
tsk->stack = ti;
@@ -295,13 +335,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
+ tsk->task_frag.page = NULL;
account_kernel_stack(ti, 1);
return tsk;
-out:
+free_ti:
free_thread_info(ti);
+free_tsk:
free_task_struct(tsk);
return NULL;
}
@@ -313,10 +355,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
- struct mempolicy *pol;
+ uprobe_start_dup_mmap();
down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
/*
* Not linked in yet - no deadlock potential:
*/
@@ -324,9 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_cache = NULL;
- mm->free_area_cache = oldmm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ mm->vmacache_seqnum = 0;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -345,16 +386,15 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- long pages = vma_pages(mpnt);
- mm->total_vm -= pages;
vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -pages);
+ -vma_pages(mpnt));
continue;
}
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
- if (security_vm_enough_memory(len))
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
}
@@ -363,11 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto fail_nomem;
*tmp = *mpnt;
INIT_LIST_HEAD(&tmp->anon_vma_chain);
- pol = mpol_dup(vma_policy(mpnt));
- retval = PTR_ERR(pol);
- if (IS_ERR(pol))
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
goto fail_nomem_policy;
- vma_set_policy(tmp, pol);
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
@@ -375,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
@@ -386,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mapping->i_mmap_writable++;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- vma_prio_tree_add(tmp, mpnt);
+ if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+ vma_nonlinear_insert(tmp,
+ &mapping->i_mmap_nonlinear);
+ else
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -427,9 +470,10 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ uprobe_end_dup_mmap();
return retval;
fail_nomem_anon_vma_fork:
- mpol_put(pol);
+ mpol_put(vma_policy(tmp));
fail_nomem_policy:
kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
@@ -479,7 +523,7 @@ static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
- INIT_HLIST_HEAD(&mm->ioctx_list);
+ mm->ioctx_table = NULL;
#endif
}
@@ -489,19 +533,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
- mm->flags = (current->mm) ?
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
- mm->nr_ptes = 0;
+ atomic_long_set(&mm->nr_ptes, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
+ clear_tlb_flush_pending(mm);
- if (likely(!mm_alloc_pgd(mm))) {
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
mm->def_flags = 0;
+ }
+
+ if (likely(!mm_alloc_pgd(mm))) {
mmu_notifier_mm_init(mm);
return mm;
}
@@ -510,6 +558,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
return NULL;
}
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+}
+
/*
* Allocate and initialize an mm_struct.
*/
@@ -537,9 +602,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON(mm->pmd_huge_pte);
-#endif
+ check_mm(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -552,6 +615,7 @@ void mmput(struct mm_struct *mm)
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) {
+ uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
@@ -562,7 +626,6 @@ void mmput(struct mm_struct *mm)
list_del(&mm->mmlist);
spin_unlock(&mmlist_lock);
}
- put_swap_token(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
@@ -570,26 +633,6 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
- fput(mm->exe_file);
- mm->exe_file = NULL;
- }
-
-}
-
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
if (new_exe_file)
@@ -597,15 +640,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
if (mm->exe_file)
fput(mm->exe_file);
mm->exe_file = new_exe_file;
- mm->num_exe_file_vmas = 0;
}
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of
- * VM_EXECUTABLE vmas */
+ /* We need mmap_sem to protect against races with removal of exe_file */
down_read(&mm->mmap_sem);
exe_file = mm->exe_file;
if (exe_file)
@@ -667,6 +708,38 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
return mm;
}
+static void complete_vfork_done(struct task_struct *tsk)
+{
+ struct completion *vfork;
+
+ task_lock(tsk);
+ vfork = tsk->vfork_done;
+ if (likely(vfork)) {
+ tsk->vfork_done = NULL;
+ complete(vfork);
+ }
+ task_unlock(tsk);
+}
+
+static int wait_for_vfork_done(struct task_struct *child,
+ struct completion *vfork)
+{
+ int killed;
+
+ freezer_do_not_count();
+ killed = wait_for_completion_killable(vfork);
+ freezer_count();
+
+ if (killed) {
+ task_lock(child);
+ child->vfork_done = NULL;
+ task_unlock(child);
+ }
+
+ put_task_struct(child);
+ return killed;
+}
+
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
@@ -682,8 +755,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
*/
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- struct completion *vfork_done = tsk->vfork_done;
-
/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
if (unlikely(tsk->robust_list)) {
@@ -700,20 +771,17 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
exit_pi_state_list(tsk);
#endif
+ uprobe_free_utask(tsk);
+
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
- /* notify parent sleeping on vfork() */
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
-
/*
* If we're exiting normally, clear a user-space tid field if
* requested. We leave this alone when dying by signal, to leave
* the value intact in a core dump, and to save the unnecessary
- * trouble otherwise. Userland only wants this done for a sys_exit.
+ * trouble, say, a killed vfork parent shouldn't touch this mm.
+ * Userland only wants this done for a sys_exit.
*/
if (tsk->clear_child_tid) {
if (!(tsk->flags & PF_SIGNALED) &&
@@ -728,20 +796,24 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
}
tsk->clear_child_tid = NULL;
}
+
+ /*
+ * All done, finally we can wake up parent and return this mm to him.
+ * Also kthread_stop() uses this completion for synchronization.
+ */
+ if (tsk->vfork_done)
+ complete_vfork_done(tsk);
}
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
- if (!oldmm)
- return NULL;
-
mm = allocate_mm();
if (!mm)
goto fail_nomem;
@@ -749,14 +821,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
mm_init_cpumask(mm);
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
-
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -818,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
@@ -830,10 +900,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
goto fail_nomem;
good_mm:
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
@@ -901,9 +967,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
* Share io context with parent, if CLONE_IO is set
*/
if (clone_flags & CLONE_IO) {
- tsk->io_context = ioc_task_link(ioc);
- if (unlikely(!tsk->io_context))
- return -ENOMEM;
+ ioc_task_link(ioc);
+ tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
if (unlikely(!new_ioc))
@@ -935,8 +1000,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count))
+ if (atomic_dec_and_test(&sighand->count)) {
+ signalfd_cleanup(sighand);
kmem_cache_free(sighand_cachep, sighand);
+ }
}
@@ -977,9 +1044,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
+
+ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
init_waitqueue_head(&sig->wait_chldexit);
- if (clone_flags & CLONE_NEWPID)
- sig->flags |= SIGNAL_UNKILLABLE;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
@@ -1000,25 +1070,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_rwsem(&sig->group_rwsem);
#endif
- sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+ sig->has_child_subreaper = current->signal->has_child_subreaper ||
+ current->signal->is_child_subreaper;
+
mutex_init(&sig->cred_guard_mutex);
return 0;
}
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
-{
- unsigned long new_flags = p->flags;
-
- new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
- new_flags |= PF_FORKNOEXEC;
- new_flags |= PF_STARTING;
- p->flags = new_flags;
-}
-
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@ -1030,17 +1092,19 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- plist_head_init(&p->pi_waiters);
+ p->pi_waiters = RB_ROOT;
+ p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
+ p->pi_top_task = NULL;
#endif
}
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
mm->owner = p;
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Initialize POSIX timer handling for a single task.
@@ -1055,6 +1119,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}
+static inline void
+init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
+{
+ task->pids[type].pid = pid;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1065,7 +1135,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
*/
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
@@ -1073,11 +1142,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1103,6 +1174,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
+ /*
+ * If the new process will be in a different pid or user namespace
+ * do not allow it to share a thread group or signal handlers or
+ * parent with the forking task.
+ */
+ if (clone_flags & CLONE_SIGHAND) {
+ if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
+ (task_active_pid_ns(current) !=
+ current->nsproxy->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
@@ -1113,6 +1196,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1123,8 +1207,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
- if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- p->real_cred->user != INIT_USER)
+ if (p->real_cred->user != INIT_USER &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
@@ -1145,9 +1229,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- copy_flags(clone_flags, p);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
@@ -1158,9 +1242,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = p->prev_stime = 0;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
@@ -1185,21 +1275,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_cgroup;
+ goto bad_fork_cleanup_threadgroup_lock;
}
- mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+ seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- p->hardirqs_enabled = 1;
-#else
p->hardirqs_enabled = 0;
-#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
@@ -1221,13 +1307,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(p);
+ retval = sched_fork(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
@@ -1260,22 +1352,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+ retval = copy_thread(clone_flags, stack_start, stack_size, p);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (!pid)
goto bad_fork_cleanup_io;
}
- p->pid = pid_nr(pid);
- p->tgid = p->pid;
- if (clone_flags & CLONE_THREAD)
- p->tgid = current->tgid;
-
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
@@ -1310,28 +1397,32 @@ static struct task_struct *copy_process(unsigned long clone_flags,
clear_all_latency_tracing(p);
/* ok, now we should be set up.. */
- p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
- p->pdeath_signal = 0;
- p->exit_state = 0;
+ p->pid = pid_nr(pid);
+ if (clone_flags & CLONE_THREAD) {
+ p->exit_signal = -1;
+ p->group_leader = current->group_leader;
+ p->tgid = current->tgid;
+ } else {
+ if (clone_flags & CLONE_PARENT)
+ p->exit_signal = current->group_leader->exit_signal;
+ else
+ p->exit_signal = (clone_flags & CSIGNAL);
+ p->group_leader = p;
+ p->tgid = p->pid;
+ }
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
- /*
- * Ok, make it visible to the rest of the system.
- * We dont wake it up yet.
- */
- p->group_leader = p;
+ p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
+ p->task_works = NULL;
- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
- /* Need tasklist lock for parent etc handling! */
+ /*
+ * Make it visible to the rest of the system, but dont wake it up yet.
+ * Need tasklist lock for parent etc handling!
+ */
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
@@ -1361,36 +1452,44 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_free_pid;
}
- if (clone_flags & CLONE_THREAD) {
- current->signal->nr_threads++;
- atomic_inc(&current->signal->live);
- atomic_inc(&current->signal->sigcnt);
- p->group_leader = current->group_leader;
- list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
- }
-
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
+ init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
- if (is_child_reaper(pid))
- p->nsproxy->pid_ns->child_reaper = p;
+ init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ init_task_pid(p, PIDTYPE_SID, task_session(current));
+
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
- attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
- attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ attach_pid(p, PIDTYPE_PGID);
+ attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
+ } else {
+ current->signal->nr_threads++;
+ atomic_inc(&current->signal->live);
+ atomic_inc(&current->signal->sigcnt);
+ list_add_tail_rcu(&p->thread_group,
+ &p->group_leader->thread_group);
+ list_add_tail_rcu(&p->thread_node,
+ &p->signal->thread_head);
}
- attach_pid(p, PIDTYPE_PID, pid);
+ attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
@@ -1398,6 +1497,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
+ uprobe_copy_process(p, clone_flags);
return p;
@@ -1429,11 +1529,10 @@ bad_fork_cleanup_policy:
perf_event_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_cgroup:
+bad_fork_cleanup_threadgroup_lock:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
@@ -1445,12 +1544,6 @@ fork_out:
return ERR_PTR(retval);
}
-noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
-{
- memset(regs, 0, sizeof(struct pt_regs));
- return regs;
-}
-
static inline void init_idle_pids(struct pid_link *links)
{
enum pid_type type;
@@ -1461,13 +1554,10 @@ static inline void init_idle_pids(struct pid_link *links)
}
}
-struct task_struct * __cpuinit fork_idle(int cpu)
+struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- struct pt_regs regs;
-
- task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
- &init_struct_pid, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1484,7 +1574,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
@@ -1494,27 +1583,12 @@ long do_fork(unsigned long clone_flags,
long nr;
/*
- * Do some preliminary argument and permissions checking before we
- * actually start allocating stuff
- */
- if (clone_flags & CLONE_NEWUSER) {
- if (clone_flags & CLONE_THREAD)
- return -EINVAL;
- /* hopefully this check will go away when userns support is
- * complete
- */
- if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- !capable(CAP_SETGID))
- return -EPERM;
- }
-
- /*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
- if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1526,7 +1600,7 @@ long do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, regs, stack_size,
+ p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
/*
* Do this prior waking up the new thread - the thread pointer
@@ -1534,10 +1608,12 @@ long do_fork(unsigned long clone_flags,
*/
if (!IS_ERR(p)) {
struct completion vfork;
+ struct pid *pid;
trace_sched_process_fork(current, p);
- nr = task_pid_vnr(p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
@@ -1545,34 +1621,84 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
+ get_task_struct(p);
}
- /*
- * We set PF_STARTING at creation in case tracing wants to
- * use this to distinguish a fully live task from one that
- * hasn't finished SIGSTOP raising yet. Now we clear it
- * and set the child going.
- */
- p->flags &= ~PF_STARTING;
-
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
- ptrace_event(trace, nr);
+ ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
- freezer_do_not_count();
- wait_for_completion(&vfork);
- freezer_count();
- ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL);
+}
+
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+ return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+ /* can not support in nommu mode */
+ return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int, tls_val,
+ int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#elif defined(CONFIG_CLONE_BACKWARDS3)
+SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int, stack_size,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#endif
+{
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+}
+#endif
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
@@ -1622,7 +1748,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
{
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
@@ -1689,19 +1816,35 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
- err = check_unshare_flags(unshare_flags);
- if (err)
- goto bad_unshare_out;
-
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto bad_unshare_out;
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -1715,11 +1858,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
- err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+ err = unshare_userns(unshare_flags, &new_cred);
if (err)
goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
- if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1727,10 +1874,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
exit_sem(current);
}
- if (new_nsproxy) {
+ if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
- new_nsproxy = NULL;
- }
task_lock(current);
@@ -1752,11 +1897,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
task_unlock(current);
- }
- if (new_nsproxy)
- put_nsproxy(new_nsproxy);
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
+ }
+bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed..aa6a8aadb91 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
bool pm_freezing;
bool pm_nosig_freezing;
+/*
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
@@ -33,7 +39,7 @@ static DEFINE_SPINLOCK(freezer_lock);
*/
bool freezing_slow_path(struct task_struct *p)
{
- if (p->flags & PF_NOFREEZE)
+ if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
@@ -99,9 +105,9 @@ static void fake_signal_wake_up(struct task_struct *p)
* freeze_task - send a freeze request to given task
* @p: task to send the request to
*
- * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
- * flag and either sending a fake signal to it or waking it up, depending
- * on whether it has %PF_FREEZER_NOSIG set.
+ * If @p is freezing, the freeze request is sent either by sending a fake
+ * signal (if it's not a kernel thread) or waking it up (if it's a kernel
+ * thread).
*
* RETURNS:
* %false, if @p is not freezing or already frozen; %true, otherwise
@@ -110,23 +116,28 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
+ /*
+ * This check can race with freezer_do_not_count, but worst case that
+ * will result in an extra wakeup being sent to the task. It does not
+ * race with freezer_count(), the barriers in freezer_count() and
+ * freezer_should_skip() ensure that either freezer_count() sees
+ * freezing == true in try_to_freeze() and freezes, or
+ * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
+ * normally.
+ */
+ if (freezer_should_skip(p))
+ return false;
+
spin_lock_irqsave(&freezer_lock, flags);
if (!freezing(p) || frozen(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
- if (!(p->flags & PF_KTHREAD)) {
+ if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
- /*
- * fake_signal_wake_up() goes through p's scheduler
- * lock and guarantees that TASK_STOPPED/TRACED ->
- * TASK_RUNNING transition can't race with task state
- * testing in try_to_freeze_tasks().
- */
- } else {
+ else
wake_up_state(p, TASK_INTERRUPTIBLE);
- }
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173..b632b5f3f09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,14 +59,121 @@
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
#include <asm/futex.h>
-#include "rtmutex_common.h"
+#include "locking/rtmutex_common.h"
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `-------> mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
/*
* Futex flags used to encode options to functions and preserve them across
@@ -143,11 +250,59 @@ static const struct futex_q futex_q_init = {
* waiting on a futex.
*/
struct futex_hash_bucket {
+ atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static inline void futex_get_mm(union futex_key *key)
+{
+ atomic_inc(&key->private.mm->mm_count);
+ /*
+ * Ensure futex_get_mm() implies a full barrier such that
+ * get_futex_key() implies a full barrier. This is relied upon
+ * as full barrier (B), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
/*
* We hash on the keys returned from get_futex_key (see below).
@@ -157,7 +312,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
- return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+ return &futex_queues[hash & (futex_hashsize - 1)];
}
/*
@@ -183,10 +338,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode);
+ ihold(key->shared.inode); /* implies MB (B) */
break;
case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
+ futex_get_mm(key); /* implies MB (B) */
break;
}
}
@@ -221,10 +376,11 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -246,6 +402,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
return -EINVAL;
address -= key->both.offset;
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -254,11 +413,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
- return -EFAULT;
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
return 0;
}
@@ -283,7 +440,7 @@ again:
put_page(page);
/* serialize against __split_huge_page_splitting() */
local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
page_head = compound_head(page);
/*
* page_head is valid pointer but we must pin
@@ -362,10 +519,10 @@ again:
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = page_head->index;
+ key->shared.pgoff = basepage_index(page);
}
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
out:
unlock_page(page_head);
@@ -586,27 +743,74 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
+ */
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct plist_head *head;
struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex(&this->key, key)) {
/*
- * Another waiter already exists - bump up
- * the refcount and return its pi_state:
+ * Sanity check the waiter before increasing
+ * the refcount and attaching to it.
*/
pi_state = this->pi_state;
/*
- * Userspace might have messed up non-PI and PI futexes
+ * Userspace might have messed up non-PI and
+ * PI futexes [3]
*/
if (unlikely(!pi_state))
return -EINVAL;
@@ -614,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
WARN_ON(!atomic_read(&pi_state->refcount));
/*
- * When pi_state->owner is NULL then the owner died
- * and another waiter is on the fly. pi_state->owner
- * is fixed up by the task which acquires
- * pi_state->rt_mutex.
- *
- * We do not check for pid == 0 which can happen when
- * the owner died and robust_list_exit() cleared the
- * TID.
+ * Handle the owner died case:
*/
- if (pid && pi_state->owner) {
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and
+ * wakes the topmost waiter. The task which
+ * acquires the pi_state->rt_mutex will fixup
+ * owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user
+ * space TID is not 0. Inconsistent
+ * state. [5]
+ */
+ if (pid)
+ return -EINVAL;
+ /*
+ * Take a ref on the state and
+ * return. [4]
+ */
+ goto out_state;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner
+ * has not yet executed exit_pi_state_list()
+ * or some waiter acquired the rtmutex in the
+ * pi state, but did not yet fixup the TID in
+ * user space.
+ *
+ * Take a ref on the state and return. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
/*
- * Bail out if user space manipulated the
- * futex value.
+ * If the owner died bit is not set,
+ * then the pi_state must have an
+ * owner. [7]
*/
- if (pid != task_pid_vnr(pi_state->owner))
+ if (!pi_state->owner)
return -EINVAL;
}
+ /*
+ * Bail out if user space manipulated the
+ * futex value. If pi state exists then the
+ * owner TID must be the same as the user
+ * space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+
+ out_state:
atomic_inc(&pi_state->refcount);
*ps = pi_state;
-
return 0;
}
}
/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0
+ * the new pi_state to it, but bail out when TID = 0 [1]
*/
if (!pid)
return -ESRCH;
@@ -649,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
@@ -669,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return ret;
}
+ /*
+ * No existing pi state. First waiter. [2]
+ */
pi_state = alloc_pi_state();
/*
@@ -703,9 +951,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -715,7 +963,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, ownerdied = 0;
+ int lock_taken, ret, force_take = 0;
u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
@@ -740,10 +988,18 @@ retry:
return -EDEADLK;
/*
- * Surprise - we got the lock. Just return to userspace:
+ * Surprise - we got the lock, but we do not trust user space at all.
*/
- if (unlikely(!curval))
- return 1;
+ if (unlikely(!curval)) {
+ /*
+ * We verify whether there is kernel state for this
+ * futex. If not, we can safely assume, that the 0 ->
+ * TID transition is correct. If state exists, we do
+ * not bother to fixup the user space state as it was
+ * corrupted already.
+ */
+ return futex_top_waiter(hb, key) ? -EINVAL : 1;
+ }
uval = curval;
@@ -754,17 +1010,15 @@ retry:
newval = curval | FUTEX_WAITERS;
/*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
+ * Should we force take the futex? See below.
*/
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
+ if (unlikely(force_take)) {
+ /*
+ * Keep the OWNER_DIED and the WAITERS bit and set the
+ * new TID value.
+ */
newval = (curval & ~FUTEX_TID_MASK) | vpid;
- ownerdied = 0;
+ force_take = 0;
lock_taken = 1;
}
@@ -774,7 +1028,7 @@ retry:
goto retry;
/*
- * We took the lock due to owner died take over.
+ * We took the lock due to forced take over.
*/
if (unlikely(lock_taken))
return 1;
@@ -789,20 +1043,25 @@ retry:
switch (ret) {
case -ESRCH:
/*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
+ * We failed to find an owner for this
+ * futex. So we have no pi_state to block
+ * on. This can happen in two cases:
+ *
+ * 1) The owner died
+ * 2) A stale FUTEX_WAITERS bit
+ *
+ * Re-read the futex value.
*/
if (get_futex_value_locked(&curval, uaddr))
return -EFAULT;
/*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
+ * If the owner died or we have a stale
+ * WAITERS bit the owner TID in the user space
+ * futex is 0.
*/
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
+ if (!(curval & FUTEX_TID_MASK)) {
+ force_take = 1;
goto retry;
}
default:
@@ -829,6 +1088,7 @@ static void __unqueue_futex(struct futex_q *q)
hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
}
/*
@@ -839,6 +1099,9 @@ static void wake_futex(struct futex_q *q)
{
struct task_struct *p = q->task;
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
* a non-futex wake up happens on another CPU then the task
@@ -867,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ int ret = 0;
if (!pi_state)
return -EINVAL;
@@ -890,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = this->task;
/*
- * We pass it to the next owner. (The WAITERS bit is always
- * kept enabled while there is PI state around. We must also
- * preserve the owner died bit.)
+ * We pass it to the next owner. The WAITERS bit is always
+ * kept enabled while there is PI state around. We cleanup the
+ * owner died bit, because we are the owner.
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- int ret = 0;
-
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -974,7 +1234,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
int ret;
@@ -986,10 +1245,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
goto out;
hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+ goto out_put_key;
+
spin_lock(&hb->lock);
- head = &hb->chain;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
@@ -1007,6 +1270,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
+out_put_key:
put_futex_key(&key);
out:
return ret;
@@ -1022,7 +1286,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret;
@@ -1070,10 +1333,12 @@ retry_private:
goto retry;
}
- head = &hb1->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (match_futex (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
@@ -1081,11 +1346,13 @@ retry_private:
}
if (op_ret > 0) {
- head = &hb2->chain;
-
op_ret = 0;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
if (match_futex (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++op_ret >= nr_wake2)
break;
@@ -1094,6 +1361,7 @@ retry_private:
ret += op_ret;
}
+out_unlock:
double_unlock_hb(hb1, hb2);
out_put_keys:
put_futex_key(&key2);
@@ -1121,7 +1389,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
*/
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -1174,9 +1444,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1187,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1215,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
@@ -1237,8 +1509,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1249,12 +1521,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
int drop_count = 0, task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head1;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
* requeue_pi requires a pi_state, try to allocate it now
* without any locks in case it fails.
*/
@@ -1292,10 +1569,20 @@ retry:
if (unlikely(ret != 0))
goto out_put_key1;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry_private:
+ hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -1305,6 +1592,7 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1);
if (ret)
@@ -1337,16 +1625,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1354,6 +1651,7 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
@@ -1363,6 +1661,7 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
cond_resched();
@@ -1372,8 +1671,7 @@ retry_private:
}
}
- head1 = &hb1->chain;
- plist_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (task_count - nr_wake >= nr_requeue)
break;
@@ -1383,9 +1681,13 @@ retry_private:
/*
* FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter)) {
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
ret = -EINVAL;
break;
}
@@ -1435,6 +1737,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
/*
* drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1462,17 +1765,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
struct futex_hash_bucket *hb;
hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock);
+ spin_lock(&hb->lock); /* implies MB (A) */
return hb;
}
static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
spin_unlock(&hb->lock);
+ hb_waiters_dec(hb);
}
/**
@@ -1515,8 +1830,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1686,9 +2001,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1785,7 +2100,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- schedule();
+ freezable_schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -1803,8 +2118,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -1842,7 +2157,7 @@ retry_private:
ret = get_futex_value_locked(&uval, uaddr);
if (ret) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = get_user(uval, uaddr);
if (ret)
@@ -1856,7 +2171,7 @@ retry_private:
}
if (uval != val) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = -EWOULDBLOCK;
}
@@ -2004,7 +2319,7 @@ retry_private:
* Task is exiting and we just wait for the
* exit to complete.
*/
- queue_unlock(&q, hb);
+ queue_unlock(hb);
put_futex_key(&q.key);
cond_resched();
goto retry;
@@ -2056,7 +2371,7 @@ retry_private:
goto out_put_key;
out_unlock_put_key:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
out_put_key:
put_futex_key(&q.key);
@@ -2066,7 +2381,7 @@ out:
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
@@ -2088,7 +2403,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
u32 uval, vpid = task_pid_vnr(current);
int ret;
@@ -2112,9 +2426,10 @@ retry:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
- * anyone else up:
+ * anyone else up. We only try this if neither the waiters nor
+ * the owner died bit are set.
*/
- if (!(uval & FUTEX_OWNER_DIED) &&
+ if (!(uval & ~FUTEX_TID_MASK) &&
cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
@@ -2128,9 +2443,7 @@ retry:
* Ok, other tasks may need to be woken up - check waiters
* and do the wakeup if necessary:
*/
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -2146,11 +2459,9 @@ retry:
/*
* No waiters - kernel unlocks the futex:
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
- }
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
out_unlock:
spin_unlock(&hb->lock);
@@ -2182,9 +2493,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2207,6 +2518,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* Unqueue the futex_q and determine which it was.
*/
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2226,18 +2538,17 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2255,8 +2566,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2271,6 +2582,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
if (!bitset)
return -EINVAL;
@@ -2289,6 +2603,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* code while we sleep on uaddr.
*/
debug_rt_mutex_init_waiter(&rt_waiter);
+ RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&rt_waiter.tree_entry);
rt_waiter.task = NULL;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2307,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (ret)
goto out_key2;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (match_futex(&q.key, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -2342,7 +2667,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
* the pi_state.
*/
- WARN_ON(!&q.pi_state);
+ WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2369,7 +2694,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
- if (rt_mutex_owner(pi_mutex) == current)
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
@@ -2443,40 +2768,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
{
struct robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- /* If victim is in different user_ns, then uids are not
- comparable, so we must have CAP_SYS_PTRACE */
- if (cred->user->user_ns != pcred->user->user_ns) {
- if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
- goto ok;
- }
- /* If victim is in same user_ns, then uids are comparable */
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
-ok:
- head = p->robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
@@ -2628,7 +2942,7 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2955,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
}
switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_UNLOCK_PI:
+ case FUTEX_TRYLOCK_PI:
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, flags, val, timeout, val3);
- break;
+ return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, flags, val, val3);
- break;
+ return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- break;
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
- break;
+ return futex_lock_pi(uaddr, flags, val, timeout, 0);
case FUTEX_UNLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, flags);
- break;
+ return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
- break;
+ return futex_lock_pi(uaddr, flags, 0, timeout, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- break;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- break;
- default:
- ret = -ENOSYS;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
}
- return ret;
+ return -ENOSYS;
}
@@ -2720,10 +3029,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
- int i;
/*
* This will fail and we want it. Some arch implementations do
@@ -2737,8 +3046,31 @@ static int __init futex_init(void)
*/
if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ futex_detect_cmpxchg();
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 5f9e689dc8f..55c8c9349cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,6 +10,8 @@
#include <linux/compat.h>
#include <linux/nsproxy.h>
#include <linux/futex.h>
+#include <linux/ptrace.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
@@ -115,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
}
}
-asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
- compat_size_t len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
@@ -130,46 +132,35 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
return 0;
}
-asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
- compat_size_t __user *len_ptr)
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->compat_robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- /* If victim is in different user_ns, then uids are not
- comparable, so we must have CAP_SYS_PTRACE */
- if (cred->user->user_ns != pcred->user->user_ns) {
- if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
- goto ok;
- }
- /* If victim is in same user_ns, then uids are comparable */
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
-ok:
- head = p->compat_robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
@@ -180,9 +171,9 @@ err_unlock:
return ret;
}
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
- struct compat_timespec __user *utime, u32 __user *uaddr2,
- u32 val3)
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
@@ -192,7 +183,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (get_compat_timespec(&ts, utime))
+ if (compat_get_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
return -EINVAL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc..d04ce8ac439 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
@@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
+choice
+ prompt "Specify GCOV format"
+ depends on GCOV_KERNEL
+ default GCOV_FORMAT_AUTODETECT
+ ---help---
+ The gcov format is usually determined by the GCC version, but there are
+ exceptions where format changes are integrated in lower-version GCCs.
+ In such a case use this option to adjust the format used in the kernel
+ accordingly.
+
+ If unsure, choose "Autodetect".
+
+config GCOV_FORMAT_AUTODETECT
+ bool "Autodetect"
+ ---help---
+ Select this option to use the format that corresponds to your GCC
+ version.
+
+config GCOV_FORMAT_3_4
+ bool "GCC 3.4 format"
+ ---help---
+ Select this option to use the format defined by GCC 3.4.
+
+config GCOV_FORMAT_4_7
+ bool "GCC 4.7 format"
+ ---help---
+ Select this option to use the format defined by GCC 4.7.
+
+endchoice
+
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index e97ca59e252..52aa7e8de92 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,33 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
+# if-lt
+# Usage VAR := $(call if-lt, $(a), $(b))
+# Returns 1 if (a < b)
+if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
+
+ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
+ cc-ver := 0304
+else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
+ cc-ver := 0407
+else
+# Use cc-version if available, otherwise set 0
+#
+# scripts/Kbuild.include, which contains cc-version function, is not included
+# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
+# Meaning cc-ver is empty causing if-lt test to fail with
+# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
+# This has no affect on the clean phase, but the error message could be
+# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
+# is not available. We can probably move if-lt to Kbuild.include, so it's also
+# not defined during clean or to include Kbuild.include in
+# scripts/Makefile.clean. But the following workaround seems least invasive.
+ cc-ver := $(if $(call cc-version),$(call cc-version),0)
+endif
+
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
+
+ifeq ($(call if-lt, $(cc-ver), 0407),1)
+ obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
+else
+ obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
+endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9b22d03cc58..b358a802fd1 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,7 +20,6 @@
#include <linux/mutex.h>
#include "gcov.h"
-static struct gcov_info *gcov_info_head;
static int gcov_events_enabled;
static DEFINE_MUTEX(gcov_lock);
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)
mutex_lock(&gcov_lock);
if (gcov_version == 0) {
- gcov_version = info->version;
+ gcov_version = gcov_info_version(info);
/*
* Printing gcc's version magic may prove useful for debugging
* incompatibility reports.
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)
* Add new profiling data structure to list and inform event
* listener.
*/
- info->next = gcov_info_head;
- gcov_info_head = info;
+ gcov_info_link(info);
if (gcov_events_enabled)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
@@ -81,6 +79,18 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_delta);
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
@@ -91,13 +101,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);
*/
void gcov_enable_events(void)
{
- struct gcov_info *info;
+ struct gcov_info *info = NULL;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
+
/* Perform event callback for previously registered entries. */
- for (info = gcov_info_head; info; info = info->next)
+ while ((info = gcov_info_next(info)))
gcov_event(GCOV_ADD, info);
+
mutex_unlock(&gcov_lock);
}
@@ -112,25 +124,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
- struct gcov_info *info;
- struct gcov_info *prev;
+ struct gcov_info *info = NULL;
+ struct gcov_info *prev = NULL;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
- prev = NULL;
+
/* Remove entries located in module from linked list. */
- for (info = gcov_info_head; info; info = info->next) {
+ while ((info = gcov_info_next(info))) {
if (within(info, mod->module_core, mod->core_size)) {
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
+ gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
+
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 9bd0934f6c3..15ff01a7637 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -74,8 +74,8 @@ static int __init gcov_persist_setup(char *str)
{
unsigned long val;
- if (strict_strtoul(str, 0, &val)) {
- pr_warning("invalid gcov_persist parameter '%s'\n", str);
+ if (kstrtoul(str, 0, &val)) {
+ pr_warn("invalid gcov_persist parameter '%s'\n", str);
return 0;
}
gcov_persist = val;
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)
list_for_each_entry(node, &all_head, all) {
info = get_node_info(node);
- if (info && (strcmp(info->filename, name) == 0))
+ if (info && (strcmp(gcov_info_filename(info), name) == 0))
return node;
}
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
seq = file->private_data;
info = gcov_iter_get_info(seq->private);
mutex_lock(&node_lock);
- node = get_node_by_name(info->filename);
+ node = get_node_by_name(gcov_info_filename(info));
if (node) {
/* Reset counts or remove node for unloaded modules. */
if (node->num_loaded == 0)
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename)
*/
static void add_links(struct gcov_node *node, struct dentry *parent)
{
- char *basename;
+ const char *basename;
char *target;
int num;
int i;
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
if (!node->links)
return;
for (i = 0; i < num; i++) {
- target = get_link_target(get_node_info(node)->filename,
- &gcov_link[i]);
+ target = get_link_target(
+ gcov_info_filename(get_node_info(node)),
+ &gcov_link[i]);
if (!target)
goto out_err;
- basename = strrchr(target, '/');
- if (!basename)
+ basename = kbasename(target);
+ if (basename == target)
goto out_err;
- basename++;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
if (!node->links[i])
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
if (!node->dentry) {
- pr_warning("could not create file\n");
+ pr_warn("could not create file\n");
kfree(node);
return NULL;
}
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
err_nomem:
kfree(node);
- pr_warning("out of memory\n");
+ pr_warn("out of memory\n");
return NULL;
}
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)
struct gcov_node *parent;
struct gcov_node *node;
- filename = kstrdup(info->filename, GFP_KERNEL);
+ filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
if (!filename)
return;
parent = &root_node;
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
*/
loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
if (!loaded_info) {
- pr_warning("could not add '%s' (out of memory)\n",
- info->filename);
+ pr_warn("could not add '%s' (out of memory)\n",
+ gcov_info_filename(info));
return;
}
memcpy(loaded_info, node->loaded_info,
@@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
* data set replaces the copy of the last one.
*/
if (!gcov_info_is_compatible(node->unloaded_info, info)) {
- pr_warning("discarding saved data for %s "
- "(incompatible version)\n", info->filename);
+ pr_warn("discarding saved data for %s "
+ "(incompatible version)\n",
+ gcov_info_filename(info));
gcov_info_free(node->unloaded_info);
node->unloaded_info = NULL;
}
@@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
* The initial one takes precedence.
*/
if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
- pr_warning("could not add '%s' (incompatible "
- "version)\n", info->filename);
+ pr_warn("could not add '%s' (incompatible "
+ "version)\n", gcov_info_filename(info));
kfree(loaded_info);
return;
}
@@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)
else {
node->unloaded_info = gcov_info_dup(info);
if (!node->unloaded_info) {
- pr_warning("could not save data for '%s' "
- "(out of memory)\n", info->filename);
+ pr_warn("could not save data for '%s' "
+ "(out of memory)\n",
+ gcov_info_filename(info));
}
}
}
@@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)
i = get_info_index(node, info);
if (i < 0) {
- pr_warning("could not remove '%s' (not found)\n",
- info->filename);
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
return;
}
if (gcov_persist)
@@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
struct gcov_node *node;
mutex_lock(&node_lock);
- node = get_node_by_name(info->filename);
+ node = get_node_by_name(gcov_info_filename(info));
switch (action) {
case GCOV_ADD:
if (node)
@@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
if (node)
remove_info(node, info);
else {
- pr_warning("could not remove '%s' (not found)\n",
- info->filename);
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
}
break;
}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb426003..27bc88a3501 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,6 +21,121 @@
#include <linux/vmalloc.h>
#include "gcov.h"
+#define GCOV_COUNTERS 5
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 00000000000..826ba9fb5e3
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,565 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 4.7.
+ *
+ * This file is based heavily on gcc_3_4.c file.
+ *
+ * For a better understanding, refer to gcc source:
+ * gcc/gcov-io.h
+ * libgcc/libgcov.c
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#define GCOV_COUNTERS 9
+#else
+#define GCOV_COUNTERS 8
+#endif
+
+#define GCOV_TAG_FUNCTION_LENGTH 3
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_ctr_info - information about counters for a single function
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+};
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @key: comdat key
+ * @ident: unique ident of function
+ * @lineno_checksum: function lineo_checksum
+ * @cfg_checksum: function cfg checksum
+ * @ctrs: instrumented counters
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ *
+ * Information about a single function. This uses the trailing array
+ * idiom. The number of counters is determined from the merge pointer
+ * array in gcov_info. The key is used to detect which of a set of
+ * comdat functions was selected -- it points to the gcov_info object
+ * of the object file containing the selected comdat function.
+ */
+struct gcov_fn_info {
+ const struct gcov_info *key;
+ unsigned int ident;
+ unsigned int lineno_checksum;
+ unsigned int cfg_checksum;
+ struct gcov_ctr_info ctrs[0];
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: uniquifying time stamp
+ * @filename: name of the associated gcov data file
+ * @merge: merge functions (null for unused counter type)
+ * @n_functions: number of instrumented functions
+ * @functions: pointer to pointers to function information
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
+ unsigned int n_functions;
+ struct gcov_fn_info **functions;
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Doesn't change at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return info->merge[type] ? 1 : 0;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ memset(ci_ptr->values, 0,
+ sizeof(gcov_type) * ci_ptr->num);
+ ci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_ctr_info *dci_ptr;
+ struct gcov_ctr_info *sci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int val_idx;
+
+ for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
+ dci_ptr = dst->functions[fi_idx]->ctrs;
+ sci_ptr = src->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(src, ct_idx))
+ continue;
+
+ for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
+ dci_ptr->values[val_idx] +=
+ sci_ptr->values[val_idx];
+
+ dci_ptr++;
+ sci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_ctr_info *dci_ptr; /* dst counter info */
+ struct gcov_ctr_info *sci_ptr; /* src counter info */
+ unsigned int active;
+ unsigned int fi_idx; /* function info idx */
+ unsigned int ct_idx; /* counter type idx */
+ size_t fi_size; /* function info size */
+ size_t cv_size; /* counter values size */
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+
+ dup->next = NULL;
+ dup->filename = NULL;
+ dup->functions = NULL;
+
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+
+ dup->functions = kcalloc(info->n_functions,
+ sizeof(struct gcov_fn_info *), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+
+ active = num_counter_active(info);
+ fi_size = sizeof(struct gcov_fn_info);
+ fi_size += sizeof(struct gcov_ctr_info) * active;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
+ if (!dup->functions[fi_idx])
+ goto err_free;
+
+ *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
+
+ sci_ptr = info->functions[fi_idx]->ctrs;
+ dci_ptr = dup->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++) {
+
+ cv_size = sizeof(gcov_type) * sci_ptr->num;
+
+ dci_ptr->values = vmalloc(cv_size);
+
+ if (!dci_ptr->values)
+ goto err_free;
+
+ dci_ptr->num = sci_ptr->num;
+ memcpy(dci_ptr->values, sci_ptr->values, cv_size);
+
+ sci_ptr++;
+ dci_ptr++;
+ }
+ }
+
+ return dup;
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ struct gcov_ctr_info *ci_ptr;
+
+ if (!info->functions)
+ goto free_info;
+
+ active = num_counter_active(info);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ if (!info->functions[fi_idx])
+ continue;
+
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
+ vfree(ci_ptr->values);
+
+ kfree(info->functions[fi_idx]);
+ }
+
+free_info:
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int cv_idx;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->stamp);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ fi_ptr = info->functions[fi_idx];
+
+ /* Function record. */
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ ci_ptr = fi_ptr->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ /* Counter record. */
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FOR_COUNTER(ct_idx));
+ pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+
+ for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
+ pos += store_gcov_u64(buffer, pos,
+ ci_ptr->values[cv_idx]);
+ }
+
+ ci_ptr++;
+ }
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a..92c8e22a29e 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,7 +21,6 @@
* gcc and need to be kept as close to the original definition as possible to
* remain compatible.
*/
-#define GCOV_COUNTERS 5
#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
@@ -34,60 +33,18 @@ typedef long gcov_type;
typedef long long gcov_type;
#endif
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @ident: object file-unique function identifier
- * @checksum: function checksum
- * @n_ctrs: number of values per counter type belonging to this function
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- */
-struct gcov_fn_info {
- unsigned int ident;
- unsigned int checksum;
- unsigned int n_ctrs[0];
-};
-
-/**
- * struct gcov_ctr_info - profiling data per counter type
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- * @merge: merge function for counter values of this type (unused)
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
- void (*merge)(gcov_type *, unsigned int);
-};
+/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so
+ * we cannot use full definition here and they need to be placed in gcc specific
+ * implementation of gcov. This also means no direct access to the members in
+ * generic code and usage of the interface below.*/
+struct gcov_info;
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: time stamp
- * @filename: name of the associated gcov data file
- * @n_functions: number of instrumented functions
- * @functions: function data
- * @ctr_mask: mask specifying which counter types are active
- * @counts: counter data per counter type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- unsigned int n_functions;
- const struct gcov_fn_info *functions;
- unsigned int ctr_mask;
- struct gcov_ctr_info counts[0];
-};
+/* Interface to access gcov_info data */
+const char *gcov_info_filename(struct gcov_info *info);
+unsigned int gcov_info_version(struct gcov_info *info);
+struct gcov_info *gcov_info_next(struct gcov_info *info);
+void gcov_info_link(struct gcov_info *info);
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
/* Base interface. */
enum gcov_action {
diff --git a/kernel/groups.c b/kernel/groups.c
index 99b53d1eb7e..451698f86cf 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize)
group_info->blocks[0] = group_info->small_block;
else {
for (i = 0; i < nblocks; i++) {
- gid_t *b;
+ kgid_t *b;
b = (void *)__get_free_page(GFP_USER);
if (!b)
goto out_undo_partial_alloc;
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free);
static int groups_to_user(gid_t __user *grouplist,
const struct group_info *group_info)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
unsigned int count = group_info->ngroups;
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_to_user(grouplist, group_info->blocks[i], len))
+ for (i = 0; i < count; i++) {
+ gid_t gid;
+ gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+ if (put_user(gid, grouplist+i))
return -EFAULT;
-
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
}
return 0;
}
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist,
static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
unsigned int count = group_info->ngroups;
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_from_user(group_info->blocks[i], grouplist, len))
+ for (i = 0; i < count; i++) {
+ gid_t gid;
+ kgid_t kgid;
+ if (get_user(gid, grouplist+i))
return -EFAULT;
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
+ kgid = make_kgid(user_ns, gid);
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ GROUP_AT(group_info, i) = kgid;
}
return 0;
}
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info)
for (base = 0; base < max; base++) {
int left = base;
int right = left + stride;
- gid_t tmp = GROUP_AT(group_info, right);
+ kgid_t tmp = GROUP_AT(group_info, right);
- while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+ while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
GROUP_AT(group_info, right) =
GROUP_AT(group_info, left);
right = left;
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info)
}
/* a simple bsearch */
-int groups_search(const struct group_info *group_info, gid_t grp)
+int groups_search(const struct group_info *group_info, kgid_t grp)
{
unsigned int left, right;
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
- if (grp > GROUP_AT(group_info, mid))
+ if (gid_gt(grp, GROUP_AT(group_info, mid)))
left = mid + 1;
- else if (grp < GROUP_AT(group_info, mid))
+ else if (gid_lt(grp, GROUP_AT(group_info, mid)))
right = mid;
else
return 1;
@@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, gid_t grp)
* set_groups - Change a group subscription in a set of credentials
* @new: The newly prepared set of credentials to alter
* @group_info: The group list to install
- *
- * Validate a group subscription and, if valid, insert it into a set
- * of credentials.
*/
-int set_groups(struct cred *new, struct group_info *group_info)
+void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
- return 0;
}
EXPORT_SYMBOL(set_groups);
@@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
- int ret;
new = prepare_creds();
if (!new)
return -ENOMEM;
- ret = set_groups(new, group_info);
- if (ret < 0) {
- abort_creds(new);
- return ret;
- }
-
+ set_groups(new, group_info);
return commit_creds(new);
}
@@ -233,7 +223,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!nsown_capable(CAP_SETGID))
+ if (!ns_capable(current_user_ns(), CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
@@ -256,24 +246,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
/*
* Check whether we're fsgid/egid or in the supplemental group..
*/
-int in_group_p(gid_t grp)
+int in_group_p(kgid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
- if (grp != cred->fsgid)
+ if (!gid_eq(grp, cred->fsgid))
retval = groups_search(cred->group_info, grp);
return retval;
}
EXPORT_SYMBOL(in_group_p);
-int in_egroup_p(gid_t grp)
+int in_egroup_p(kgid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
- if (grp != cred->egid)
+ if (!gid_eq(grp, cred->egid))
retval = groups_search(cred->group_info, grp);
return retval;
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682..3ab28993f6e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,7 +44,11 @@
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
#include <linux/timer.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
@@ -61,6 +65,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -81,6 +86,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.get_time = &ktime_get_boottime,
.resolution = KTIME_LOW_RES,
},
+ {
+ .index = HRTIMER_BASE_TAI,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ .resolution = KTIME_LOW_RES,
+ },
}
};
@@ -88,6 +99,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+ [CLOCK_TAI] = HRTIMER_BASE_TAI,
};
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -104,8 +116,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
ktime_t xtim, mono, boot;
struct timespec xts, tom, slp;
+ s32 tai_offset;
get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
+ tai_offset = timekeeping_get_tai_offset();
xtim = timespec_to_ktime(xts);
mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -113,6 +127,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
+ base->clock_base[HRTIMER_BASE_TAI].softirq_time =
+ ktime_add(xtim, ktime_set(tai_offset, 0));
}
/*
@@ -152,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
}
-
-/*
- * Get the preferred target CPU for NOHZ
- */
-static int hrtimer_get_target(int this_cpu, int pinned)
-{
-#ifdef CONFIG_NO_HZ
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
- return get_nohz_timer_target();
-#endif
- return this_cpu;
-}
-
/*
* With HIGHRES=y we do not migrate the timer when it is expiring
* before the next event on the target cpu because we cannot reprogram
@@ -198,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
int this_cpu = smp_processor_id();
- int cpu = hrtimer_get_target(this_cpu, pinned);
+ int cpu = get_nohz_timer_target(pinned);
int basenum = base->index;
again:
@@ -231,6 +234,11 @@ again:
goto again;
}
timer->base = new_base;
+ } else {
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ goto again;
+ }
}
return new_base;
}
@@ -273,6 +281,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
} else {
unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+ /* Make sure nsec fits into long */
+ if (unlikely(nsec > KTIME_SEC_MAX))
+ return (ktime_t){ .tv64 = KTIME_MAX };
+
tmp = ktime_set((long)nsec, rem);
}
@@ -562,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
cpu_base->expires_next.tv64 = expires_next.tv64;
+ /*
+ * If a hang was detected in the last timer interrupt then we
+ * leave the hang delay active in the hardware. We want the
+ * system to make progress. That also prevents the following
+ * scenario:
+ * T1 expires 50ms from now
+ * T2 expires 5s from now
+ *
+ * T1 is removed, so this code is called and would reprogram
+ * the hardware to 5s from now. Any hrtimer_start after that
+ * will not reprogram the hardware due to hang_detected being
+ * set. So we'd effectivly block all timers until the T2 event
+ * fires.
+ */
+ if (cpu_base->hang_detected)
+ return;
+
if (cpu_base->expires_next.tv64 != KTIME_MAX)
tick_program_event(cpu_base->expires_next, 1);
}
@@ -640,21 +669,18 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
- if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- if (wakeup) {
- raw_spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- raw_spin_lock(&base->cpu_base->lock);
- } else
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
+}
- return 1;
- }
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return 0;
+ return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
}
/*
@@ -665,22 +691,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
- struct timespec realtime_offset, xtim, wtm, sleep;
if (!hrtimer_hres_active())
return;
- /* Optimized out for !HIGH_RES */
- get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- /* Adjust CLOCK_REALTIME offset */
raw_spin_lock(&base->lock);
- base->clock_base[HRTIMER_BASE_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
- base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
- timespec_to_ktime(sleep);
-
+ hrtimer_update_base(base);
hrtimer_force_reprogram(base, 0);
raw_spin_unlock(&base->lock);
}
@@ -710,13 +726,28 @@ static int hrtimer_switch_to_hres(void)
base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
-
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
return 1;
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping and resume code to reprogramm the hrtimer
+ * interrupt device on all cpus.
+ */
+void clock_was_set_delayed(void)
+{
+ schedule_work(&hrtimer_work);
+}
+
#else
static inline int hrtimer_hres_active(void) { return 0; }
@@ -725,8 +756,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -757,15 +787,19 @@ void clock_was_set(void)
/*
* During resume we might have to reprogram the high resolution timer
- * interrupt (on the local CPU):
+ * interrupt on all online CPUs. However, all other CPUs will be
+ * stopped with IRQs interrupts disabled so the clock_was_set() call
+ * must be deferred.
*/
void hrtimers_resume(void)
{
WARN_ONCE(!irqs_disabled(),
KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+ /* Retrigger on the local CPU */
retrigger_next_event(NULL);
- timerfd_clock_was_set();
+ /* And schedule a retrigger for all others */
+ clock_was_set_delayed();
}
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -956,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
ret = remove_hrtimer(timer, base);
- /* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
if (mode & HRTIMER_MODE_REL) {
- tim = ktime_add_safe(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -975,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
@@ -985,20 +1019,35 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
*
* XXX send_remote_softirq() ?
*/
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
+ && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (wakeup) {
+ /*
+ * We need to drop cpu_base->lock to avoid a
+ * lock ordering issue vs. rq->lock.
+ */
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ local_irq_restore(flags);
+ return ret;
+ } else {
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+ }
unlock_hrtimer_base(timer, &flags);
return ret;
}
+EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
*
* Returns:
* 0 on success
@@ -1015,7 +1064,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
* hrtimer_start - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
*
* Returns:
* 0 on success
@@ -1094,7 +1144,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/**
* hrtimer_get_next_event - get the time until next expiry event
*
@@ -1250,11 +1300,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
cpu_base->nr_events++;
dev->next_event.tv64 = KTIME_MAX;
- entry_time = now = ktime_get();
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
retry:
expires_next.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1298,6 +1347,8 @@ retry:
expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
+ if (expires.tv64 < 0)
+ expires.tv64 = KTIME_MAX;
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
break;
@@ -1330,8 +1381,12 @@ retry:
* We need to prevent that we loop forever in the hrtimer
* interrupt routine. We give it 3 attempts to avoid
* overreacting on some spurious event.
+ *
+ * Acquire base lock for updating the offsets and retrieving
+ * the current time.
*/
- now = ktime_get();
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
goto retry;
@@ -1343,6 +1398,7 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
+ raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
if (delta.tv64 > cpu_base->max_hang_time.tv64)
cpu_base->max_hang_time = delta;
@@ -1501,7 +1557,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
t->task = NULL;
if (likely(t->task))
- schedule();
+ freezable_schedule();
hrtimer_cancel(&t->timer);
mode = HRTIMER_MODE_ABS;
@@ -1565,7 +1621,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
unsigned long slack;
slack = current->timer_slack_ns;
- if (rt_task(current))
+ if (dl_task(current) || rt_task(current))
slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);
@@ -1614,13 +1670,11 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
/*
* Functions related to boot-time initialization:
*/
-static void __cpuinit init_hrtimers_cpu(int cpu)
+static void init_hrtimers_cpu(int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
@@ -1697,7 +1751,7 @@ static void migrate_hrtimers(int scpu)
#endif /* CONFIG_HOTPLUG_CPU */
-static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
+static int hrtimer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int scpu = (long)hcpu;
@@ -1730,7 +1784,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata hrtimers_nb = {
+static struct notifier_block hrtimers_nb = {
.notifier_call = hrtimer_cpu_notify,
};
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 2e48ec0c2e9..06db12434d7 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,11 +15,13 @@
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/sysctl.h>
+#include <linux/utsname.h>
+#include <trace/events/sched.h>
/*
* The number of tasks checked:
*/
-unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
@@ -35,7 +37,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
@@ -50,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
static int __init hung_task_panic_setup(char *str)
{
- sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+ int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
+ if (rc)
+ return rc;
return 1;
}
__setup("hung_task_panic=", hung_task_panic_setup);
@@ -91,25 +95,36 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
t->last_switch_count = switch_count;
return;
}
+
+ trace_sched_process_hang(t);
+
if (!sysctl_hung_task_warnings)
return;
- sysctl_hung_task_warnings--;
+
+ if (sysctl_hung_task_warnings > 0)
+ sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid, timeout);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
+ pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
+ t->comm, t->pid, timeout);
+ pr_err(" %s %s %.*s\n",
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
sched_show_task(t);
debug_show_held_locks(t);
touch_nmi_watchdog();
- if (sysctl_hung_task_panic)
+ if (sysctl_hung_task_panic) {
+ trigger_all_cpu_backtrace();
panic("hung_task: blocked tasks");
+ }
}
/*
@@ -119,15 +134,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
* to exit the grace period. For classic RCU, a reschedule is required.
*/
-static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
+ bool can_cont;
+
get_task_struct(g);
get_task_struct(t);
rcu_read_unlock();
cond_resched();
rcu_read_lock();
+ can_cont = pid_alive(g) && pid_alive(t);
put_task_struct(t);
put_task_struct(g);
+
+ return can_cont;
}
/*
@@ -154,9 +174,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
if (!--batch_count) {
batch_count = HUNG_TASK_BATCHING;
- rcu_lock_break(g, t);
- /* Exit if t or g was unhashed during refresh. */
- if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+ if (!rcu_lock_break(g, t))
goto unlock;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
@@ -193,6 +211,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+static atomic_t reset_hung_task = ATOMIC_INIT(0);
+
+void reset_hung_task_detector(void)
+{
+ atomic_set(&reset_hung_task, 1);
+}
+EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+
/*
* kthread which checks for tasks stuck in D state
*/
@@ -206,6 +232,9 @@ static int watchdog(void *dummy)
while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
timeout = sysctl_hung_task_timeout_secs;
+ if (atomic_xchg(&reset_hung_task, 0))
+ continue;
+
check_hung_uninterruptible_tasks(timeout);
}
@@ -219,5 +248,4 @@ static int __init hung_task_init(void)
return 0;
}
-
-module_init(hung_task_init);
+subsys_initcall(hung_task_init);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de64..d269cecdfbf 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,19 +1,12 @@
-# Select this to activate the generic irq options below
-config HAVE_GENERIC_HARDIRQS
- bool
-
-if HAVE_GENERIC_HARDIRQS
menu "IRQ subsystem"
-#
-# Interrupt subsystem related configuration options
-#
-config GENERIC_HARDIRQS
- def_bool y
-
# Options selectable by the architecture code
# Make sparse irq Kconfig switch below available
-config HAVE_SPARSE_IRQ
+config MAY_HAVE_SPARSE_IRQ
+ bool
+
+# Legacy support, required for itanic
+config GENERIC_IRQ_LEGACY
bool
# Enable the generic irq autoprobe mechanism
@@ -28,6 +21,11 @@ config GENERIC_IRQ_SHOW
config GENERIC_IRQ_SHOW_LEVEL
bool
+# Facility to allocate a hardware interrupt. This is legacy support
+# and should not be used in new code. Use irq domains instead.
+config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ bool
+
# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
bool
@@ -51,18 +49,28 @@ config IRQ_EDGE_EOI_HANDLER
# Generic configurable interrupt chip implementation
config GENERIC_IRQ_CHIP
bool
+ select IRQ_DOMAIN
# Generic irq_domain hw <--> linux irq number translation
config IRQ_DOMAIN
bool
+config IRQ_DOMAIN_DEBUG
+ bool "Expose hardware/virtual IRQ mapping via debugfs"
+ depends on IRQ_DOMAIN && DEBUG_FS
+ help
+ This option will show the mapping relationship between hardware irq
+ numbers and Linux irq numbers. The mapping is exposed via debugfs
+ in the file "irq_domain_mapping".
+
+ If you don't know what this means you don't need it.
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
config SPARSE_IRQ
- bool "Support sparse irq numbering"
- depends on HAVE_SPARSE_IRQ
+ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
---help---
Sparse irq numbering is useful for distro kernels that want
@@ -75,4 +83,3 @@ config SPARSE_IRQ
If you don't know what to do here, say N.
endmenu
-endif
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 342d8f44e40..0119b9d467a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- irq_startup(desc);
+ irq_startup(desc, false);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
- if (irq_startup(desc))
+ if (irq_startup(desc, false))
desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f7c543a801d..a2b28a2fd7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,6 +16,8 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <trace/events/irq.h>
+
#include "internals.h"
/**
@@ -38,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
- * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
- * already marked, and this call is harmless.
+ * allocated_irqs.
*/
- irq_reserve_irq(irq);
+ irq_mark_irq(irq);
return 0;
}
EXPORT_SYMBOL(irq_set_chip);
@@ -61,8 +62,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
- if (type != IRQ_TYPE_NONE)
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, irq, type);
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -89,27 +89,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+ struct msi_desc *entry)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
desc->irq_data.msi_desc = entry;
- if (entry)
- entry->irq = irq;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
irq_put_desc_unlock(desc, flags);
return 0;
}
/**
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq
+ */
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+{
+ return irq_set_msi_desc_off(irq, 0, entry);
+}
+
+/**
* irq_set_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
@@ -157,19 +171,22 @@ static void irq_state_set_masked(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
}
-int irq_startup(struct irq_desc *desc)
+int irq_startup(struct irq_desc *desc, bool resend)
{
+ int ret = 0;
+
irq_state_clr_disabled(desc);
desc->depth = 0;
if (desc->irq_data.chip->irq_startup) {
- int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+ ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
irq_state_clr_masked(desc);
- return ret;
+ } else {
+ irq_enable(desc);
}
-
- irq_enable(desc);
- return 0;
+ if (resend)
+ check_irq_resend(desc, desc->irq_data.irq);
+ return ret;
}
void irq_shutdown(struct irq_desc *desc)
@@ -195,6 +212,19 @@ void irq_enable(struct irq_desc *desc)
irq_state_clr_masked(desc);
}
+/**
+ * irq_disable - Mark interrupt disabled
+ * @desc: irq descriptor which should be disabled
+ *
+ * If the chip does not implement the irq_disable callback, we
+ * use a lazy disable approach. That means we mark the interrupt
+ * disabled, but leave the hardware unmasked. That's an
+ * optimization because we avoid the hardware access for the
+ * common case where no interrupt happens after we marked it
+ * disabled. If an interrupt happens, then the interrupt flow
+ * handler masks the line at the hardware level and marks it
+ * pending.
+ */
void irq_disable(struct irq_desc *desc)
{
irq_state_set_disabled(desc);
@@ -250,6 +280,19 @@ void unmask_irq(struct irq_desc *desc)
}
}
+void unmask_threaded_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ if (chip->flags & IRQCHIP_EOI_THREADED)
+ chip->irq_eoi(&desc->irq_data);
+
+ if (chip->irq_unmask) {
+ chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ }
+}
+
/*
* handle_nested_irq - Handle a nested irq from a irq thread
* @irq: the interrupt number
@@ -268,11 +311,14 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
@@ -320,8 +366,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
handle_irq_event(desc);
@@ -330,6 +378,24 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
+/*
+ * Called unconditionally from handle_level_irq() and only for oneshot
+ * interrupts from handle_fasteoi_irq()
+ */
+static void cond_unmask_irq(struct irq_desc *desc)
+{
+ /*
+ * We need to unmask in the following cases:
+ * - Standard level irq (IRQF_ONESHOT is not set)
+ * - Oneshot irq which did not wake the thread (caused by a
+ * spurious interrupt or a primary handler handling it
+ * completely).
+ */
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
+ unmask_irq(desc);
+}
+
/**
* handle_level_irq - Level type irq handler
* @irq: the interrupt number
@@ -357,13 +423,15 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
* If its disabled or no action available
* keep it masked and get out of here
*/
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
handle_irq_event(desc);
- if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
- unmask_irq(desc);
+ cond_unmask_irq(desc);
+
out_unlock:
raw_spin_unlock(&desc->lock);
}
@@ -379,6 +447,27 @@ static inline void preflow_handler(struct irq_desc *desc)
static inline void preflow_handler(struct irq_desc *desc) { }
#endif
+static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
+{
+ if (!(desc->istate & IRQS_ONESHOT)) {
+ chip->irq_eoi(&desc->irq_data);
+ return;
+ }
+ /*
+ * We need to unmask in the following cases:
+ * - Oneshot irq which did not wake the thread (caused by a
+ * spurious interrupt or a primary handler handling it
+ * completely).
+ */
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) {
+ chip->irq_eoi(&desc->irq_data);
+ unmask_irq(desc);
+ } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) {
+ chip->irq_eoi(&desc->irq_data);
+ }
+}
+
/**
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
@@ -392,6 +481,8 @@ static inline void preflow_handler(struct irq_desc *desc) { }
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
+ struct irq_chip *chip = desc->irq_data.chip;
+
raw_spin_lock(&desc->lock);
if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
@@ -417,15 +508,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
preflow_handler(desc);
handle_irq_event(desc);
-out_eoi:
- desc->irq_data.chip->irq_eoi(&desc->irq_data);
-out_unlock:
+ cond_unmask_eoi_irq(desc, chip);
+
raw_spin_unlock(&desc->lock);
return;
out:
- if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
- goto out_eoi;
- goto out_unlock;
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
}
/**
@@ -493,6 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
out_unlock:
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL(handle_edge_irq);
#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
/**
@@ -625,7 +716,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
- irq_startup(desc);
+ irq_startup(desc, true);
}
out:
irq_put_desc_busunlock(desc, flags);
@@ -639,6 +730,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_set_chip(irq, chip);
__irq_set_handler(irq, handle, 0, name);
}
+EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 97a8bfadc88..e75e29e4434 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,10 +4,10 @@
#include <linux/kallsyms.h>
-#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
-#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
+#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
+#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
/* FIXME */
-#define PD(f) do { } while (0)
+#define ___PD(f) do { } while (0)
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
print_symbol("%s\n", (unsigned long)desc->action->handler);
}
- P(IRQ_LEVEL);
- P(IRQ_PER_CPU);
- P(IRQ_NOPROBE);
- P(IRQ_NOREQUEST);
- P(IRQ_NOTHREAD);
- P(IRQ_NOAUTOEN);
+ ___P(IRQ_LEVEL);
+ ___P(IRQ_PER_CPU);
+ ___P(IRQ_NOPROBE);
+ ___P(IRQ_NOREQUEST);
+ ___P(IRQ_NOTHREAD);
+ ___P(IRQ_NOAUTOEN);
- PS(IRQS_AUTODETECT);
- PS(IRQS_REPLAY);
- PS(IRQS_WAITING);
- PS(IRQS_PENDING);
+ ___PS(IRQS_AUTODETECT);
+ ___PS(IRQS_REPLAY);
+ ___PS(IRQS_WAITING);
+ ___PS(IRQS_PENDING);
- PD(IRQS_INPROGRESS);
- PD(IRQS_DISABLED);
- PD(IRQS_MASKED);
+ ___PD(IRQS_INPROGRESS);
+ ___PD(IRQS_DISABLED);
+ ___PD(IRQS_MASKED);
}
-#undef P
-#undef PS
-#undef PD
+#undef ___P
+#undef ___PS
+#undef ___PD
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index bd8e788d71e..1ef0606797c 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -73,6 +73,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
EXPORT_SYMBOL(devm_request_threaded_irq);
/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device
+ * @dev: device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @thread_fn: function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the
+ * same arguments and performs the same function as
+ * request_any_context_irq(). IRQs requested with this function will be
+ * automatically freed on driver detach.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ struct irq_devres *dr;
+ int rc;
+
+ dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
+ GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
+ if (rc) {
+ devres_free(dr);
+ return rc;
+ }
+
+ dr->irq = irq;
+ dr->dev_id = dev_id;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_any_context_irq);
+
+/**
* devm_free_irq - free an interrupt
* @dev: device to free interrupt for
* @irq: Interrupt line to free
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index b5fcd96c710..988dc58e884 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -6,6 +6,7 @@
*/
#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <linux/export.h>
#include "internals.h"
@@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = {
.irq_mask = noop,
.irq_unmask = noop,
};
+EXPORT_SYMBOL_GPL(dummy_irq_chip);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f66..452d6f2ba21 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -7,6 +7,7 @@
#include <linux/irq.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/irqdomain.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/syscore_ops.h>
@@ -16,11 +17,6 @@
static LIST_HEAD(gc_list);
static DEFINE_RAW_SPINLOCK(gc_lock);
-static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
-{
- return &container_of(d->chip, struct irq_chip_type, chip)->regs;
-}
-
/**
* irq_gc_noop - NOOP function
* @d: irq_data
@@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d)
void irq_gc_mask_disable_reg(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
- gc->mask_cache &= ~mask;
+ irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
+ *ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
/**
- * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register
+ * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
* @d: irq_data
*
* Chip has a single mask register. Values of this register are cached
@@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
void irq_gc_mask_set_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- gc->mask_cache |= mask;
- irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
+ *ct->mask_cache |= mask;
+ irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
/**
- * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register
+ * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
* @d: irq_data
*
* Chip has a single mask register. Values of this register are cached
@@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d)
void irq_gc_mask_clr_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- gc->mask_cache &= ~mask;
- irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
+ *ct->mask_cache &= ~mask;
+ irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
/**
* irq_gc_unmask_enable_reg - Unmask chip via enable register
@@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
void irq_gc_unmask_enable_reg(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
- gc->mask_cache |= mask;
+ irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
+ *ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
@@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
void irq_gc_ack_set_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
/**
* irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
@@ -122,25 +126,27 @@ void irq_gc_ack_set_bit(struct irq_data *d)
void irq_gc_ack_clr_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = ~(1 << (d->irq - gc->irq_base));
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = ~d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
irq_gc_unlock(gc);
}
/**
- * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
+ * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
* @d: irq_data
*/
void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
void irq_gc_eoi(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
+ irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
irq_gc_unlock(gc);
}
/**
* irq_gc_set_wake - Set/clr wake bit for an interrupt
- * @d: irq_data
+ * @d: irq_data
+ * @on: Indicates whether the wake bit should be set or cleared
*
* For chips where the wake from suspend functionality is not
* configured in a separate register and the wakeup active state is
@@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d)
int irq_gc_set_wake(struct irq_data *d, unsigned int on)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- u32 mask = 1 << (d->irq - gc->irq_base);
+ u32 mask = d->mask;
if (!(mask & gc->wake_enabled))
return -EINVAL;
@@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
return 0;
}
+static void
+irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+ int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler)
+{
+ raw_spin_lock_init(&gc->lock);
+ gc->num_ct = num_ct;
+ gc->irq_base = irq_base;
+ gc->reg_base = reg_base;
+ gc->chip_types->chip.name = name;
+ gc->chip_types->handler = handler;
+}
+
/**
* irq_alloc_generic_chip - Allocate a generic chip and initialize it
* @name: Name of the irq chip
@@ -203,23 +224,183 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
gc = kzalloc(sz, GFP_KERNEL);
if (gc) {
- raw_spin_lock_init(&gc->lock);
- gc->num_ct = num_ct;
- gc->irq_base = irq_base;
- gc->reg_base = reg_base;
- gc->chip_types->chip.name = name;
- gc->chip_types->handler = handler;
+ irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
+ handler);
}
return gc;
}
EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
+static void
+irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
+{
+ struct irq_chip_type *ct = gc->chip_types;
+ u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
+ int i;
+
+ for (i = 0; i < gc->num_ct; i++) {
+ if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
+ mskptr = &ct[i].mask_cache_priv;
+ mskreg = ct[i].regs.mask;
+ }
+ ct[i].mask_cache = mskptr;
+ if (flags & IRQ_GC_INIT_MASK_CACHE)
+ *mskptr = irq_reg_readl(gc->reg_base + mskreg);
+ }
+}
+
+/**
+ * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @irqs_per_chip: Number of interrupts each chip handles
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @name: Name of the irq chip
+ * @handler: Default flow handler associated with these chips
+ * @clr: IRQ_* bits to clear in the mapping function
+ * @set: IRQ_* bits to set in the mapping function
+ * @gcflags: Generic chip specific setup flags
+ */
+int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
+{
+ struct irq_domain_chip_generic *dgc;
+ struct irq_chip_generic *gc;
+ int numchips, sz, i;
+ unsigned long flags;
+ void *tmp;
+
+ if (d->gc)
+ return -EBUSY;
+
+ numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
+ if (!numchips)
+ return -EINVAL;
+
+ /* Allocate a pointer, generic chip and chiptypes for each chip */
+ sz = sizeof(*dgc) + numchips * sizeof(gc);
+ sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+
+ tmp = dgc = kzalloc(sz, GFP_KERNEL);
+ if (!dgc)
+ return -ENOMEM;
+ dgc->irqs_per_chip = irqs_per_chip;
+ dgc->num_chips = numchips;
+ dgc->irq_flags_to_set = set;
+ dgc->irq_flags_to_clear = clr;
+ dgc->gc_flags = gcflags;
+ d->gc = dgc;
+
+ /* Calc pointer to the first generic chip */
+ tmp += sizeof(*dgc) + numchips * sizeof(gc);
+ for (i = 0; i < numchips; i++) {
+ /* Store the pointer to the generic chip */
+ dgc->gc[i] = gc = tmp;
+ irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
+ NULL, handler);
+ gc->domain = d;
+ raw_spin_lock_irqsave(&gc_lock, flags);
+ list_add_tail(&gc->list, &gc_list);
+ raw_spin_unlock_irqrestore(&gc_lock, flags);
+ /* Calc pointer to the next generic chip */
+ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ }
+ d->name = name;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+
+/**
+ * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
+ * @d: irq domain pointer
+ * @hw_irq: Hardware interrupt number
+ */
+struct irq_chip_generic *
+irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ int idx;
+
+ if (!dgc)
+ return NULL;
+ idx = hw_irq / dgc->irqs_per_chip;
+ if (idx >= dgc->num_chips)
+ return NULL;
+ return dgc->gc[idx];
+}
+EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
+
/*
* Separate lockdep class for interrupt chip which can nest irq_desc
* lock.
*/
static struct lock_class_key irq_nested_lock_class;
+/*
+ * irq_map_generic_chip - Map a generic chip for an irq domain
+ */
+static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
+ irq_hw_number_t hw_irq)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ struct irq_chip_generic *gc;
+ struct irq_chip_type *ct;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int idx;
+
+ if (!d->gc)
+ return -ENODEV;
+
+ idx = hw_irq / dgc->irqs_per_chip;
+ if (idx >= dgc->num_chips)
+ return -EINVAL;
+ gc = dgc->gc[idx];
+
+ idx = hw_irq % dgc->irqs_per_chip;
+
+ if (test_bit(idx, &gc->unused))
+ return -ENOTSUPP;
+
+ if (test_bit(idx, &gc->installed))
+ return -EBUSY;
+
+ ct = gc->chip_types;
+ chip = &ct->chip;
+
+ /* We only init the cache for the first mapping of a generic chip */
+ if (!gc->installed) {
+ raw_spin_lock_irqsave(&gc->lock, flags);
+ irq_gc_init_mask_cache(gc, dgc->gc_flags);
+ raw_spin_unlock_irqrestore(&gc->lock, flags);
+ }
+
+ /* Mark the interrupt as installed */
+ set_bit(idx, &gc->installed);
+
+ if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
+ irq_set_lockdep_class(virq, &irq_nested_lock_class);
+
+ if (chip->irq_calc_mask)
+ chip->irq_calc_mask(data);
+ else
+ data->mask = 1 << idx;
+
+ irq_set_chip_and_handler(virq, chip, ct->handler);
+ irq_set_chip_data(virq, gc);
+ irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
+ return 0;
+}
+
+struct irq_domain_ops irq_generic_chip_ops = {
+ .map = irq_map_generic_chip,
+ .xlate = irq_domain_xlate_onetwocell,
+};
+EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
+
/**
* irq_setup_generic_chip - Setup a range of interrupts with a generic chip
* @gc: Generic irq chip holding all data
@@ -237,15 +418,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int set)
{
struct irq_chip_type *ct = gc->chip_types;
+ struct irq_chip *chip = &ct->chip;
unsigned int i;
raw_spin_lock(&gc_lock);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock(&gc_lock);
- /* Init mask cache ? */
- if (flags & IRQ_GC_INIT_MASK_CACHE)
- gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
+ irq_gc_init_mask_cache(gc, flags);
for (i = gc->irq_base; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
@@ -254,7 +434,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
if (flags & IRQ_GC_INIT_NESTED_LOCK)
irq_set_lockdep_class(i, &irq_nested_lock_class);
- irq_set_chip_and_handler(i, &ct->chip, ct->handler);
+ if (!(flags & IRQ_GC_NO_MASK)) {
+ struct irq_data *d = irq_get_irq_data(i);
+
+ if (chip->irq_calc_mask)
+ chip->irq_calc_mask(d);
+ else
+ d->mask = 1 << (i - gc->irq_base);
+ }
+ irq_set_chip_and_handler(i, chip, ct->handler);
irq_set_chip_data(i, gc);
irq_modify_status(i, clr, set);
}
@@ -265,7 +453,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
/**
* irq_setup_alt_chip - Switch to alternative chip
* @d: irq_data for this interrupt
- * @type Flow type to be initialized
+ * @type: Flow type to be initialized
*
* Only to be called from chip->irq_set_type() callbacks.
*/
@@ -317,6 +505,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
+static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
+{
+ unsigned int virq;
+
+ if (!gc->domain)
+ return irq_get_irq_data(gc->irq_base);
+
+ /*
+ * We don't know which of the irqs has been actually
+ * installed. Use the first one.
+ */
+ if (!gc->installed)
+ return NULL;
+
+ virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
+ return virq ? irq_get_irq_data(virq) : NULL;
+}
+
#ifdef CONFIG_PM
static int irq_gc_suspend(void)
{
@@ -325,8 +531,12 @@ static int irq_gc_suspend(void)
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
- if (ct->chip.irq_suspend)
- ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
+ if (ct->chip.irq_suspend) {
+ struct irq_data *data = irq_gc_get_irq_data(gc);
+
+ if (data)
+ ct->chip.irq_suspend(data);
+ }
}
return 0;
}
@@ -338,8 +548,12 @@ static void irq_gc_resume(void)
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
- if (ct->chip.irq_resume)
- ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
+ if (ct->chip.irq_resume) {
+ struct irq_data *data = irq_gc_get_irq_data(gc);
+
+ if (data)
+ ct->chip.irq_resume(data);
+ }
}
}
#else
@@ -354,8 +568,12 @@ static void irq_gc_shutdown(void)
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
- if (ct->chip.irq_pm_shutdown)
- ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
+ if (ct->chip.irq_pm_shutdown) {
+ struct irq_data *data = irq_gc_get_irq_data(gc);
+
+ if (data)
+ ct->chip.irq_pm_shutdown(data);
+ }
}
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c82bb..63548027085 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id)
{
return IRQ_NONE;
}
+EXPORT_SYMBOL_GPL(no_action);
static void warn_no_thread(unsigned int irq, struct irqaction *action)
{
@@ -51,17 +52,21 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
-static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
+void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
{
/*
- * Wake up the handler thread for this action. In case the
- * thread crashed and was killed we just pretend that we
- * handled the interrupt. The hardirq handler has disabled the
- * device interrupt, so no irq storm is lurking. If the
+ * In case the thread crashed and was killed we just pretend that
+ * we handled the interrupt. The hardirq handler has disabled the
+ * device interrupt, so no irq storm is lurking.
+ */
+ if (action->thread->flags & PF_EXITING)
+ return;
+
+ /*
+ * Wake up the handler thread for this action. If the
* RUNTHREAD bit is already set, nothing to do.
*/
- if (test_bit(IRQTF_DIED, &action->thread_flags) ||
- test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
return;
/*
@@ -110,6 +115,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
* threads_oneshot untouched and runs the thread another time.
*/
desc->threads_oneshot |= action->thread_mask;
+
+ /*
+ * We increment the threads_active counter in case we wake up
+ * the irq thread. The irq thread decrements the counter when
+ * it returns from the handler or in the exit path and wakes
+ * up waiters which are stuck in synchronize_irq() when the
+ * active count becomes zero. synchronize_irq() is serialized
+ * against this code (hard irq handler) via IRQS_INPROGRESS
+ * like the finalize_oneshot() code. See comment above.
+ */
+ atomic_inc(&desc->threads_active);
+
wake_up_process(action->thread);
}
@@ -117,7 +134,7 @@ irqreturn_t
handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t retval = IRQ_NONE;
- unsigned int random = 0, irq = desc->irq_data.irq;
+ unsigned int flags = 0, irq = desc->irq_data.irq;
do {
irqreturn_t res;
@@ -141,11 +158,11 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
break;
}
- irq_wake_thread(desc, action);
+ __irq_wake_thread(desc, action);
/* Fall through to add to randomness */
case IRQ_HANDLED:
- random |= action->flags;
+ flags |= action->flags;
break;
default:
@@ -156,8 +173,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
action = action->next;
} while (action);
- if (random & IRQF_SAMPLE_RANDOM)
- add_interrupt_randomness(irq);
+ add_interrupt_randomness(irq, flags);
if (!noirqdebug)
note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b7952316016..099ea2e0eb8 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -6,6 +6,7 @@
* of this file for your non core code.
*/
#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
#ifdef CONFIG_SPARSE_IRQ
# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -20,21 +21,19 @@ extern bool noirqdebug;
/*
* Bits used by threaded handlers:
* IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
- * IRQTF_DIED - handler thread died
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
*/
enum {
IRQTF_RUNTHREAD,
- IRQTF_DIED,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
};
/*
- * Bit masks for desc->state
+ * Bit masks for desc->core_internal_state__do_not_mess_with_it
*
* IRQS_AUTODETECT - autodetection in progress
* IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
@@ -67,7 +66,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
-extern int irq_startup(struct irq_desc *desc);
+extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
@@ -75,6 +74,13 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
+extern void unmask_threaded_irq(struct irq_desc *desc);
+
+#ifdef CONFIG_SPARSE_IRQ
+static inline void irq_mark_irq(unsigned int irq) { }
+#else
+extern void irq_mark_irq(unsigned int irq);
+#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -84,6 +90,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq);
bool irq_wait_for_poll(struct irq_desc *desc);
+void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -103,6 +110,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+extern int irq_do_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force);
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -178,3 +188,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
return d->state_use_accessors & mask;
}
+
+static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+{
+ __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(kstat.irqs_sum);
+}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95e..1487a123db5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
+EXPORT_SYMBOL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
@@ -273,10 +274,16 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
+EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
- dynamic_irq_cleanup(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc_set_defaults(irq, desc, desc_node(desc), NULL);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -297,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)
return -ENOMEM;
}
+void irq_mark_irq(unsigned int irq)
+{
+ mutex_lock(&sparse_irq_lock);
+ bitmap_set(allocated_irqs, irq, 1);
+ mutex_unlock(&sparse_irq_lock);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_LEGACY
+void irq_init_desc(unsigned int irq)
+{
+ free_desc(irq);
+}
+#endif
+
#endif /* !CONFIG_SPARSE_IRQ */
/**
@@ -361,6 +382,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
if (from > irq)
return -EINVAL;
from = irq;
+ } else {
+ /*
+ * For interrupts which are freely allocated the
+ * architecture can force a lower bound to the @from
+ * argument. x86 uses this to exclude the GSI space.
+ */
+ from = arch_dynirq_lower_bound(from);
}
mutex_lock(&sparse_irq_lock);
@@ -387,30 +415,56 @@ err:
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
+#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
/**
- * irq_reserve_irqs - mark irqs allocated
- * @from: mark from irq number
- * @cnt: number of irqs to mark
+ * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
+ * @cnt: number of interrupts to allocate
+ * @node: node on which to allocate
*
- * Returns 0 on success or an appropriate error code
+ * Returns an interrupt number > 0 or 0, if the allocation fails.
*/
-int irq_reserve_irqs(unsigned int from, unsigned int cnt)
+unsigned int irq_alloc_hwirqs(int cnt, int node)
{
- unsigned int start;
- int ret = 0;
+ int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
- if (!cnt || (from + cnt) > nr_irqs)
- return -EINVAL;
+ if (irq < 0)
+ return 0;
- mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
- if (start == from)
- bitmap_set(allocated_irqs, start, cnt);
- else
- ret = -EEXIST;
- mutex_unlock(&sparse_irq_lock);
- return ret;
+ for (i = irq; cnt > 0; i++, cnt--) {
+ if (arch_setup_hwirq(i, node))
+ goto err;
+ irq_clear_status_flags(i, _IRQ_NOREQUEST);
+ }
+ return irq;
+
+err:
+ for (i--; i >= irq; i--) {
+ irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
+ arch_teardown_hwirq(i);
+ }
+ irq_free_descs(irq, cnt);
+ return 0;
}
+EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
+
+/**
+ * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
+ * @from: Free from irq number
+ * @cnt: number of interrupts to free
+ *
+ */
+void irq_free_hwirqs(unsigned int from, int cnt)
+{
+ int i, j;
+
+ for (i = from, j = cnt; j > 0; i++, j--) {
+ irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
+ arch_teardown_hwirq(i);
+ }
+ irq_free_descs(from, cnt);
+}
+EXPORT_SYMBOL_GPL(irq_free_hwirqs);
+#endif
/**
* irq_get_next_irq - get next allocated irq number
@@ -473,18 +527,9 @@ int irq_set_percpu_devid(unsigned int irq)
return 0;
}
-/**
- * dynamic_irq_cleanup - cleanup a dynamically allocated irq
- * @irq: irq number to initialize
- */
-void dynamic_irq_cleanup(unsigned int irq)
+void kstat_incr_irq_this_cpu(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, desc_node(desc), NULL);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
}
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b6..eb5e10e32e0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,711 @@
+#define pr_fmt(fmt) "irq: " fmt
+
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/topology.h>
+#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
+static DEFINE_MUTEX(revmap_trees_mutex);
+static struct irq_domain *irq_default_domain;
+
/**
- * irq_domain_add() - Register an irq_domain
- * @domain: ptr to initialized irq_domain structure
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @of_node: optional device-tree node of the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: map/unmap domain callbacks
+ * @host_data: Controller private data pointer
*
- * Registers an irq_domain structure. The irq_domain must at a minimum be
- * initialized with an ops structure pointer, and either a ->to_irq hook or
- * a valid irq_base value. Everything else is optional.
+ * Allocates and initialize and irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
*/
-void irq_domain_add(struct irq_domain *domain)
+struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
- struct irq_data *d;
- int hwirq, irq;
+ struct irq_domain *domain;
- /*
- * This assumes that the irq_domain owner has already allocated
- * the irq_descs. This block will be removed when support for dynamic
- * allocation of irq_descs is added to irq_domain.
- */
- irq_domain_for_each_irq(domain, hwirq, irq) {
- d = irq_get_irq_data(irq);
- if (!d) {
- WARN(1, "error: assigning domain to non existant irq_desc");
- return;
- }
- if (d->domain) {
- /* things are broken; just report, don't clean up */
- WARN(1, "error: irq_desc already assigned to a domain");
- return;
- }
- d->domain = domain;
- d->hwirq = hwirq;
- }
+ domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
+ GFP_KERNEL, of_node_to_nid(of_node));
+ if (WARN_ON(!domain))
+ return NULL;
+
+ /* Fill structure */
+ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
+ domain->ops = ops;
+ domain->host_data = host_data;
+ domain->of_node = of_node_get(of_node);
+ domain->hwirq_max = hwirq_max;
+ domain->revmap_size = size;
+ domain->revmap_direct_max_irq = direct_max;
mutex_lock(&irq_domain_mutex);
- list_add(&domain->list, &irq_domain_list);
+ list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
+
+ pr_debug("Added domain %s\n", domain->name);
+ return domain;
}
+EXPORT_SYMBOL_GPL(__irq_domain_add);
/**
- * irq_domain_del() - Unregister an irq_domain
- * @domain: ptr to registered irq_domain.
+ * irq_domain_remove() - Remove an irq domain.
+ * @domain: domain to remove
+ *
+ * This routine is used to remove an irq domain. The caller must ensure
+ * that all mappings within the domain have been disposed of prior to
+ * use, depending on the revmap type.
*/
-void irq_domain_del(struct irq_domain *domain)
+void irq_domain_remove(struct irq_domain *domain)
{
- struct irq_data *d;
- int hwirq, irq;
-
mutex_lock(&irq_domain_mutex);
- list_del(&domain->list);
+
+ /*
+ * radix_tree_delete() takes care of destroying the root
+ * node when all entries are removed. Shout if there are
+ * any mappings left.
+ */
+ WARN_ON(domain->revmap_tree.height);
+
+ list_del(&domain->link);
+
+ /*
+ * If the going away domain is the default one, reset it.
+ */
+ if (unlikely(irq_default_domain == domain))
+ irq_set_default_host(NULL);
+
mutex_unlock(&irq_domain_mutex);
- /* Clear the irq_domain assignments */
- irq_domain_for_each_irq(domain, hwirq, irq) {
- d = irq_get_irq_data(irq);
- d->domain = NULL;
- }
+ pr_debug("Removed domain %s\n", domain->name);
+
+ of_node_put(domain->of_node);
+ kfree(domain);
}
+EXPORT_SYMBOL_GPL(irq_domain_remove);
-#if defined(CONFIG_OF_IRQ)
/**
- * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
+ * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
+ * @of_node: pointer to interrupt controller's device tree node.
+ * @size: total number of irqs in mapping
+ * @first_irq: first number of irq block assigned to the domain,
+ * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
+ * pre-map all of the irqs in the domain to virqs starting at first_irq.
+ * @ops: map/unmap domain callbacks
+ * @host_data: Controller private data pointer
*
- * Used by the device tree interrupt mapping code to translate a device tree
- * interrupt specifier to a valid linux irq number. Returns either a valid
- * linux IRQ number or 0.
+ * Allocates an irq_domain, and optionally if first_irq is positive then also
+ * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
*
- * When the caller no longer need the irq number returned by this function it
- * should arrange to call irq_dispose_mapping().
+ * This is intended to implement the expected behaviour for most
+ * interrupt controllers. If device tree is used, then first_irq will be 0 and
+ * irqs get mapped dynamically on the fly. However, if the controller requires
+ * static virq assignments (non-DT boot) then it will set that up correctly.
*/
-unsigned int irq_create_of_mapping(struct device_node *controller,
- const u32 *intspec, unsigned int intsize)
+struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
+ unsigned int size,
+ unsigned int first_irq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irq_domain *domain;
- unsigned long hwirq;
- unsigned int irq, type;
- int rc = -EINVAL;
- /* Find a domain which can translate the irq spec */
+ domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
+ if (!domain)
+ return NULL;
+
+ if (first_irq > 0) {
+ if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
+ /* attempt to allocated irq_descs */
+ int rc = irq_alloc_descs(first_irq, first_irq, size,
+ of_node_to_nid(of_node));
+ if (rc < 0)
+ pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ first_irq);
+ }
+ irq_domain_associate_many(domain, first_irq, 0, size);
+ }
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+
+/**
+ * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
+ * @of_node: pointer to interrupt controller's device tree node.
+ * @size: total number of irqs in legacy mapping
+ * @first_irq: first number of irq block assigned to the domain
+ * @first_hwirq: first hwirq number to use for the translation. Should normally
+ * be '0', but a positive integer can be used if the effective
+ * hwirqs numbering does not begin at zero.
+ * @ops: map/unmap domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Note: the map() callback will be called before this function returns
+ * for all legacy interrupts except 0 (which is always the invalid irq for
+ * a legacy controller).
+ */
+struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
+ unsigned int size,
+ unsigned int first_irq,
+ irq_hw_number_t first_hwirq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_add(of_node, first_hwirq + size,
+ first_hwirq + size, 0, ops, host_data);
+ if (!domain)
+ return NULL;
+
+ irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+
+/**
+ * irq_find_host() - Locates a domain for a given device node
+ * @node: device-tree node of the interrupt controller
+ */
+struct irq_domain *irq_find_host(struct device_node *node)
+{
+ struct irq_domain *h, *found = NULL;
+ int rc;
+
+ /* We might want to match the legacy controller last since
+ * it might potentially be set to match all interrupts in
+ * the absence of a device node. This isn't a problem so far
+ * yet though...
+ */
mutex_lock(&irq_domain_mutex);
- list_for_each_entry(domain, &irq_domain_list, list) {
- if (!domain->ops->dt_translate)
- continue;
- rc = domain->ops->dt_translate(domain, controller,
- intspec, intsize, &hwirq, &type);
- if (rc == 0)
+ list_for_each_entry(h, &irq_domain_list, link) {
+ if (h->ops->match)
+ rc = h->ops->match(h, node);
+ else
+ rc = (h->of_node != NULL) && (h->of_node == node);
+
+ if (rc) {
+ found = h;
break;
+ }
}
mutex_unlock(&irq_domain_mutex);
-
- if (rc != 0)
- return 0;
-
- irq = irq_domain_to_irq(domain, hwirq);
- if (type != IRQ_TYPE_NONE)
- irq_set_irq_type(irq, type);
- pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
- controller->full_name, (int)hwirq, irq, type);
- return irq;
+ return found;
}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+EXPORT_SYMBOL_GPL(irq_find_host);
/**
- * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
- * @irq: linux irq number to be discarded
+ * irq_set_default_host() - Set a "default" irq domain
+ * @domain: default domain pointer
*
- * Calling this function indicates the caller no longer needs a reference to
- * the linux irq number returned by a prior call to irq_create_of_mapping().
+ * For convenience, it's possible to set a "default" domain that will be used
+ * whenever NULL is passed to irq_create_mapping(). It makes life easier for
+ * platforms that want to manipulate a few hard coded interrupt numbers that
+ * aren't properly represented in the device-tree.
*/
-void irq_dispose_mapping(unsigned int irq)
+void irq_set_default_host(struct irq_domain *domain)
{
- /*
- * nothing yet; will be filled when support for dynamic allocation of
- * irq_descs is added to irq_domain
- */
+ pr_debug("Default domain set to @0x%p\n", domain);
+
+ irq_default_domain = domain;
}
-EXPORT_SYMBOL_GPL(irq_dispose_mapping);
+EXPORT_SYMBOL_GPL(irq_set_default_host);
-int irq_domain_simple_dt_translate(struct irq_domain *d,
- struct device_node *controller,
- const u32 *intspec, unsigned int intsize,
- unsigned long *out_hwirq, unsigned int *out_type)
+static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
- if (d->of_node != controller)
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ irq_hw_number_t hwirq;
+
+ if (WARN(!irq_data || irq_data->domain != domain,
+ "virq%i doesn't exist; cannot disassociate\n", irq))
+ return;
+
+ hwirq = irq_data->hwirq;
+ irq_set_status_flags(irq, IRQ_NOREQUEST);
+
+ /* remove chip and handler */
+ irq_set_chip_and_handler(irq, NULL, NULL);
+
+ /* Make sure it's completed */
+ synchronize_irq(irq);
+
+ /* Tell the PIC about it */
+ if (domain->ops->unmap)
+ domain->ops->unmap(domain, irq);
+ smp_mb();
+
+ irq_data->domain = NULL;
+ irq_data->hwirq = 0;
+
+ /* Clear reverse map for this hwirq */
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ int ret;
+
+ if (WARN(hwirq >= domain->hwirq_max,
+ "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
return -EINVAL;
- if (intsize < 1)
+ if (WARN(!irq_data, "error: virq%i is not allocated", virq))
return -EINVAL;
- if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
- (intspec[0] >= d->hwirq_base + d->nr_irq)))
+ if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- *out_hwirq = intspec[0];
- *out_type = IRQ_TYPE_NONE;
- if (intsize > 1)
- *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&irq_domain_mutex);
+ irq_data->hwirq = hwirq;
+ irq_data->domain = domain;
+ if (domain->ops->map) {
+ ret = domain->ops->map(domain, virq, hwirq);
+ if (ret != 0) {
+ /*
+ * If map() returns -EPERM, this interrupt is protected
+ * by the firmware or some other service and shall not
+ * be mapped. Don't bother telling the user about it.
+ */
+ if (ret != -EPERM) {
+ pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
+ domain->name, hwirq, virq, ret);
+ }
+ irq_data->domain = NULL;
+ irq_data->hwirq = 0;
+ mutex_unlock(&irq_domain_mutex);
+ return ret;
+ }
+
+ /* If not already assigned, give the domain the chip's name */
+ if (!domain->name && irq_data->chip)
+ domain->name = irq_data->chip->name;
+ }
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = virq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+ mutex_unlock(&irq_domain_mutex);
+
+ irq_clear_status_flags(virq, IRQ_NOREQUEST);
+
return 0;
}
+EXPORT_SYMBOL_GPL(irq_domain_associate);
+
+void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
+ irq_hw_number_t hwirq_base, int count)
+{
+ int i;
+
+ pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
+ of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
+
+ for (i = 0; i < count; i++) {
+ irq_domain_associate(domain, irq_base + i, hwirq_base + i);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_domain_associate_many);
/**
- * irq_domain_create_simple() - Set up a 'simple' translation range
+ * irq_create_direct_mapping() - Allocate an irq for direct mapping
+ * @domain: domain to allocate the irq for or NULL for default domain
+ *
+ * This routine is used for irq controllers which can choose the hardware
+ * interrupt numbers they generate. In such a case it's simplest to use
+ * the linux irq as the hardware interrupt number. It still uses the linear
+ * or radix tree to store the mapping, but the irq controller can optimize
+ * the revmap path by using the hwirq directly.
*/
-void irq_domain_add_simple(struct device_node *controller, int irq_base)
+unsigned int irq_create_direct_mapping(struct irq_domain *domain)
+{
+ unsigned int virq;
+
+ if (domain == NULL)
+ domain = irq_default_domain;
+
+ virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ if (!virq) {
+ pr_debug("create_direct virq allocation failed\n");
+ return 0;
+ }
+ if (virq >= domain->revmap_direct_max_irq) {
+ pr_err("ERROR: no free irqs available below %i maximum\n",
+ domain->revmap_direct_max_irq);
+ irq_free_desc(virq);
+ return 0;
+ }
+ pr_debug("create_direct obtained virq %d\n", virq);
+
+ if (irq_domain_associate(domain, virq, virq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+
+/**
+ * irq_create_mapping() - Map a hardware interrupt into linux irq space
+ * @domain: domain owning this hardware interrupt or NULL for default domain
+ * @hwirq: hardware irq number in that domain space
+ *
+ * Only one mapping per hardware interrupt is permitted. Returns a linux
+ * irq number.
+ * If the sense/trigger is to be specified, set_irq_type() should be called
+ * on the number returned from that call.
+ */
+unsigned int irq_create_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ unsigned int hint;
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Look for default domain if nececssary */
+ if (domain == NULL)
+ domain = irq_default_domain;
+ if (domain == NULL) {
+ WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
+ return 0;
+ }
+ pr_debug("-> using domain @%p\n", domain);
+
+ /* Check if mapping already exists */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
+ pr_debug("-> existing mapping on virq %d\n", virq);
+ return virq;
+ }
+
+ /* Allocate a virtual interrupt number */
+ hint = hwirq % nr_irqs;
+ if (hint == 0)
+ hint++;
+ virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
+ if (virq <= 0)
+ virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(domain->of_node), virq);
+
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_mapping);
+
+/**
+ * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
+ * @domain: domain owning the interrupt range
+ * @irq_base: beginning of linux IRQ range
+ * @hwirq_base: beginning of hardware IRQ range
+ * @count: Number of interrupts to map
+ *
+ * This routine is used for allocating and mapping a range of hardware
+ * irqs to linux irqs where the linux irq numbers are at pre-defined
+ * locations. For use by controllers that already have static mappings
+ * to insert in to the domain.
+ *
+ * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
+ * domain insertion.
+ *
+ * 0 is returned upon success, while any failure to establish a static
+ * mapping is treated as an error.
+ */
+int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
+ irq_hw_number_t hwirq_base, int count)
+{
+ int ret;
+
+ ret = irq_alloc_descs(irq_base, irq_base, count,
+ of_node_to_nid(domain->of_node));
+ if (unlikely(ret < 0))
+ return ret;
+
+ irq_domain_associate_many(domain, irq_base, hwirq_base, count);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
+
+unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
{
struct irq_domain *domain;
+ irq_hw_number_t hwirq;
+ unsigned int type = IRQ_TYPE_NONE;
+ unsigned int virq;
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
if (!domain) {
- WARN_ON(1);
+ pr_warn("no irq domain found for %s !\n",
+ of_node_full_name(irq_data->np));
+ return 0;
+ }
+
+ /* If domain has no translation, then we assume interrupt line */
+ if (domain->ops->xlate == NULL)
+ hwirq = irq_data->args[0];
+ else {
+ if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
+ irq_data->args_count, &hwirq, &type))
+ return 0;
+ }
+
+ /* Create mapping */
+ virq = irq_create_mapping(domain, hwirq);
+ if (!virq)
+ return virq;
+
+ /* Set type if specified and different than the current one */
+ if (type != IRQ_TYPE_NONE &&
+ type != irq_get_trigger_type(virq))
+ irq_set_irq_type(virq, type);
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+
+/**
+ * irq_dispose_mapping() - Unmap an interrupt
+ * @virq: linux irq number of the interrupt to unmap
+ */
+void irq_dispose_mapping(unsigned int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
+
+ if (!virq || !irq_data)
+ return;
+
+ domain = irq_data->domain;
+ if (WARN_ON(domain == NULL))
return;
+
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+}
+EXPORT_SYMBOL_GPL(irq_dispose_mapping);
+
+/**
+ * irq_find_mapping() - Find a linux irq from an hw irq number.
+ * @domain: domain owning this hardware interrupt
+ * @hwirq: hardware irq number in that domain space
+ */
+unsigned int irq_find_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ struct irq_data *data;
+
+ /* Look for default domain if nececssary */
+ if (domain == NULL)
+ domain = irq_default_domain;
+ if (domain == NULL)
+ return 0;
+
+ if (hwirq < domain->revmap_direct_max_irq) {
+ data = irq_get_irq_data(hwirq);
+ if (data && (data->domain == domain) && (data->hwirq == hwirq))
+ return hwirq;
}
- domain->irq_base = irq_base;
- domain->of_node = of_node_get(controller);
- domain->ops = &irq_domain_simple_ops;
- irq_domain_add(domain);
+ /* Check if the hwirq is in the linear revmap. */
+ if (hwirq < domain->revmap_size)
+ return domain->linear_revmap[hwirq];
+
+ rcu_read_lock();
+ data = radix_tree_lookup(&domain->revmap_tree, hwirq);
+ rcu_read_unlock();
+ return data ? data->irq : 0;
}
-EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+EXPORT_SYMBOL_GPL(irq_find_mapping);
+
+#ifdef CONFIG_IRQ_DOMAIN_DEBUG
+static int virq_debug_show(struct seq_file *m, void *private)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+ struct irq_domain *domain;
+ struct radix_tree_iter iter;
+ void *data, **slot;
+ int i;
+
+ seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
+ "name", "mapped", "linear-max", "direct-max", "devtree-node");
+ mutex_lock(&irq_domain_mutex);
+ list_for_each_entry(domain, &irq_domain_list, link) {
+ int count = 0;
+ radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
+ count++;
+ seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
+ domain == irq_default_domain ? '*' : ' ', domain->name,
+ domain->revmap_size + count, domain->revmap_size,
+ domain->revmap_direct_max_irq,
+ domain->of_node ? of_node_full_name(domain->of_node) : "");
+ }
+ mutex_unlock(&irq_domain_mutex);
+
+ seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
+ "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
+ "active", "type", "domain");
+
+ for (i = 1; i < nr_irqs; i++) {
+ desc = irq_to_desc(i);
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ domain = desc->irq_data.domain;
+
+ if (domain) {
+ struct irq_chip *chip;
+ int hwirq = desc->irq_data.hwirq;
+ bool direct;
+
+ seq_printf(m, "%5d ", i);
+ seq_printf(m, "0x%05x ", hwirq);
+
+ chip = irq_desc_get_chip(desc);
+ seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
+
+ data = irq_desc_get_chip_data(desc);
+ seq_printf(m, data ? "0x%p " : " %p ", data);
+
+ seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
+ direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
+ seq_printf(m, "%6s%-8s ",
+ (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
+ direct ? "(DIRECT)" : "");
+ seq_printf(m, "%s\n", desc->irq_data.domain->name);
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+
+ return 0;
+}
+
+static int virq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, virq_debug_show, inode->i_private);
+}
+
+static const struct file_operations virq_debug_fops = {
+ .open = virq_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init irq_debugfs_init(void)
+{
+ if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
+ NULL, &virq_debug_fops) == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+__initcall(irq_debugfs_init);
+#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
+
+/**
+ * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
+ *
+ * Device Tree IRQ specifier translation function which works with one cell
+ * bindings where the cell value maps directly to the hwirq number.
+ */
+int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (WARN_ON(intsize < 1))
+ return -EINVAL;
+ *out_hwirq = intspec[0];
+ *out_type = IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
+
+/**
+ * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_type)
+{
+ if (WARN_ON(intsize < 2))
+ return -EINVAL;
+ *out_hwirq = intspec[0];
+ *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
+
+/**
+ * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
+ *
+ * Device Tree IRQ specifier translation function which works with either one
+ * or two cell bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ *
+ * Note: don't use this function unless your interrupt controller explicitly
+ * supports both one and two cell bindings. For the majority of controllers
+ * the _onecell() or _twocell() variants above should be used.
+ */
+int irq_domain_xlate_onetwocell(struct irq_domain *d,
+ struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (WARN_ON(intsize < 1))
+ return -EINVAL;
+ *out_hwirq = intspec[0];
+ *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
-void irq_domain_generate_simple(const struct of_device_id *match,
- u64 phys_base, unsigned int irq_start)
-{
- struct device_node *node;
- pr_info("looking for phys_base=%llx, irq_start=%i\n",
- (unsigned long long) phys_base, (int) irq_start);
- node = of_find_matching_node_by_address(NULL, match, phys_base);
- if (node)
- irq_domain_add_simple(node, irq_start);
- else
- pr_info("no node found\n");
-}
-EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
-#endif /* CONFIG_OF_IRQ */
-
-struct irq_domain_ops irq_domain_simple_ops = {
-#ifdef CONFIG_OF_IRQ
- .dt_translate = irq_domain_simple_dt_translate,
-#endif /* CONFIG_OF_IRQ */
+const struct irq_domain_ops irq_domain_simple_ops = {
+ .xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a9a9dbe49fe..3dc6a61bf06 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -7,6 +7,8 @@
* This file contains driver APIs to the irq subsystem.
*/
+#define pr_fmt(fmt) "genirq: " fmt
+
#include <linux/irq.h>
#include <linux/kthread.h>
#include <linux/module.h>
@@ -14,6 +16,8 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/task_work.h>
#include "internals.h"
@@ -28,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
-/**
- * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
- *
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
- *
- * This function may be called - with care - from IRQ context.
- */
-void synchronize_irq(unsigned int irq)
+static void __synchronize_hardirq(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
bool inprogress;
- if (!desc)
- return;
-
do {
unsigned long flags;
@@ -63,12 +53,56 @@ void synchronize_irq(unsigned int irq)
/* Oops, that failed? */
} while (inprogress);
+}
- /*
- * We made sure that no hardirq handler is running. Now verify
- * that no threaded handlers are active.
- */
- wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+/**
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this
+ * function while holding a resource the IRQ handler may need you
+ * will deadlock. It does not take associated threaded handlers
+ * into account.
+ *
+ * Do not use this for shutdown scenarios where you must be sure
+ * that all parts (hardirq and threaded handler) have completed.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_hardirq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc)
+ __synchronize_hardirq(desc);
+}
+EXPORT_SYMBOL(synchronize_hardirq);
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ __synchronize_hardirq(desc);
+ /*
+ * We made sure that no hardirq handler is
+ * running. Now verify that no threaded handlers are
+ * active.
+ */
+ wait_event(desc->wait_for_threads,
+ !atomic_read(&desc->threads_active));
+ }
}
EXPORT_SYMBOL(synchronize_irq);
@@ -139,7 +173,27 @@ static inline void
irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
#endif
-int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ int ret;
+
+ ret = chip->irq_set_affinity(data, mask, force);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(data->affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
struct irq_chip *chip = irq_data_get_irq_chip(data);
struct irq_desc *desc = irq_data_to_desc(data);
@@ -149,14 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return -EINVAL;
if (irq_can_move_pcntxt(data)) {
- ret = chip->irq_set_affinity(data, mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(data->affinity, mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- ret = 0;
- }
+ ret = irq_do_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
@@ -171,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return ret;
}
-/**
- * irq_set_affinity - Set the irq affinity of a given irq
- * @irq: Interrupt to set affinity
- * @mask: cpumask
- *
- */
-int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
+int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -187,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
+ ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
@@ -280,9 +321,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
static int
setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
- struct irq_chip *chip = irq_desc_get_chip(desc);
struct cpumask *set = irq_default_affinity;
- int ret;
+ int node = desc->irq_data.node;
/* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
@@ -301,13 +341,14 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
}
cpumask_and(mask, cpu_online_mask, set);
- ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(desc->irq_data.affinity, mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
+ if (node != NUMA_NO_NODE) {
+ const struct cpumask *nodemask = cpumask_of_node(node);
+
+ /* make sure at least one of the cpus in nodemask is online */
+ if (cpumask_intersects(mask, nodemask))
+ cpumask_and(mask, mask, nodemask);
}
+ irq_do_set_affinity(&desc->irq_data, mask, false);
return 0;
}
#else
@@ -539,9 +580,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return 0;
if (irq_settings_can_request(desc)) {
- if (desc->action)
- if (irqflags & desc->action->flags & IRQF_SHARED)
- canrequest =1;
+ if (!desc->action ||
+ irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest = 1;
}
irq_put_desc_unlock(desc, flags);
return canrequest;
@@ -559,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* flow-types?
*/
pr_debug("No set_type function for IRQ %d (%s)\n", irq,
- chip ? (chip->name ? : "unknown") : "unknown");
+ chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -593,7 +634,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
ret = 0;
break;
default:
- pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
+ pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
flags, irq, chip->irq_set_type);
}
if (unmask)
@@ -601,6 +642,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return ret;
}
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+int irq_set_parent(int irq, int parent_irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+
+ if (!desc)
+ return -EINVAL;
+
+ desc->parent_irq = parent_irq;
+
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+#endif
+
/*
* Default primary interrupt handler for threaded interrupts. Is
* assigned as primary handler when request_threaded_irq is called
@@ -645,7 +702,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
* is marked MASKED.
*/
static void irq_finalize_oneshot(struct irq_desc *desc,
- struct irqaction *action, bool force)
+ struct irqaction *action)
{
if (!(desc->istate & IRQS_ONESHOT))
return;
@@ -679,14 +736,14 @@ again:
* we would clear the threads_oneshot bit of this thread which
* was just set.
*/
- if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
goto out_unlock;
desc->threads_oneshot &= ~action->thread_mask;
if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
irqd_irq_masked(&desc->irq_data))
- unmask_irq(desc);
+ unmask_threaded_irq(desc);
out_unlock:
raw_spin_unlock_irq(&desc->lock);
@@ -695,12 +752,13 @@ out_unlock:
#ifdef CONFIG_SMP
/*
- * Check whether we need to chasnge the affinity of the interrupt thread.
+ * Check whether we need to change the affinity of the interrupt thread.
*/
static void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
{
cpumask_var_t mask;
+ bool valid = true;
if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
return;
@@ -715,10 +773,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
}
raw_spin_lock_irq(&desc->lock);
- cpumask_copy(mask, desc->irq_data.affinity);
+ /*
+ * This code is triggered unconditionally. Check the affinity
+ * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
+ */
+ if (desc->irq_data.affinity)
+ cpumask_copy(mask, desc->irq_data.affinity);
+ else
+ valid = false;
raw_spin_unlock_irq(&desc->lock);
- set_cpus_allowed_ptr(current, mask);
+ if (valid)
+ set_cpus_allowed_ptr(current, mask);
free_cpumask_var(mask);
}
#else
@@ -739,13 +805,13 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
local_bh_disable();
ret = action->thread_fn(action->irq, action->dev_id);
- irq_finalize_oneshot(desc, action, false);
+ irq_finalize_oneshot(desc, action);
local_bh_enable();
return ret;
}
/*
- * Interrupts explicitely requested as threaded interupts want to be
+ * Interrupts explicitly requested as threaded interrupts want to be
* preemtible - many of them need to sleep and wait for slow busses to
* complete.
*/
@@ -755,105 +821,116 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
irqreturn_t ret;
ret = action->thread_fn(action->irq, action->dev_id);
- irq_finalize_oneshot(desc, action, false);
+ irq_finalize_oneshot(desc, action);
return ret;
}
+static void wake_threads_waitq(struct irq_desc *desc)
+{
+ if (atomic_dec_and_test(&desc->threads_active))
+ wake_up(&desc->wait_for_threads);
+}
+
+static void irq_thread_dtor(struct callback_head *unused)
+{
+ struct task_struct *tsk = current;
+ struct irq_desc *desc;
+ struct irqaction *action;
+
+ if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
+ return;
+
+ action = kthread_data(tsk);
+
+ pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+ tsk->comm, tsk->pid, action->irq);
+
+
+ desc = irq_to_desc(action->irq);
+ /*
+ * If IRQTF_RUNTHREAD is set, we need to decrement
+ * desc->threads_active and wake possible waiters.
+ */
+ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ wake_threads_waitq(desc);
+
+ /* Prevent a stale desc->threads_oneshot */
+ irq_finalize_oneshot(desc, action);
+}
+
/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
{
- static const struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
+ struct callback_head on_exit_work;
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
- int wake;
- if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
+ if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
else
handler_fn = irq_thread_fn;
- sched_setscheduler(current, SCHED_FIFO, &param);
- current->irqaction = action;
+ init_task_work(&on_exit_work, irq_thread_dtor);
+ task_work_add(current, &on_exit_work, false);
+
+ irq_thread_check_affinity(desc, action);
while (!irq_wait_for_interrupt(action)) {
+ irqreturn_t action_ret;
irq_thread_check_affinity(desc, action);
- atomic_inc(&desc->threads_active);
+ action_ret = handler_fn(desc, action);
+ if (action_ret == IRQ_HANDLED)
+ atomic_inc(&desc->threads_handled);
- raw_spin_lock_irq(&desc->lock);
- if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
- /*
- * CHECKME: We might need a dedicated
- * IRQ_THREAD_PENDING flag here, which
- * retriggers the thread in check_irq_resend()
- * but AFAICT IRQS_PENDING should be fine as it
- * retriggers the interrupt itself --- tglx
- */
- desc->istate |= IRQS_PENDING;
- raw_spin_unlock_irq(&desc->lock);
- } else {
- irqreturn_t action_ret;
-
- raw_spin_unlock_irq(&desc->lock);
- action_ret = handler_fn(desc, action);
- if (!noirqdebug)
- note_interrupt(action->irq, desc, action_ret);
- }
-
- wake = atomic_dec_and_test(&desc->threads_active);
-
- if (wake && waitqueue_active(&desc->wait_for_threads))
- wake_up(&desc->wait_for_threads);
+ wake_threads_waitq(desc);
}
- /* Prevent a stale desc->threads_oneshot */
- irq_finalize_oneshot(desc, action, true);
-
/*
- * Clear irqaction. Otherwise exit_irq_thread() would make
- * fuzz about an active irq thread going into nirvana.
+ * This is the regular exit path. __free_irq() is stopping the
+ * thread via kthread_stop() after calling
+ * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
+ * oneshot mask bit can be set. We cannot verify that as we
+ * cannot touch the oneshot mask at this point anymore as
+ * __setup_irq() might have given out currents thread_mask
+ * again.
*/
- current->irqaction = NULL;
+ task_work_cancel(current, irq_thread_dtor);
return 0;
}
-/*
- * Called from do_exit()
+/**
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
+ *
*/
-void exit_irq_thread(void)
+void irq_wake_thread(unsigned int irq, void *dev_id)
{
- struct task_struct *tsk = current;
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
- if (!tsk->irqaction)
+ if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return;
- printk(KERN_ERR
- "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
- tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
-
- desc = irq_to_desc(tsk->irqaction->irq);
-
- /*
- * Prevent a stale desc->threads_oneshot. Must be called
- * before setting the IRQTF_DIED flag.
- */
- irq_finalize_oneshot(desc, tsk->irqaction, true);
-
- /*
- * Set the THREAD DIED flag to prevent further wakeups of the
- * soon to be gone threaded handler.
- */
- set_bit(IRQTF_DIED, &tsk->irqaction->flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action; action; action = action->next) {
+ if (action->dev_id == dev_id) {
+ if (action->thread)
+ __irq_wake_thread(desc, action);
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
}
+EXPORT_SYMBOL_GPL(irq_wake_thread);
static void irq_setup_forced_threading(struct irqaction *new)
{
@@ -871,6 +948,23 @@ static void irq_setup_forced_threading(struct irqaction *new)
}
}
+static int irq_request_resources(struct irq_desc *desc)
+{
+ struct irq_data *d = &desc->irq_data;
+ struct irq_chip *c = d->chip;
+
+ return c->irq_request_resources ? c->irq_request_resources(d) : 0;
+}
+
+static void irq_release_resources(struct irq_desc *desc)
+{
+ struct irq_data *d = &desc->irq_data;
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_release_resources)
+ c->irq_release_resources(d);
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -879,7 +973,6 @@ static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
- const char *old_name = NULL;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
cpumask_var_t mask;
@@ -891,22 +984,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
return -ENOSYS;
if (!try_module_get(desc->owner))
return -ENODEV;
- /*
- * Some drivers like serial.c use request_irq() heavily,
- * so we have to be careful not to interfere with a
- * running system.
- */
- if (new->flags & IRQF_SAMPLE_RANDOM) {
- /*
- * This function might sleep, we want to call it first,
- * outside of the atomic block.
- * Yes, this might clear the entropy pool if the wrong
- * driver is attempted to be loaded, without actually
- * installing a new handler, but is this really a problem,
- * only the sysadmin is able to do this.
- */
- rand_initialize_irq(irq);
- }
/*
* Check whether the interrupt nests into another interrupt
@@ -936,6 +1013,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (new->thread_fn && !nested) {
struct task_struct *t;
+ static const struct sched_param param = {
+ .sched_priority = MAX_USER_RT_PRIO/2,
+ };
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
new->name);
@@ -943,6 +1023,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = PTR_ERR(t);
goto out_mput;
}
+
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
@@ -950,6 +1033,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
get_task_struct(t);
new->thread = t;
+ /*
+ * Tell the thread to set its affinity. This is
+ * important for shared interrupt handlers as we do
+ * not invoke setup_affinity() for the secondary
+ * handlers as everything is already set up. Even for
+ * interrupts marked with IRQF_NO_BALANCE this is
+ * correct as we want the thread to move to the cpu(s)
+ * on which the requesting code placed the interrupt.
+ */
+ set_bit(IRQTF_AFFINITY, &new->thread_flags);
}
if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -958,6 +1051,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/*
+ * Drivers are often written to work w/o knowledge about the
+ * underlying irq chip implementation, so a request for a
+ * threaded irq without a primary hard irq context handler
+ * requires the ONESHOT flag to be set. Some irq chips like
+ * MSI based interrupts are per se one shot safe. Check the
+ * chip flags, so we can avoid the unmask dance at the end of
+ * the threaded handler for those.
+ */
+ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
+ new->flags &= ~IRQF_ONESHOT;
+
+ /*
* The following block of code has to be executed atomically
*/
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -973,10 +1078,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
- old_name = old->name;
+ ((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;
- }
/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
@@ -985,6 +1088,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
/* add new interrupt at end of irq queue */
do {
+ /*
+ * Or all existing action->thread_mask bits,
+ * so we can find the next zero bit for this
+ * new action.
+ */
thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
@@ -993,16 +1101,72 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/*
- * Setup the thread mask for this irqaction. Unlikely to have
- * 32 resp 64 irqs sharing one line, but who knows.
+ * Setup the thread mask for this irqaction for ONESHOT. For
+ * !ONESHOT irqs the thread mask is 0 so we can avoid a
+ * conditional in irq_wake_thread().
*/
- if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
- ret = -EBUSY;
+ if (new->flags & IRQF_ONESHOT) {
+ /*
+ * Unlikely to have 32 resp 64 irqs sharing one line,
+ * but who knows.
+ */
+ if (thread_mask == ~0UL) {
+ ret = -EBUSY;
+ goto out_mask;
+ }
+ /*
+ * The thread_mask for the action is or'ed to
+ * desc->thread_active to indicate that the
+ * IRQF_ONESHOT thread handler has been woken, but not
+ * yet finished. The bit is cleared when a thread
+ * completes. When all threads of a shared interrupt
+ * line have completed desc->threads_active becomes
+ * zero and the interrupt line is unmasked. See
+ * handle.c:irq_wake_thread() for further information.
+ *
+ * If no thread is woken by primary (hard irq context)
+ * interrupt handlers, then desc->threads_active is
+ * also checked for zero to unmask the irq line in the
+ * affected hard irq flow handlers
+ * (handle_[fasteoi|level]_irq).
+ *
+ * The new action gets the first zero bit of
+ * thread_mask assigned. See the loop above which or's
+ * all existing action->thread_mask bits.
+ */
+ new->thread_mask = 1 << ffz(thread_mask);
+
+ } else if (new->handler == irq_default_primary_handler &&
+ !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
+ /*
+ * The interrupt was requested with handler = NULL, so
+ * we use the default primary handler for it. But it
+ * does not have the oneshot flag set. In combination
+ * with level interrupts this is deadly, because the
+ * default primary handler just wakes the thread, then
+ * the irq lines is reenabled, but the device still
+ * has the level irq asserted. Rinse and repeat....
+ *
+ * While this works for edge type interrupts, we play
+ * it safe and reject unconditionally because we can't
+ * say for sure which type this interrupt really
+ * has. The type flags are unreliable as the
+ * underlying chip implementation can override them.
+ */
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
+ irq);
+ ret = -EINVAL;
goto out_mask;
}
- new->thread_mask = 1 << ffz(thread_mask);
if (!shared) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_mask;
+ }
+
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -1027,7 +1191,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
desc->istate |= IRQS_ONESHOT;
if (irq_settings_can_autoenable(desc))
- irq_startup(desc);
+ irq_startup(desc, true);
else
/* Undo nested disables: */
desc->depth = 1;
@@ -1047,7 +1211,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
- pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
+ pr_warning("irq %d uses trigger mode %u; requested %u\n",
irq, nmsk, omsk);
}
@@ -1084,14 +1248,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
return 0;
mismatch:
-#ifdef CONFIG_DEBUG_SHIRQ
if (!(new->flags & IRQF_PROBE_SHARED)) {
- printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
- if (old_name)
- printk(KERN_ERR "current handler: %s\n", old_name);
+ pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
+ irq, new->flags, new->name, old->flags, old->name);
+#ifdef CONFIG_DEBUG_SHIRQ
dump_stack();
- }
#endif
+ }
ret = -EBUSY;
out_mask:
@@ -1103,8 +1266,7 @@ out_thread:
struct task_struct *t = new->thread;
new->thread = NULL;
- if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
- kthread_stop(t);
+ kthread_stop(t);
put_task_struct(t);
}
out_mput:
@@ -1174,15 +1336,11 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries: */
*action_ptr = action->next;
- /* Currently used only by UML, might disappear one day: */
-#ifdef CONFIG_IRQ_RELEASE_METHOD
- if (desc->irq_data.chip->release)
- desc->irq_data.chip->release(irq, dev_id);
-#endif
-
/* If this was the last handler, shut down the IRQ line: */
- if (!desc->action)
+ if (!desc->action) {
irq_shutdown(desc);
+ irq_release_resources(desc);
+ }
#ifdef CONFIG_SMP
/* make sure affinity_hint is cleaned up */
@@ -1214,8 +1372,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
#endif
if (action->thread) {
- if (!test_bit(IRQTF_DIED, &action->thread_flags))
- kthread_stop(action->thread);
+ kthread_stop(action->thread);
put_task_struct(action->thread);
}
@@ -1310,7 +1467,6 @@ EXPORT_SYMBOL(free_irq);
* Flags:
*
* IRQF_SHARED Interrupt is shared
- * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
* IRQF_TRIGGER_* Specify active edge(s) or level
*
*/
@@ -1447,6 +1603,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
out:
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(enable_percpu_irq);
void disable_percpu_irq(unsigned int irq)
{
@@ -1460,6 +1617,7 @@ void disable_percpu_irq(unsigned int irq)
irq_percpu_disable(desc, cpu);
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(disable_percpu_irq);
/*
* Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 47420908fba..ca3f4aaff70 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,13 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
- < nr_cpu_ids))
- if (!chip->irq_set_affinity(&desc->irq_data,
- desc->pending_mask, false)) {
- cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
- irq_set_thread_affinity(desc);
- }
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+ irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a..abcd6ca86cb 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
bool is_early = desc->action &&
desc->action->flags & IRQF_EARLY_RESUME;
- if (is_early != want_early)
+ if (!is_early && want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
+ /*
+ * Only interrupts which are marked as wakeup source
+ * and have not been disabled before the suspend check
+ * can abort suspend.
+ */
if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->istate & IRQS_PENDING)
+ if (desc->depth == 1 && desc->istate & IRQS_PENDING)
return -EBUSY;
continue;
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323..ac1ba2f1103 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
cpumask_var_t new_value;
int err;
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_affinity_proc_fops = {
@@ -212,7 +212,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE(inode)->data);
+ return single_open(file, default_affinity_show, PDE_DATA(inode));
}
static const struct file_operations default_affinity_proc_fops = {
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
static int irq_node_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_node_proc_show, PDE(inode)->data);
+ return single_open(file, irq_node_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_node_proc_fops = {
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
static int irq_spurious_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
+ return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_spurious_proc_fops = {
@@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", 0600, desc->dir,
+ proc_create_data("smp_affinity", 0644, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
/* create /proc/irq/<irq>/affinity_hint */
- proc_create_data("affinity_hint", 0400, desc->dir,
+ proc_create_data("affinity_hint", 0444, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
/* create /proc/irq/<irq>/smp_affinity_list */
- proc_create_data("smp_affinity_list", 0600, desc->dir,
+ proc_create_data("smp_affinity_list", 0644, desc->dir,
&irq_affinity_list_proc_fops, (void *)(long)irq);
proc_create_data("node", 0444, desc->dir,
@@ -366,17 +366,13 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
- if (action->dir) {
- struct irq_desc *desc = irq_to_desc(irq);
-
- remove_proc_entry(action->dir->name, desc->dir);
- }
+ proc_remove(action->dir);
}
static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
- proc_create("irq/default_smp_affinity", 0600, NULL,
+ proc_create("irq/default_smp_affinity", 0644, NULL,
&default_affinity_proc_fops);
#endif
}
@@ -466,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)
} else {
seq_printf(p, " %8s", "None");
}
+ if (desc->irq_data.domain)
+ seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c..9065107f083 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
- * active.
+ * active. Clear the pending bit so suspend/resume does not
+ * get confused.
*/
- if (irq_settings_is_level(desc))
+ if (irq_settings_is_level(desc)) {
+ desc->istate &= ~IRQS_PENDING;
return;
+ }
if (desc->istate & IRQS_REPLAY)
return;
if (desc->istate & IRQS_PENDING) {
@@ -71,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
+ /*
+ * If the interrupt has a parent irq and runs
+ * in the thread context of the parent irq,
+ * retrigger the parent.
+ */
+ if (desc->parent_irq &&
+ irq_settings_is_nested_thread(desc))
+ irq = desc->parent_irq;
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
tasklet_schedule(&resend_tasklet);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 1162f1030f1..3320b84cc60 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -14,6 +14,7 @@ enum {
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
+ _IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -26,6 +27,7 @@ enum {
#define IRQ_NOAUTOEN GOT_YOU_MORON
#define IRQ_NESTED_THREAD GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
+#define IRQ_IS_POLLED GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_NESTED_THREAD;
}
+
+static inline bool irq_settings_is_polled(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_IS_POLLED;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c4..e2514b0e439 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
raw_spin_lock(&desc->lock);
- /* PER_CPU and nested thread interrupts are never polled */
- if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
+ /*
+ * PER_CPU, nested thread interrupts and interrupts explicitely
+ * marked polled are excluded from polling.
+ */
+ if (irq_settings_is_per_cpu(desc) ||
+ irq_settings_is_nested_thread(desc) ||
+ irq_settings_is_polled(desc))
goto out;
/*
@@ -80,13 +85,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
/*
* All handlers must agree on IRQF_SHARED, so we test just the
- * first. Check for action->next as well.
+ * first.
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER) ||
- (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
- !action->next)
+ (action->flags & __IRQF_TIMER))
goto out;
/* Already running on another processor */
@@ -104,6 +107,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
ret = IRQ_HANDLED;
+ /* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
@@ -266,14 +270,13 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
return action && (action->flags & IRQF_IRQPOLL);
}
+#define SPURIOUS_DEFERRED 0x80000000
+
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
- if (desc->istate & IRQS_POLL_INPROGRESS)
- return;
-
- /* we get here again via the threaded handler */
- if (action_ret == IRQ_WAKE_THREAD)
+ if (desc->istate & IRQS_POLL_INPROGRESS ||
+ irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
@@ -281,6 +284,106 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
return;
}
+ /*
+ * We cannot call note_interrupt from the threaded handler
+ * because we need to look at the compound of all handlers
+ * (primary and threaded). Aside of that in the threaded
+ * shared case we have no serialization against an incoming
+ * hardware interrupt while we are dealing with a threaded
+ * result.
+ *
+ * So in case a thread is woken, we just note the fact and
+ * defer the analysis to the next hardware interrupt.
+ *
+ * The threaded handlers store whether they sucessfully
+ * handled an interrupt and we check whether that number
+ * changed versus the last invocation.
+ *
+ * We could handle all interrupts with the delayed by one
+ * mechanism, but for the non forced threaded case we'd just
+ * add pointless overhead to the straight hardirq interrupts
+ * for the sake of a few lines less code.
+ */
+ if (action_ret & IRQ_WAKE_THREAD) {
+ /*
+ * There is a thread woken. Check whether one of the
+ * shared primary handlers returned IRQ_HANDLED. If
+ * not we defer the spurious detection to the next
+ * interrupt.
+ */
+ if (action_ret == IRQ_WAKE_THREAD) {
+ int handled;
+ /*
+ * We use bit 31 of thread_handled_last to
+ * denote the deferred spurious detection
+ * active. No locking necessary as
+ * thread_handled_last is only accessed here
+ * and we have the guarantee that hard
+ * interrupts are not reentrant.
+ */
+ if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
+ desc->threads_handled_last |= SPURIOUS_DEFERRED;
+ return;
+ }
+ /*
+ * Check whether one of the threaded handlers
+ * returned IRQ_HANDLED since the last
+ * interrupt happened.
+ *
+ * For simplicity we just set bit 31, as it is
+ * set in threads_handled_last as well. So we
+ * avoid extra masking. And we really do not
+ * care about the high bits of the handled
+ * count. We just care about the count being
+ * different than the one we saw before.
+ */
+ handled = atomic_read(&desc->threads_handled);
+ handled |= SPURIOUS_DEFERRED;
+ if (handled != desc->threads_handled_last) {
+ action_ret = IRQ_HANDLED;
+ /*
+ * Note: We keep the SPURIOUS_DEFERRED
+ * bit set. We are handling the
+ * previous invocation right now.
+ * Keep it for the current one, so the
+ * next hardware interrupt will
+ * account for it.
+ */
+ desc->threads_handled_last = handled;
+ } else {
+ /*
+ * None of the threaded handlers felt
+ * responsible for the last interrupt
+ *
+ * We keep the SPURIOUS_DEFERRED bit
+ * set in threads_handled_last as we
+ * need to account for the current
+ * interrupt as well.
+ */
+ action_ret = IRQ_NONE;
+ }
+ } else {
+ /*
+ * One of the primary handlers returned
+ * IRQ_HANDLED. So we don't care about the
+ * threaded handlers on the same line. Clear
+ * the deferred detection bit.
+ *
+ * In theory we could/should check whether the
+ * deferred bit is set and take the result of
+ * the previous run into account here as
+ * well. But it's really not worth the
+ * trouble. If every other interrupt is
+ * handled we never trigger the spurious
+ * detector. And if this is just the one out
+ * of 100k unhandled ones which is handled
+ * then we merily delay the spurious detection
+ * by one hard interrupt. Not a real problem.
+ */
+ desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
+ }
+ }
+
if (unlikely(action_ret == IRQ_NONE)) {
/*
* If we are seeing only the odd spurious IRQ caused by
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c3c46c72046..a82170e2fa7 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -5,42 +5,43 @@
* context. The enqueueing is NMI-safe.
*/
+#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/processor.h>
-/*
- * An entry can be in one of four states:
- *
- * free NULL, 0 -> {claimed} : free to be used
- * claimed NULL, 3 -> {pending} : claimed to be enqueued
- * pending next, 3 -> {busy} : queued, pending callback
- * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- */
-
-#define IRQ_WORK_PENDING 1UL
-#define IRQ_WORK_BUSY 2UL
-#define IRQ_WORK_FLAGS 3UL
static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+static DEFINE_PER_CPU(int, irq_work_raised);
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, nflags;
+ unsigned long flags, oflags, nflags;
+ /*
+ * Start with our best wish as a premise but only trust any
+ * flag value after cmpxchg() result.
+ */
+ flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- flags = work->flags;
- if (flags & IRQ_WORK_PENDING)
- return false;
nflags = flags | IRQ_WORK_FLAGS;
- if (cmpxchg(&work->flags, flags, nflags) == flags)
+ oflags = cmpxchg(&work->flags, flags, nflags);
+ if (oflags == flags)
break;
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
+ flags = oflags;
cpu_relax();
}
@@ -55,57 +56,71 @@ void __weak arch_irq_work_raise(void)
}
/*
- * Queue the entry and raise the IPI if needed.
+ * Enqueue the irq_work @entry unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
*/
-static void __irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
{
- bool empty;
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
+ /* Queue the entry and raise the IPI if needed. */
preempt_disable();
- empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
- /* The list was empty, raise self-interrupt to start processing. */
- if (empty)
- arch_irq_work_raise();
+ llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
+
+ /*
+ * If the work is not "lazy" or the tick is stopped, raise the irq
+ * work interrupt (if supported by the arch), otherwise, just wait
+ * for the next tick.
+ */
+ if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
+ if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ arch_irq_work_raise();
+ }
preempt_enable();
+
+ return true;
}
+EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Enqueue the irq_work @entry, returns true on success, failure when the
- * @entry was already enqueued by someone else.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue(struct irq_work *work)
+bool irq_work_needs_cpu(void)
{
- if (!irq_work_claim(work)) {
- /*
- * Already enqueued, can't do!
- */
+ struct llist_head *this_list;
+
+ this_list = &__get_cpu_var(irq_work_list);
+ if (llist_empty(this_list))
return false;
- }
- __irq_work_queue(work);
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
- */
-void irq_work_run(void)
+static void __irq_work_run(void)
{
+ unsigned long flags;
struct irq_work *work;
struct llist_head *this_list;
struct llist_node *llnode;
+
+ /*
+ * Reset the "raised" state right before we check the list because
+ * an NMI may enqueue after we find the list empty from the runner.
+ */
+ __this_cpu_write(irq_work_raised, 0);
+ barrier();
+
this_list = &__get_cpu_var(irq_work_list);
if (llist_empty(this_list))
return;
- BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());
llnode = llist_del_all(this_list);
@@ -117,16 +132,31 @@ void irq_work_run(void)
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
*/
- work->flags = IRQ_WORK_BUSY;
+ flags = work->flags & ~IRQ_WORK_PENDING;
+ xchg(&work->flags, flags);
+
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
+ (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+ BUG_ON(!in_irq());
+ __irq_work_run();
+}
EXPORT_SYMBOL_GPL(irq_work_run);
/*
@@ -141,3 +171,35 @@ void irq_work_sync(struct irq_work *work)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int irq_work_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_DYING:
+ /* Called from stop_machine */
+ if (WARN_ON_ONCE(cpu != smp_processor_id()))
+ break;
+ __irq_work_run();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_notify;
+
+static __init int irq_work_init_cpu_notifier(void)
+{
+ cpu_notify.notifier_call = irq_work_cpu_notify;
+ cpu_notify.priority = 0;
+ register_cpu_notifier(&cpu_notify);
+ return 0;
+}
+device_initcall(irq_work_init_cpu_notifier);
+
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 22000c3db0d..8d262b46757 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -284,8 +284,12 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
if (value) {
if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
return -EFAULT;
- } else
- memset((char *) &set_buffer, 0, sizeof(set_buffer));
+ } else {
+ memset(&set_buffer, 0, sizeof(set_buffer));
+ printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+ " Misfeature support will be removed\n",
+ current->comm);
+ }
error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
if (error || !ovalue)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01d3b70fc98..9019f15deab 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,8 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/err.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
+#include <linux/jump_label_ratelimit.h>
#ifdef HAVE_JUMP_LABEL
@@ -29,11 +30,6 @@ void jump_label_unlock(void)
mutex_unlock(&jump_label_mutex);
}
-bool jump_label_enabled(struct jump_label_key *key)
-{
- return !!atomic_read(&key->enabled);
-}
-
static int jump_label_cmp(const void *a, const void *b)
{
const struct jump_entry *jea = a;
@@ -58,61 +54,76 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
-static void jump_label_update(struct jump_label_key *key, int enable);
+static void jump_label_update(struct static_key *key, int enable);
-void jump_label_inc(struct jump_label_key *key)
+void static_key_slow_inc(struct static_key *key)
{
+ STATIC_KEY_CHECK_USE();
if (atomic_inc_not_zero(&key->enabled))
return;
jump_label_lock();
- if (atomic_read(&key->enabled) == 0)
- jump_label_update(key, JUMP_LABEL_ENABLE);
+ if (atomic_read(&key->enabled) == 0) {
+ if (!jump_label_get_branch_default(key))
+ jump_label_update(key, JUMP_LABEL_ENABLE);
+ else
+ jump_label_update(key, JUMP_LABEL_DISABLE);
+ }
atomic_inc(&key->enabled);
jump_label_unlock();
}
-EXPORT_SYMBOL_GPL(jump_label_inc);
+EXPORT_SYMBOL_GPL(static_key_slow_inc);
-static void __jump_label_dec(struct jump_label_key *key,
+static void __static_key_slow_dec(struct static_key *key,
unsigned long rate_limit, struct delayed_work *work)
{
- if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
+ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
+ WARN(atomic_read(&key->enabled) < 0,
+ "jump label: negative count!\n");
return;
+ }
if (rate_limit) {
atomic_inc(&key->enabled);
schedule_delayed_work(work, rate_limit);
- } else
- jump_label_update(key, JUMP_LABEL_DISABLE);
-
+ } else {
+ if (!jump_label_get_branch_default(key))
+ jump_label_update(key, JUMP_LABEL_DISABLE);
+ else
+ jump_label_update(key, JUMP_LABEL_ENABLE);
+ }
jump_label_unlock();
}
-EXPORT_SYMBOL_GPL(jump_label_dec);
static void jump_label_update_timeout(struct work_struct *work)
{
- struct jump_label_key_deferred *key =
- container_of(work, struct jump_label_key_deferred, work.work);
- __jump_label_dec(&key->key, 0, NULL);
+ struct static_key_deferred *key =
+ container_of(work, struct static_key_deferred, work.work);
+ __static_key_slow_dec(&key->key, 0, NULL);
}
-void jump_label_dec(struct jump_label_key *key)
+void static_key_slow_dec(struct static_key *key)
{
- __jump_label_dec(key, 0, NULL);
+ STATIC_KEY_CHECK_USE();
+ __static_key_slow_dec(key, 0, NULL);
}
+EXPORT_SYMBOL_GPL(static_key_slow_dec);
-void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
- __jump_label_dec(&key->key, key->timeout, &key->work);
+ STATIC_KEY_CHECK_USE();
+ __static_key_slow_dec(&key->key, key->timeout, &key->work);
}
+EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
-
-void jump_label_rate_limit(struct jump_label_key_deferred *key,
+void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
+ STATIC_KEY_CHECK_USE();
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
+EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
@@ -150,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
arch_jump_label_transform(entry, type);
}
-static void __jump_label_update(struct jump_label_key *key,
+static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
struct jump_entry *stop, int enable)
{
@@ -167,38 +178,52 @@ static void __jump_label_update(struct jump_label_key *key,
}
}
+static enum jump_label_type jump_label_type(struct static_key *key)
+{
+ bool true_branch = jump_label_get_branch_default(key);
+ bool state = static_key_enabled(key);
+
+ if ((!true_branch && state) || (true_branch && !state))
+ return JUMP_LABEL_ENABLE;
+
+ return JUMP_LABEL_DISABLE;
+}
+
void __init jump_label_init(void)
{
struct jump_entry *iter_start = __start___jump_table;
struct jump_entry *iter_stop = __stop___jump_table;
- struct jump_label_key *key = NULL;
+ struct static_key *key = NULL;
struct jump_entry *iter;
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
for (iter = iter_start; iter < iter_stop; iter++) {
- struct jump_label_key *iterk;
+ struct static_key *iterk;
- iterk = (struct jump_label_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+ iterk = (struct static_key *)(unsigned long)iter->key;
+ arch_jump_label_transform_static(iter, jump_label_type(iterk));
if (iterk == key)
continue;
key = iterk;
- key->entries = iter;
+ /*
+ * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
+ */
+ *((unsigned long *)&key->entries) += (unsigned long)iter;
#ifdef CONFIG_MODULES
key->next = NULL;
#endif
}
+ static_key_initialized = true;
jump_label_unlock();
}
#ifdef CONFIG_MODULES
-struct jump_label_mod {
- struct jump_label_mod *next;
+struct static_key_mod {
+ struct static_key_mod *next;
struct jump_entry *entries;
struct module *mod;
};
@@ -218,9 +243,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
start, end);
}
-static void __jump_label_mod_update(struct jump_label_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key, int enable)
{
- struct jump_label_mod *mod = key->next;
+ struct static_key_mod *mod = key->next;
while (mod) {
struct module *m = mod->mod;
@@ -251,11 +276,7 @@ void jump_label_apply_nops(struct module *mod)
return;
for (iter = iter_start; iter < iter_stop; iter++) {
- struct jump_label_key *iterk;
-
- iterk = (struct jump_label_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+ arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
}
}
@@ -264,8 +285,8 @@ static int jump_label_add_module(struct module *mod)
struct jump_entry *iter_start = mod->jump_entries;
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
- struct jump_label_key *key = NULL;
- struct jump_label_mod *jlm;
+ struct static_key *key = NULL;
+ struct static_key_mod *jlm;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
@@ -274,28 +295,30 @@ static int jump_label_add_module(struct module *mod)
jump_label_sort_entries(iter_start, iter_stop);
for (iter = iter_start; iter < iter_stop; iter++) {
- if (iter->key == (jump_label_t)(unsigned long)key)
- continue;
+ struct static_key *iterk;
- key = (struct jump_label_key *)(unsigned long)iter->key;
+ iterk = (struct static_key *)(unsigned long)iter->key;
+ if (iterk == key)
+ continue;
+ key = iterk;
if (__module_address(iter->key) == mod) {
- atomic_set(&key->enabled, 0);
- key->entries = iter;
+ /*
+ * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
+ */
+ *((unsigned long *)&key->entries) += (unsigned long)iter;
key->next = NULL;
continue;
}
-
- jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
+ jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
-
jlm->mod = mod;
jlm->entries = iter;
jlm->next = key->next;
key->next = jlm;
- if (jump_label_enabled(key))
+ if (jump_label_type(key) == JUMP_LABEL_ENABLE)
__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
}
@@ -307,14 +330,14 @@ static void jump_label_del_module(struct module *mod)
struct jump_entry *iter_start = mod->jump_entries;
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
- struct jump_label_key *key = NULL;
- struct jump_label_mod *jlm, **prev;
+ struct static_key *key = NULL;
+ struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
if (iter->key == (jump_label_t)(unsigned long)key)
continue;
- key = (struct jump_label_key *)(unsigned long)iter->key;
+ key = (struct static_key *)(unsigned long)iter->key;
if (__module_address(iter->key) == mod)
continue;
@@ -416,12 +439,13 @@ int jump_label_text_reserved(void *start, void *end)
return ret;
}
-static void jump_label_update(struct jump_label_key *key, int enable)
+static void jump_label_update(struct static_key *key, int enable)
{
- struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
+ struct jump_entry *stop = __stop___jump_table;
+ struct jump_entry *entry = jump_label_get_entries(key);
#ifdef CONFIG_MODULES
- struct module *mod = __module_address((jump_label_t)key);
+ struct module *mod = __module_address((unsigned long)key);
__jump_label_mod_update(key, enable);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 079f1d39a8b..cb0cf37dac3 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <asm/sections.h>
@@ -36,8 +37,8 @@
* These will be re-linked against their real values
* during the second link stage.
*/
-extern const unsigned long kallsyms_addresses[] __attribute__((weak));
-extern const u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const u8 kallsyms_names[] __weak;
/*
* Tell the compiler that the count isn't in the small data section if the arch
@@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak));
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[] __attribute__((weak));
-extern const u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
-extern const unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __weak;
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -84,9 +85,11 @@ static int is_ksym_addr(unsigned long addr)
/*
* Expand a compressed symbol data into the resulting uncompressed string,
+ * if uncompressed string is too long (>= maxlen), it will be truncated,
* given the offset to where the symbol is in the compressed stream.
*/
-static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+static unsigned int kallsyms_expand_symbol(unsigned int off,
+ char *result, size_t maxlen)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
@@ -113,15 +116,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
while (*tptr) {
if (skipped_first) {
+ if (maxlen <= 1)
+ goto tail;
*result = *tptr;
result++;
+ maxlen--;
} else
skipped_first = 1;
tptr++;
}
}
- *result = '\0';
+tail:
+ if (maxlen)
+ *result = '\0';
/* Return to offset to the next symbol. */
return off;
@@ -176,7 +184,7 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
@@ -195,7 +203,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
@@ -294,7 +302,8 @@ const char *kallsyms_lookup(unsigned long addr,
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
return namebuf;
@@ -315,7 +324,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
pos = get_symbol_pos(addr, NULL, NULL);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ symname, KSYM_NAME_LEN);
return 0;
}
/* See if it's in a module. */
@@ -333,7 +343,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
pos = get_symbol_pos(addr, size, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ name, KSYM_NAME_LEN);
modname[0] = '\0';
return 0;
}
@@ -343,7 +354,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
- int symbol_offset)
+ int symbol_offset, int add_offset)
{
char *modname;
const char *name;
@@ -358,13 +369,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
if (name != buffer)
strcpy(buffer, name);
len = strlen(buffer);
- buffer += len;
offset -= symbol_offset;
+ if (add_offset)
+ len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
+
if (modname)
- len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
- else
- len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+ len += sprintf(buffer + len, " [%s]", modname);
return len;
}
@@ -382,12 +393,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
*/
int sprint_symbol(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0);
+ return __sprint_symbol(buffer, address, 0, 1);
}
-
EXPORT_SYMBOL_GPL(sprint_symbol);
/**
+ * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name
+ * and module name to @buffer if possible. If no symbol was found, just saves
+ * its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_no_offset(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 0);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
+
+/**
* sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
@@ -403,7 +430,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
*/
int sprint_backtrace(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, -1);
+ return __sprint_symbol(buffer, address, -1, 1);
}
/* Look up a kernel symbol and print it to the kernel messages. */
@@ -447,7 +474,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
iter->type = kallsyms_get_symbol_type(off);
- off = kallsyms_expand_symbol(off, iter->name);
+ off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
return off - iter->nameoff;
}
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 00000000000..e30ac0fe61c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,197 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/kcmp.h>
+
+#include <asm/unistd.h>
+
+/*
+ * We don't expose the real in-memory order of objects for security reasons.
+ * But still the comparison results should be suitable for sorting. So we
+ * obfuscate kernel pointers values and compare the production instead.
+ *
+ * The obfuscation is done in two steps. First we xor the kernel pointer with
+ * a random value, which puts pointer into a new position in a reordered space.
+ * Secondly we multiply the xor production with a large odd random number to
+ * permute its bits even more (the odd multiplier guarantees that the product
+ * is unique ever after the high bits are truncated, since any odd number is
+ * relative prime to 2^n).
+ *
+ * Note also that the obfuscation itself is invisible to userspace and if needed
+ * it can be changed to an alternate scheme.
+ */
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+ return (v ^ cookies[type][0]) * cookies[type][1];
+}
+
+/*
+ * 0 - equal, i.e. v1 = v2
+ * 1 - less than, i.e. v1 < v2
+ * 2 - greater than, i.e. v1 > v2
+ * 3 - not equal but ordering unavailable (reserved for future)
+ */
+static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
+{
+ long ret;
+
+ ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+
+ return (ret < 0) | ((ret > 0) << 1);
+}
+
+/* The caller must have pinned the task */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+ struct file *file = NULL;
+
+ task_lock(task);
+ rcu_read_lock();
+
+ if (task->files)
+ file = fcheck_files(task->files, idx);
+
+ rcu_read_unlock();
+ task_unlock(task);
+
+ return file;
+}
+
+static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+{
+ if (likely(m2 != m1))
+ mutex_unlock(m2);
+ mutex_unlock(m1);
+}
+
+static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+{
+ int err;
+
+ if (m2 > m1)
+ swap(m1, m2);
+
+ err = mutex_lock_killable(m1);
+ if (!err && likely(m1 != m2)) {
+ err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ if (err)
+ mutex_unlock(m1);
+ }
+
+ return err;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+ unsigned long, idx1, unsigned long, idx2)
+{
+ struct task_struct *task1, *task2;
+ int ret;
+
+ rcu_read_lock();
+
+ /*
+ * Tasks are looked up in caller's PID namespace only.
+ */
+ task1 = find_task_by_vpid(pid1);
+ task2 = find_task_by_vpid(pid2);
+ if (!task1 || !task2)
+ goto err_no_task;
+
+ get_task_struct(task1);
+ get_task_struct(task2);
+
+ rcu_read_unlock();
+
+ /*
+ * One should have enough rights to inspect task details.
+ */
+ ret = kcmp_lock(&task1->signal->cred_guard_mutex,
+ &task2->signal->cred_guard_mutex);
+ if (ret)
+ goto err;
+ if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+ !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+ ret = -EPERM;
+ goto err_unlock;
+ }
+
+ switch (type) {
+ case KCMP_FILE: {
+ struct file *filp1, *filp2;
+
+ filp1 = get_file_raw_ptr(task1, idx1);
+ filp2 = get_file_raw_ptr(task2, idx2);
+
+ if (filp1 && filp2)
+ ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
+ else
+ ret = -EBADF;
+ break;
+ }
+ case KCMP_VM:
+ ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
+ break;
+ case KCMP_FILES:
+ ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
+ break;
+ case KCMP_FS:
+ ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
+ break;
+ case KCMP_SIGHAND:
+ ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
+ break;
+ case KCMP_IO:
+ ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
+ break;
+ case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+ ret = kcmp_ptr(task1->sysvsem.undo_list,
+ task2->sysvsem.undo_list,
+ KCMP_SYSVSEM);
+#else
+ ret = -EOPNOTSUPP;
+#endif
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+err_unlock:
+ kcmp_unlock(&task1->signal->cred_guard_mutex,
+ &task2->signal->cred_guard_mutex);
+err:
+ put_task_struct(task1);
+ put_task_struct(task2);
+
+ return ret;
+
+err_no_task:
+ rcu_read_unlock();
+ return -ESRCH;
+}
+
+static __init int kcmp_cookies_init(void)
+{
+ int i;
+
+ get_random_bytes(cookies, sizeof(cookies));
+
+ for (i = 0; i < KCMP_TYPES; i++)
+ cookies[i][1] |= (~(~0UL >> 1) | 1);
+
+ return 0;
+}
+arch_initcall(kcmp_cookies_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b088678670..4b8f0c92588 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,6 @@
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <generated/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
@@ -33,11 +32,12 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
-#include <asm/system.h>
#include <asm/sections.h>
/* Per cpu memory for storing cpu states in case of system crash. */
@@ -49,6 +49,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -56,6 +59,12 @@ struct resource crashk_res = {
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
int kexec_should_crash(struct task_struct *p)
{
@@ -117,8 +126,8 @@ static struct page *kimage_alloc_page(struct kimage *image,
unsigned long dest);
static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
size_t segment_bytes;
struct kimage *image;
@@ -225,6 +234,8 @@ out:
}
+static void kimage_free_page_list(struct list_head *list);
+
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments)
@@ -238,8 +249,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
if (result)
goto out;
- *rimage = image;
-
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
@@ -249,23 +258,23 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
- printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free;
}
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
- printk(KERN_ERR "Could not allocate swap buffer\n");
- goto out;
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free;
}
- result = 0;
- out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+out:
return result;
}
@@ -312,7 +321,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out;
+ goto out_free;
}
/*
@@ -324,17 +333,16 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
- printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kfree(image);
+out:
return result;
}
@@ -499,8 +507,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
- if (hole_end > crashk_res.end)
- break;
/* See if I overlap any of the segments */
for (i = 0; i < image->nr_segments; i++) {
unsigned long mstart, mend;
@@ -616,8 +622,8 @@ static void kimage_terminate(struct kimage *image)
#define for_each_kimage_entry(image, ptr, entry) \
for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION)? \
- phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ ptr = (entry & IND_INDIRECTION) ? \
+ phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
static void kimage_free_entry(kimage_entry_t entry)
{
@@ -645,8 +651,7 @@ static void kimage_free(struct kimage *image)
* done with it.
*/
ind = entry;
- }
- else if (entry & IND_SOURCE)
+ } else if (entry & IND_SOURCE)
kimage_free_entry(entry);
}
/* Free the final indirection page */
@@ -769,8 +774,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
addr = old_addr;
page = old_page;
break;
- }
- else {
+ } else {
/* Place the page on the destination list I
* will use it later.
*/
@@ -785,7 +789,7 @@ static int kimage_load_normal_segment(struct kimage *image,
struct kexec_segment *segment)
{
unsigned long maddr;
- unsigned long ubytes, mbytes;
+ size_t ubytes, mbytes;
int result;
unsigned char __user *buf;
@@ -818,13 +822,9 @@ static int kimage_load_normal_segment(struct kimage *image,
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
- mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes)
- mchunk = mbytes;
-
- uchunk = mchunk;
- if (uchunk > ubytes)
- uchunk = ubytes;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
@@ -849,7 +849,7 @@ static int kimage_load_crash_segment(struct kimage *image,
* We do things a page at a time for the sake of kmap.
*/
unsigned long maddr;
- unsigned long ubytes, mbytes;
+ size_t ubytes, mbytes;
int result;
unsigned char __user *buf;
@@ -870,13 +870,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
- mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes)
- mchunk = mbytes;
-
- uchunk = mchunk;
- if (uchunk > ubytes) {
- uchunk = ubytes;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
@@ -927,7 +924,7 @@ static int kimage_load_segment(struct kimage *image,
* reinitialize them.
*
* - A machine specific part that includes the syscall number
- * and the copies the image to it's final destination. And
+ * and then copies the image to it's final destination. And
* jumps into the image at entry.
*
* kexec does not sync, or unmount filesystems so if you need
@@ -935,6 +932,7 @@ static int kimage_load_segment(struct kimage *image,
*/
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
+int kexec_load_disabled;
static DEFINE_MUTEX(kexec_mutex);
@@ -945,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT))
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
/*
@@ -1041,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void)
{}
#ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_kexec_load(unsigned long entry,
- unsigned long nr_segments,
- struct compat_kexec_segment __user *segments,
- unsigned long flags)
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+ compat_ulong_t, nr_segments,
+ struct compat_kexec_segment __user *, segments,
+ compat_ulong_t, flags)
{
struct compat_kexec_segment in;
struct kexec_segment out, __user *ksegments;
@@ -1060,7 +1058,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
return -EINVAL;
ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
- for (i=0; i < nr_segments; i++) {
+ for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
return -EFAULT;
@@ -1117,12 +1115,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
{
unsigned long addr;
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
- init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
- free_page((unsigned long)__va(addr));
- totalram_pages++;
- }
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1219,14 +1213,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
* squirrelled away. ELF notes happen to provide
* all of that, so there is no need to invent something new.
*/
- buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
if (!buf)
return;
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
+ &prstatus, sizeof(prstatus));
final_note(buf);
}
@@ -1235,13 +1229,12 @@ static int __init crash_notes_memory_init(void)
/* Allocate memory for saving cpu registers. */
crash_notes = alloc_percpu(note_buf_t);
if (!crash_notes) {
- printk("Kexec: Memory allocation for saving cpu register"
- " states failed\n");
+ pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
return -ENOMEM;
}
return 0;
}
-module_init(crash_notes_memory_init)
+subsys_initcall(crash_notes_memory_init);
/*
@@ -1258,10 +1251,10 @@ module_init(crash_notes_memory_init)
*
* The function returns 0 on success and -EINVAL on failure.
*/
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
@@ -1272,12 +1265,12 @@ static int __init parse_crashkernel_mem(char *cmdline,
/* get the start of the range */
start = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("crashkernel: Memory value expected\n");
+ pr_warn("crashkernel: Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (*cur != '-') {
- pr_warning("crashkernel: '-' expected\n");
+ pr_warn("crashkernel: '-' expected\n");
return -EINVAL;
}
cur++;
@@ -1286,31 +1279,30 @@ static int __init parse_crashkernel_mem(char *cmdline,
if (*cur != ':') {
end = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("crashkernel: Memory "
- "value expected\n");
+ pr_warn("crashkernel: Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (end <= start) {
- pr_warning("crashkernel: end <= start\n");
+ pr_warn("crashkernel: end <= start\n");
return -EINVAL;
}
}
if (*cur != ':') {
- pr_warning("crashkernel: ':' expected\n");
+ pr_warn("crashkernel: ':' expected\n");
return -EINVAL;
}
cur++;
size = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("Memory value expected\n");
+ pr_warn("Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (size >= system_ram) {
- pr_warning("crashkernel: invalid size\n");
+ pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
@@ -1328,8 +1320,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
cur++;
*crash_base = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("Memory value expected "
- "after '@'\n");
+ pr_warn("Memory value expected after '@'\n");
return -EINVAL;
}
}
@@ -1341,56 +1332,140 @@ static int __init parse_crashkernel_mem(char *cmdline,
/*
* That function parses "simple" (old) crashkernel command lines like
*
- * crashkernel=size[@offset]
+ * crashkernel=size[@offset]
*
* It returns 0 on success and -EINVAL on failure.
*/
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
{
char *cur = cmdline;
*crash_size = memparse(cmdline, &cur);
if (cmdline == cur) {
- pr_warning("crashkernel: memory value expected\n");
+ pr_warn("crashkernel: memory value expected\n");
return -EINVAL;
}
if (*cur == '@')
*crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
return 0;
}
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
*/
-int __init parse_crashkernel(char *cmdline,
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
{
- char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
+ char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, "crashkernel=");
- while (p) {
- ck_cmdline = p;
- p = strstr(p+1, "crashkernel=");
- }
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
- ck_cmdline += 12; /* strlen("crashkernel=") */
+ ck_cmdline += strlen(name);
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ crash_base, suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1400,13 +1475,40 @@ int __init parse_crashkernel(char *cmdline,
if (first_colon && (!first_space || first_colon < first_space))
return parse_crashkernel_mem(ck_cmdline, system_ram,
crash_size, crash_base);
- else
- return parse_crashkernel_simple(ck_cmdline, crash_size,
- crash_base);
- return 0;
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
}
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
static void update_vmcoreinfo_note(void)
{
@@ -1421,7 +1523,7 @@ static void update_vmcoreinfo_note(void)
void crash_save_vmcoreinfo(void)
{
- vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
update_vmcoreinfo_note();
}
@@ -1429,14 +1531,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
{
va_list args;
char buf[0x50];
- int r;
+ size_t r;
va_start(args, fmt);
- r = vsnprintf(buf, sizeof(buf), fmt, args);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- if (r + vmcoreinfo_size > vmcoreinfo_max_size)
- r = vmcoreinfo_max_size - vmcoreinfo_size;
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
@@ -1447,10 +1548,10 @@ void vmcoreinfo_append_str(const char *fmt, ...)
* provide an empty default implementation here -- architecture
* code may override this
*/
-void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
+void __weak arch_crash_save_vmcoreinfo(void)
{}
-unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
+unsigned long __weak paddr_vmcoreinfo_note(void)
{
return __pa((unsigned long)(char *)&vmcoreinfo_note);
}
@@ -1462,9 +1563,11 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmlist);
+ VMCOREINFO_SYMBOL(vmap_area_list);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
@@ -1486,6 +1589,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, _count);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1500,7 +1605,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
@@ -1508,6 +1614,15 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
@@ -1515,7 +1630,7 @@ static int __init crash_save_vmcoreinfo_init(void)
return 0;
}
-module_init(crash_save_vmcoreinfo_init)
+subsys_initcall(crash_save_vmcoreinfo_init);
/*
* Move into place and start executing a preloaded standalone
@@ -1546,13 +1661,13 @@ int kernel_kexec(void)
if (error)
goto Resume_console;
/* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_noirq(). We *must* call
- * dpm_suspend_noirq() now. Otherwise, drivers for
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
* some devices (e.g. interrupt controllers) become
* desynchronized with the actual state of the
* hardware at resume time, and evil weirdness ensues.
*/
- error = dpm_suspend_noirq(PMSG_FREEZE);
+ error = dpm_suspend_end(PMSG_FREEZE);
if (error)
goto Resume_devices;
error = disable_nonboot_cpus();
@@ -1565,8 +1680,18 @@ int kernel_kexec(void)
} else
#endif
{
+ kexec_in_progress = true;
kernel_restart_prepare(NULL);
- printk(KERN_EMERG "Starting new kernel\n");
+ migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
machine_shutdown();
}
@@ -1579,7 +1704,7 @@ int kernel_kexec(void)
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
- dpm_resume_noirq(PMSG_RESTORE);
+ dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
Resume_console:
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index c744b88c44e..00000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,608 +0,0 @@
-/*
- * A generic kernel FIFO implementation
- *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/log2.h>
-#include <linux/uaccess.h>
-#include <linux/kfifo.h>
-
-/*
- * internal helper to calculate the unused elements in a fifo
- */
-static inline unsigned int kfifo_unused(struct __kfifo *fifo)
-{
- return (fifo->mask + 1) - (fifo->in - fifo->out);
-}
-
-int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
- size_t esize, gfp_t gfp_mask)
-{
- /*
- * round down to the next power of 2, since our 'let the indices
- * wrap' technique works only in this case.
- */
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
-
- if (size < 2) {
- fifo->data = NULL;
- fifo->mask = 0;
- return -EINVAL;
- }
-
- fifo->data = kmalloc(size * esize, gfp_mask);
-
- if (!fifo->data) {
- fifo->mask = 0;
- return -ENOMEM;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_alloc);
-
-void __kfifo_free(struct __kfifo *fifo)
-{
- kfree(fifo->data);
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = 0;
- fifo->data = NULL;
- fifo->mask = 0;
-}
-EXPORT_SYMBOL(__kfifo_free);
-
-int __kfifo_init(struct __kfifo *fifo, void *buffer,
- unsigned int size, size_t esize)
-{
- size /= esize;
-
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
- fifo->data = buffer;
-
- if (size < 2) {
- fifo->mask = 0;
- return -EINVAL;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_init);
-
-static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(fifo->data + off, src, l);
- memcpy(fifo->data, src + l, len - l);
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_in(struct __kfifo *fifo,
- const void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- kfifo_copy_in(fifo, buf, len, fifo->in);
- fifo->in += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in);
-
-static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(dst, fifo->data + off, l);
- memcpy(dst + l, fifo->data, len - l);
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_out_peek(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- kfifo_copy_out(fifo, buf, len, fifo->out);
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_peek);
-
-unsigned int __kfifo_out(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- len = __kfifo_out_peek(fifo, buf, len);
- fifo->out += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out);
-
-static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int off,
- unsigned int *copied)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned long ret;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_from_user(fifo->data + off, from, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_from_user(fifo->data, from + l, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->in += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_from_user);
-
-static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
- unsigned int len, unsigned int off, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_to_user(to, fifo->data + off, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_to_user(to + l, fifo->data, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->out += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_to_user);
-
-static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
- int nents, unsigned int len)
-{
- int n;
- unsigned int l;
- unsigned int off;
- struct page *page;
-
- if (!nents)
- return 0;
-
- if (!len)
- return 0;
-
- n = 0;
- page = virt_to_page(buf);
- off = offset_in_page(buf);
- l = 0;
-
- while (len >= l + PAGE_SIZE - off) {
- struct page *npage;
-
- l += PAGE_SIZE;
- buf += PAGE_SIZE;
- npage = virt_to_page(buf);
- if (page_to_phys(page) != page_to_phys(npage) - l) {
- sg_set_page(sgl, page, l - off, off);
- sgl = sg_next(sgl);
- if (++n == nents || sgl == NULL)
- return n;
- page = npage;
- len -= l - off;
- l = off = 0;
- }
- }
- sg_set_page(sgl, page, len, off);
- return n + 1;
-}
-
-static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
- int nents, unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned int n;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
- n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
-
- return n;
-}
-
-unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare);
-
-unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare);
-
-unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
-{
- unsigned int max = (1 << (recsize << 3)) - 1;
-
- if (len > max)
- return max;
- return len;
-}
-
-#define __KFIFO_PEEK(data, out, mask) \
- ((data)[(out) & (mask)])
-/*
- * __kfifo_peek_n internal helper function for determinate the length of
- * the next record in the fifo
- */
-static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int l;
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- l = __KFIFO_PEEK(data, fifo->out, mask);
-
- if (--recsize)
- l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
-
- return l;
-}
-
-#define __KFIFO_POKE(data, in, mask, val) \
- ( \
- (data)[(in) & (mask)] = (unsigned char)(val) \
- )
-
-/*
- * __kfifo_poke_n internal helper function for storeing the length of
- * the record into the fifo
- */
-static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
-{
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- __KFIFO_POKE(data, fifo->in, mask, n);
-
- if (recsize > 1)
- __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
-}
-
-unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
-{
- return __kfifo_peek_n(fifo, recsize);
-}
-EXPORT_SYMBOL(__kfifo_len_r);
-
-unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
- unsigned int len, size_t recsize)
-{
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- __kfifo_poke_n(fifo, len, recsize);
-
- kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
- fifo->in += len + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in_r);
-
-static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
- void *buf, unsigned int len, size_t recsize, unsigned int *n)
-{
- *n = __kfifo_peek_n(fifo, recsize);
-
- if (len > *n)
- len = *n;
-
- kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
- return len;
-}
-
-unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
-}
-EXPORT_SYMBOL(__kfifo_out_peek_r);
-
-unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
- fifo->out += n + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_r);
-
-void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int n;
-
- n = __kfifo_peek_n(fifo, recsize);
- fifo->out += n + recsize;
-}
-EXPORT_SYMBOL(__kfifo_skip_r);
-
-int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo)) {
- *copied = 0;
- return 0;
- }
-
- __kfifo_poke_n(fifo, len, recsize);
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->in += len + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_from_user_r);
-
-int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
- unsigned int n;
-
- if (fifo->in == fifo->out) {
- *copied = 0;
- return 0;
- }
-
- n = __kfifo_peek_n(fifo, recsize);
- if (len > n)
- len = n;
-
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->out += n + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_to_user_r);
-
-unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
-
-void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
- unsigned int len, size_t recsize)
-{
- len = __kfifo_max_r(len, recsize);
- __kfifo_poke_n(fifo, len, recsize);
- fifo->in += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
-
-unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > fifo->in - fifo->out)
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
-
-void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int len;
-
- len = __kfifo_peek_n(fifo, recsize);
- fifo->out += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934..8637e041a24 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -45,6 +47,13 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
+/*
+ * kmod_thread_locker is used for deadlock avoidance. There is no explicit
+ * locking to protect this global - it is private to the singleton khelper
+ * thread and should only ever be modified by that thread.
+ */
+static const struct task_struct *kmod_thread_locker;
+
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -60,6 +69,51 @@ static DECLARE_RWSEM(umhelper_sem);
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+static void free_modprobe_argv(struct subprocess_info *info)
+{
+ kfree(info->argv[3]); /* check call_modprobe() */
+ kfree(info->argv);
+}
+
+static int call_modprobe(char *module_name, int wait)
+{
+ struct subprocess_info *info;
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+
+ char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+ if (!argv)
+ goto out;
+
+ module_name = kstrdup(module_name, GFP_KERNEL);
+ if (!module_name)
+ goto free_argv;
+
+ argv[0] = modprobe_path;
+ argv[1] = "-q";
+ argv[2] = "--";
+ argv[3] = module_name; /* check free_modprobe_argv() */
+ argv[4] = NULL;
+
+ info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
+ NULL, free_modprobe_argv, NULL);
+ if (!info)
+ goto free_module_name;
+
+ return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+
+free_module_name:
+ kfree(module_name);
+free_argv:
+ kfree(argv);
+out:
+ return -ENOMEM;
+}
+
/**
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
@@ -81,15 +135,21 @@ int __request_module(bool wait, const char *fmt, ...)
char module_name[MODULE_NAME_LEN];
unsigned int max_modprobes;
int ret;
- char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
- static char *envp[] = { "HOME=/",
- "TERM=linux",
- "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL };
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ /*
+ * We don't allow synchronous module loading from async. Module
+ * init may invoke async_synchronize_full() which will end up
+ * waiting for this task which already is waiting for the module
+ * loading to complete, leading to a deadlock.
+ */
+ WARN_ON_ONCE(wait && current_is_async());
+
+ if (!modprobe_path[0])
+ return 0;
+
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
@@ -128,9 +188,7 @@ int __request_module(bool wait, const char *fmt, ...)
trace_module_request(module_name, wait, _RET_IP_);
- ret = call_usermodehelper_fns(modprobe_path, argv, envp,
- wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
- NULL, NULL, NULL);
+ ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
@@ -181,9 +239,11 @@ static int ____call_usermodehelper(void *data)
commit_creds(new);
- retval = kernel_execve(sub_info->path,
- (const char *const *)sub_info->argv,
- (const char *const *)sub_info->envp);
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
+ if (!retval)
+ return 0;
/* Exec failed? */
fail:
@@ -191,13 +251,32 @@ fail:
do_exit(0);
}
-void call_usermodehelper_freeinfo(struct subprocess_info *info)
+static int call_helper(void *data)
+{
+ /* Worker thread started blocking khelper thread. */
+ kmod_thread_locker = current;
+ return ____call_usermodehelper(data);
+}
+
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
(*info->cleanup)(info);
kfree(info);
}
-EXPORT_SYMBOL(call_usermodehelper_freeinfo);
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
@@ -206,10 +285,7 @@ static int wait_for_helper(void *data)
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
- spin_unlock_irq(&current->sighand->siglock);
-
+ kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
@@ -235,8 +311,8 @@ static int wait_for_helper(void *data)
sub_info->retval = ret;
}
- complete(sub_info->complete);
- return 0;
+ umh_complete(sub_info);
+ do_exit(0);
}
/* This is run by khelper thread */
@@ -244,7 +320,7 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- enum umh_wait wait = sub_info->wait;
+ int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
/* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -253,9 +329,12 @@ static void __call_usermodehelper(struct work_struct *work)
if (wait == UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
- else
- pid = kernel_thread(____call_usermodehelper, sub_info,
+ else {
+ pid = kernel_thread(call_helper, sub_info,
CLONE_VFORK | SIGCHLD);
+ /* Worker thread stopped blocking khelper thread. */
+ kmod_thread_locker = NULL;
+ }
switch (wait) {
case UMH_NO_WAIT:
@@ -269,7 +348,7 @@ static void __call_usermodehelper(struct work_struct *work)
case UMH_WAIT_EXEC:
if (pid < 0)
sub_info->retval = pid;
- complete(sub_info->complete);
+ umh_complete(sub_info);
}
}
@@ -279,7 +358,7 @@ static void __call_usermodehelper(struct work_struct *work)
* land has been frozen during a system-wide hibernation or suspend operation).
* Should always be manipulated under umhelper_sem acquired for write.
*/
-static int usermodehelper_disabled = 1;
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
@@ -291,32 +370,110 @@ static atomic_t running_helpers = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
* Time to wait for running_helpers to become zero before the setting of
* usermodehelper_disabled in usermodehelper_disable() fails
*/
#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-void read_lock_usermodehelper(void)
+int usermodehelper_read_trylock(void)
{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
}
-EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
-void read_unlock_usermodehelper(void)
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
+
+void usermodehelper_read_unlock(void)
{
up_read(&umhelper_sem);
}
-EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
/**
- * usermodehelper_disable - prevent new helpers from being started
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
+ */
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
*/
-int usermodehelper_disable(void)
+int __usermodehelper_disable(enum umh_disable_depth depth)
{
long retval;
+ if (!depth)
+ return -EINVAL;
+
down_write(&umhelper_sem);
- usermodehelper_disabled = 1;
+ usermodehelper_disabled = depth;
up_write(&umhelper_sem);
/*
@@ -331,35 +488,14 @@ int usermodehelper_disable(void)
if (retval)
return 0;
- down_write(&umhelper_sem);
- usermodehelper_disabled = 0;
- up_write(&umhelper_sem);
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
return -EAGAIN;
}
-/**
- * usermodehelper_enable - allow new helpers to be started again
- */
-void usermodehelper_enable(void)
-{
- down_write(&umhelper_sem);
- usermodehelper_disabled = 0;
- up_write(&umhelper_sem);
-}
-
-/**
- * usermodehelper_is_disabled - check if new helpers are allowed to be started
- */
-bool usermodehelper_is_disabled(void)
-{
- return usermodehelper_disabled;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
-
static void helper_lock(void)
{
atomic_inc(&running_helpers);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
}
static void helper_unlock(void)
@@ -374,13 +510,28 @@ static void helper_unlock(void)
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is just before ethe subprocess_info is about to
+ * be freed. This can be used for freeing the argv and envp. The
+ * Function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
*/
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
- char **envp, gfp_t gfp_mask)
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
@@ -391,64 +542,51 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
out:
return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
- * call_usermodehelper_setfns - set a cleanup/init function
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-void call_usermodehelper_setfns(struct subprocess_info *info,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- info->cleanup = cleanup;
- info->init = init;
- info->data = data;
-}
-EXPORT_SYMBOL(call_usermodehelper_setfns);
-
-/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocessa
* @wait: wait for the application to finish and return status.
- * when -1 don't wait at all, but you get no useful error back when
- * the program couldn't be exec'ed. This makes it safe to call
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
-int call_usermodehelper_exec(struct subprocess_info *sub_info,
- enum umh_wait wait)
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
helper_lock();
- if (sub_info->path[0] == '\0')
- goto out;
-
if (!khelper_wq || usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
+ /*
+ * Worker thread must not wait for khelper thread at below
+ * wait_for_completion() if the thread was created with CLONE_VFORK
+ * flag, for khelper thread is already waiting for the thread at
+ * wait_for_completion() in do_fork().
+ */
+ if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
+ retval = -EBUSY;
+ goto out;
+ }
sub_info->complete = &done;
sub_info->wait = wait;
@@ -456,9 +594,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
queue_work(khelper_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
wait_for_completion(&done);
+wait_done:
retval = sub_info->retval;
-
out:
call_usermodehelper_freeinfo(sub_info);
unlock:
@@ -467,6 +617,33 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is the equivalent to use call_usermodehelper_setup() and
+ * call_usermodehelper_exec().
+ */
+int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
+ if (info == NULL)
+ return -ENOMEM;
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
static int proc_cap_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9788c0ec6f4..734e9a7d280 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
return &(kretprobe_table_locks[hash].lock);
}
-/*
- * Normally, functions that we'd want to prohibit kprobes in, are marked
- * __kprobes. But, there are cases where such functions already belong to
- * a different section (__sched for preempt_schedule)
- *
- * For such cases, we now have a blacklist
- */
-static struct kprobe_blackpoint kprobe_blacklist[] = {
- {"preempt_schedule",},
- {"native_get_debugreg",},
- {"irq_entries_start",},
- {"common_interrupt",},
- {"mcount",}, /* mcount can be called from everywhere */
- {NULL} /* Terminator */
-};
+/* Blacklist -- list of struct kprobe_blacklist_entry */
+static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
@@ -112,6 +99,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
struct kprobe_insn_page {
struct list_head list;
kprobe_opcode_t *insns; /* Page of instruction slots */
+ struct kprobe_insn_cache *cache;
int nused;
int ngarbage;
char slot_used[];
@@ -121,12 +109,6 @@ struct kprobe_insn_page {
(offsetof(struct kprobe_insn_page, slot_used) + \
(sizeof(char) * (slots)))
-struct kprobe_insn_cache {
- struct list_head pages; /* list of kprobe_insn_page */
- size_t insn_size; /* size of instruction slot */
- int nr_garbage;
-};
-
static int slots_per_page(struct kprobe_insn_cache *c)
{
return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -138,22 +120,36 @@ enum kprobe_slot_state {
SLOT_USED = 2,
};
-static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
-static struct kprobe_insn_cache kprobe_insn_slots = {
+static void *alloc_insn_page(void)
+{
+ return module_alloc(PAGE_SIZE);
+}
+
+static void free_insn_page(void *page)
+{
+ module_free(NULL, page);
+}
+
+struct kprobe_insn_cache kprobe_insn_slots = {
+ .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
+ .alloc = alloc_insn_page,
+ .free = free_insn_page,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
};
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
+static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
-static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
+kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
+ kprobe_opcode_t *slot = NULL;
+ mutex_lock(&c->mutex);
retry:
list_for_each_entry(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
@@ -162,7 +158,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
if (kip->slot_used[i] == SLOT_CLEAN) {
kip->slot_used[i] = SLOT_USED;
kip->nused++;
- return kip->insns + (i * c->insn_size);
+ slot = kip->insns + (i * c->insn_size);
+ goto out;
}
}
/* kip->nused is broken. Fix it. */
@@ -178,41 +175,33 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
/* All out of space. Need to allocate a new page. */
kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
if (!kip)
- return NULL;
+ goto out;
/*
* Use module_alloc so this page is within +/- 2GB of where the
* kernel image and loaded module images reside. This is required
* so x86_64 can correctly handle the %rip-relative fixups.
*/
- kip->insns = module_alloc(PAGE_SIZE);
+ kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
- return NULL;
+ goto out;
}
INIT_LIST_HEAD(&kip->list);
memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
kip->slot_used[0] = SLOT_USED;
kip->nused = 1;
kip->ngarbage = 0;
+ kip->cache = c;
list_add(&kip->list, &c->pages);
- return kip->insns;
-}
-
-
-kprobe_opcode_t __kprobes *get_insn_slot(void)
-{
- kprobe_opcode_t *ret = NULL;
-
- mutex_lock(&kprobe_insn_mutex);
- ret = __get_insn_slot(&kprobe_insn_slots);
- mutex_unlock(&kprobe_insn_mutex);
-
- return ret;
+ slot = kip->insns;
+out:
+ mutex_unlock(&c->mutex);
+ return slot;
}
/* Return 1 if all garbages are collected, otherwise 0. */
-static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -225,7 +214,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
*/
if (!list_is_singular(&kip->list)) {
list_del(&kip->list);
- module_free(NULL, kip->insns);
+ kip->cache->free(kip->insns);
kfree(kip);
}
return 1;
@@ -233,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
return 0;
}
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
+static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip, *next;
@@ -255,11 +244,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
-static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
- kprobe_opcode_t *slot, int dirty)
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
+ mutex_lock(&c->mutex);
list_for_each_entry(kip, &c->pages, list) {
long idx = ((long)slot - (long)kip->insns) /
(c->insn_size * sizeof(kprobe_opcode_t));
@@ -272,45 +262,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
collect_garbage_slots(c);
} else
collect_one_slot(kip, idx);
- return;
+ goto out;
}
}
/* Could not free this slot. */
WARN_ON(1);
+out:
+ mutex_unlock(&c->mutex);
}
-void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
-{
- mutex_lock(&kprobe_insn_mutex);
- __free_insn_slot(&kprobe_insn_slots, slot, dirty);
- mutex_unlock(&kprobe_insn_mutex);
-}
#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
-static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
-static struct kprobe_insn_cache kprobe_optinsn_slots = {
+struct kprobe_insn_cache kprobe_optinsn_slots = {
+ .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
+ .alloc = alloc_insn_page,
+ .free = free_insn_page,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
};
-/* Get a slot for optimized_kprobe buffer */
-kprobe_opcode_t __kprobes *get_optinsn_slot(void)
-{
- kprobe_opcode_t *ret = NULL;
-
- mutex_lock(&kprobe_optinsn_mutex);
- ret = __get_insn_slot(&kprobe_optinsn_slots);
- mutex_unlock(&kprobe_optinsn_mutex);
-
- return ret;
-}
-
-void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
-{
- mutex_lock(&kprobe_optinsn_mutex);
- __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
- mutex_unlock(&kprobe_optinsn_mutex);
-}
#endif
#endif
@@ -331,22 +301,22 @@ static inline void reset_kprobe_instance(void)
* OR
* - with preemption disabled - from arch/xxx/kernel/kprobes.c
*/
-struct kprobe __kprobes *get_kprobe(void *addr)
+struct kprobe *get_kprobe(void *addr)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (p->addr == addr)
return p;
}
return NULL;
}
+NOKPROBE_SYMBOL(get_kprobe);
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -378,7 +348,7 @@ static bool kprobes_allow_optimization;
* Call all pre_handler on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
-void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -390,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
}
}
+NOKPROBE_SYMBOL(opt_pre_handler);
/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -430,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
}
/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int __kprobes kprobe_queued(struct kprobe *p)
+static int kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -446,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
*/
-static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
int i;
struct kprobe *p = NULL;
@@ -468,17 +439,17 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
+static LIST_HEAD(freeing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
-static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
/*
* Optimize (replace a breakpoint with a jump) kprobes listed on
* optimizing_list.
*/
-static __kprobes void do_optimize_kprobes(void)
+static void do_optimize_kprobes(void)
{
/* Optimization never be done when disarmed */
if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -506,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
* if need) kprobes listed on unoptimizing_list.
*/
-static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -517,9 +488,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
/* Ditto to do_optimize_kprobes */
get_online_cpus();
mutex_lock(&text_mutex);
- arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
- list_for_each_entry_safe(op, tmp, free_list, list) {
+ list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -538,11 +509,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
}
/* Reclaim all kprobes on the free_list */
-static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
- list_for_each_entry_safe(op, tmp, free_list, list) {
+ list_for_each_entry_safe(op, tmp, &freeing_list, list) {
BUG_ON(!kprobe_unused(&op->kp));
list_del_init(&op->list);
free_aggr_kprobe(&op->kp);
@@ -550,26 +521,23 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
}
/* Start optimizer after OPTIMIZE_DELAY passed */
-static __kprobes void kick_kprobe_optimizer(void)
+static void kick_kprobe_optimizer(void)
{
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+static void kprobe_optimizer(struct work_struct *work)
{
- LIST_HEAD(free_list);
-
+ mutex_lock(&kprobe_mutex);
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
- mutex_lock(&kprobe_mutex);
/*
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
* kprobes before waiting for quiesence period.
*/
- do_unoptimize_kprobes(&free_list);
+ do_unoptimize_kprobes();
/*
* Step 2: Wait for quiesence period to ensure all running interrupts
@@ -584,28 +552,37 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
do_optimize_kprobes();
/* Step 4: Free cleaned kprobes after quiesence period */
- do_free_cleaned_kprobes(&free_list);
+ do_free_cleaned_kprobes();
- mutex_unlock(&kprobe_mutex);
mutex_unlock(&module_mutex);
+ mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
- else
- /* Wake up all waiters */
- complete_all(&optimizer_comp);
}
/* Wait for completing optimization and unoptimization */
-static __kprobes void wait_for_kprobe_optimizer(void)
+static void wait_for_kprobe_optimizer(void)
{
- if (delayed_work_pending(&optimizing_work))
- wait_for_completion(&optimizer_comp);
+ mutex_lock(&kprobe_mutex);
+
+ while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
+ mutex_unlock(&kprobe_mutex);
+
+ /* this will also make optimizing_work execute immmediately */
+ flush_delayed_work(&optimizing_work);
+ /* @optimizing_work might not have been queued yet, relax */
+ cpu_relax();
+
+ mutex_lock(&kprobe_mutex);
+ }
+
+ mutex_unlock(&kprobe_mutex);
}
/* Optimize kprobe if p is ready to be optimized */
-static __kprobes void optimize_kprobe(struct kprobe *p)
+static void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -639,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
}
/* Short cut to direct unoptimizing */
-static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
get_online_cpus();
arch_unoptimize_kprobe(op);
@@ -649,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
}
/* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
+static void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
@@ -709,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
}
/* Remove optimized instructions */
-static void __kprobes kill_optimized_kprobe(struct kprobe *p)
+static void kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -717,14 +694,25 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
if (!list_empty(&op->list))
/* Dequeue from the (un)optimization queue */
list_del_init(&op->list);
-
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+
+ if (kprobe_unused(p)) {
+ /* Enqueue if it is unused */
+ list_add(&op->list, &freeing_list);
+ /*
+ * Remove unused probes from the hash list. After waiting
+ * for synchronization, this probe is reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes().)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ }
+
/* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
/* Try to prepare optimized instructions */
-static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
+static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -733,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
-static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -748,81 +736,98 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
return &op->kp;
}
-static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
* Prepare an optimized_kprobe and optimize it
* NOTE: p must be a normal registered kprobe
*/
-static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
+static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
+ /* Impossible to optimize ftrace-based kprobe */
+ if (kprobe_ftrace(p))
+ return;
+
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ jump_label_lock();
+ mutex_lock(&text_mutex);
+
ap = alloc_aggr_kprobe(p);
if (!ap)
- return;
+ goto out;
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe */
arch_remove_optimized_kprobe(op);
kfree(op);
- return;
+ goto out;
}
init_aggr_kprobe(ap, p);
- optimize_kprobe(ap);
+ optimize_kprobe(ap); /* This just kicks optimizer thread */
+
+out:
+ mutex_unlock(&text_mutex);
+ jump_label_unlock();
}
#ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
-static void __kprobes optimize_all_kprobes(void)
+static void optimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
- return;
+ goto out;
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+ mutex_unlock(&kprobe_mutex);
}
-/* This should be called with kprobe_mutex locked */
-static void __kprobes unoptimize_all_kprobes(void)
+static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
- if (!kprobes_allow_optimization)
+ if (!kprobes_allow_optimization) {
+ mutex_unlock(&kprobe_mutex);
return;
+ }
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
}
+ mutex_unlock(&kprobe_mutex);
+
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@@ -830,7 +835,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
{
int ret;
- mutex_lock(&kprobe_mutex);
+ mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -838,14 +843,14 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_mutex);
+ mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
#endif /* CONFIG_SYSCTL */
/* Put a breakpoint for a probe. Must be called with text_mutex locked */
-static void __kprobes __arm_kprobe(struct kprobe *p)
+static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
@@ -860,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
}
/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
-static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
+static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
@@ -895,21 +900,76 @@ static void reuse_unused_kprobe(struct kprobe *ap)
BUG_ON(kprobe_unused(ap));
}
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
{
arch_remove_kprobe(p);
kfree(p);
}
-static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
#endif /* CONFIG_OPTPROBES */
+#ifdef CONFIG_KPROBES_ON_FTRACE
+static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+static int kprobe_ftrace_enabled;
+
+/* Must ensure p->addr is really on ftrace */
+static int prepare_kprobe(struct kprobe *p)
+{
+ if (!kprobe_ftrace(p))
+ return arch_prepare_kprobe(p);
+
+ return arch_prepare_kprobe_ftrace(p);
+}
+
+/* Caller must lock kprobe_mutex */
+static void arm_kprobe_ftrace(struct kprobe *p)
+{
+ int ret;
+
+ ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+ (unsigned long)p->addr, 0, 0);
+ WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+ kprobe_ftrace_enabled++;
+ if (kprobe_ftrace_enabled == 1) {
+ ret = register_ftrace_function(&kprobe_ftrace_ops);
+ WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ }
+}
+
+/* Caller must lock kprobe_mutex */
+static void disarm_kprobe_ftrace(struct kprobe *p)
+{
+ int ret;
+
+ kprobe_ftrace_enabled--;
+ if (kprobe_ftrace_enabled == 0) {
+ ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+ WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ }
+ ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+ (unsigned long)p->addr, 1, 0);
+ WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+}
+#else /* !CONFIG_KPROBES_ON_FTRACE */
+#define prepare_kprobe(p) arch_prepare_kprobe(p)
+#define arm_kprobe_ftrace(p) do {} while (0)
+#define disarm_kprobe_ftrace(p) do {} while (0)
+#endif
+
/* Arm a kprobe with text_mutex */
-static void __kprobes arm_kprobe(struct kprobe *kp)
+static void arm_kprobe(struct kprobe *kp)
{
+ if (unlikely(kprobe_ftrace(kp))) {
+ arm_kprobe_ftrace(kp);
+ return;
+ }
/*
* Here, since __arm_kprobe() doesn't use stop_machine(),
* this doesn't cause deadlock on text_mutex. So, we don't
@@ -921,11 +981,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
}
/* Disarm a kprobe with text_mutex */
-static void __kprobes disarm_kprobe(struct kprobe *kp)
+static void disarm_kprobe(struct kprobe *kp, bool reopt)
{
+ if (unlikely(kprobe_ftrace(kp))) {
+ disarm_kprobe_ftrace(kp);
+ return;
+ }
/* Ditto */
mutex_lock(&text_mutex);
- __disarm_kprobe(kp, true);
+ __disarm_kprobe(kp, reopt);
mutex_unlock(&text_mutex);
}
@@ -933,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp)
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -947,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(aggr_pre_handler);
-static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
{
struct kprobe *kp;
@@ -961,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
}
}
+NOKPROBE_SYMBOL(aggr_post_handler);
-static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
@@ -977,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
}
return 0;
}
+NOKPROBE_SYMBOL(aggr_fault_handler);
-static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
int ret = 0;
@@ -990,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
return ret;
}
+NOKPROBE_SYMBOL(aggr_break_handler);
/* Walks the list and increments nmissed count for multiprobe case */
-void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
+void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
if (!kprobe_aggrprobe(p)) {
@@ -1003,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
return;
}
+NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
- struct hlist_head *head)
+void recycle_rp_inst(struct kretprobe_instance *ri,
+ struct hlist_head *head)
{
struct kretprobe *rp = ri->rp;
@@ -1020,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
/* Unregistering */
hlist_add_head(&ri->hlist, head);
}
+NOKPROBE_SYMBOL(recycle_rp_inst);
-void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
+void kretprobe_hash_lock(struct task_struct *tsk,
struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
@@ -1032,17 +1102,19 @@ __acquires(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_hash_lock);
-static void __kprobes kretprobe_table_lock(unsigned long hash,
- unsigned long *flags)
+static void kretprobe_table_lock(unsigned long hash,
+ unsigned long *flags)
__acquires(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_table_lock);
-void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
- unsigned long *flags)
+void kretprobe_hash_unlock(struct task_struct *tsk,
+ unsigned long *flags)
__releases(hlist_lock)
{
unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1051,14 +1123,16 @@ __releases(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_hash_unlock);
-static void __kprobes kretprobe_table_unlock(unsigned long hash,
- unsigned long *flags)
+static void kretprobe_table_unlock(unsigned long hash,
+ unsigned long *flags)
__releases(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_table_unlock);
/*
* This function is called from finish_task_switch when task tk becomes dead,
@@ -1066,11 +1140,11 @@ __releases(hlist_lock)
* with this task. These left over instances represent probed functions
* that have been called but will never return.
*/
-void __kprobes kprobe_flush_task(struct task_struct *tk)
+void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
unsigned long hash, flags = 0;
if (unlikely(!kprobes_initialized))
@@ -1081,40 +1155,41 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task == tk)
recycle_rp_inst(ri, &empty_rp);
}
kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
}
+NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
- hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+ hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
}
-static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
+static void cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
struct hlist_head *head;
/* No race here */
for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
kretprobe_table_lock(hash, &flags);
head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+ hlist_for_each_entry_safe(ri, next, head, hlist) {
if (ri->rp == rp)
ri->rp = NULL;
}
@@ -1122,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
}
free_rp_inst(rp);
}
+NOKPROBE_SYMBOL(cleanup_rp_inst);
/*
* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
-static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
+static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
@@ -1144,12 +1220,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
if (p->post_handler && !ap->post_handler)
ap->post_handler = aggr_post_handler;
- if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
- ap->flags &= ~KPROBE_FLAG_DISABLED;
- if (!kprobes_all_disarmed)
- /* Arm the breakpoint again. */
- __arm_kprobe(ap);
- }
return 0;
}
@@ -1157,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
-static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
/* Copy p's insn slot to ap */
copy_kprobe(p, ap);
@@ -1183,17 +1253,27 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
-static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
- struct kprobe *p)
+static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
int ret = 0;
struct kprobe *ap = orig_p;
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ jump_label_lock();
+ /*
+ * Get online CPUs to avoid text_mutex deadlock.with stop machine,
+ * which is invoked by unoptimize_kprobe() in add_new_kprobe()
+ */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+
if (!kprobe_aggrprobe(orig_p)) {
/* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
ap = alloc_aggr_kprobe(orig_p);
- if (!ap)
- return -ENOMEM;
+ if (!ap) {
+ ret = -ENOMEM;
+ goto out;
+ }
init_aggr_kprobe(ap, orig_p);
} else if (kprobe_unused(ap))
/* This probe is going to die. Rescue it */
@@ -1213,7 +1293,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
* free aggr_probe. It will be used next time, or
* freed by unregister_kprobe.
*/
- return ret;
+ goto out;
/* Prepare optimized instructions if possible. */
prepare_optimized_kprobe(ap);
@@ -1228,28 +1308,45 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
/* Copy ap's insn slot to p */
copy_kprobe(ap, p);
- return add_new_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+
+out:
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+ jump_label_unlock();
+
+ if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ if (!kprobes_all_disarmed)
+ /* Arm the breakpoint again. */
+ arm_kprobe(ap);
+ }
+ return ret;
}
-static int __kprobes in_kprobes_functions(unsigned long addr)
+bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- struct kprobe_blackpoint *kb;
+ /* The __kprobes marked functions and entry code must not be probed */
+ return addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end;
+}
- if (addr >= (unsigned long)__kprobes_text_start &&
- addr < (unsigned long)__kprobes_text_end)
- return -EINVAL;
+static bool within_kprobe_blacklist(unsigned long addr)
+{
+ struct kprobe_blacklist_entry *ent;
+
+ if (arch_within_kprobe_blacklist(addr))
+ return true;
/*
* If there exists a kprobe_blacklist, verify and
* fail any probe registration in the prohibited area
*/
- for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
- if (kb->start_addr) {
- if (addr >= kb->start_addr &&
- addr < (kb->start_addr + kb->range))
- return -EINVAL;
- }
+ list_for_each_entry(ent, &kprobe_blacklist, list) {
+ if (addr >= ent->start_addr && addr < ent->end_addr)
+ return true;
}
- return 0;
+
+ return false;
}
/*
@@ -1258,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
kprobe_opcode_t *addr = p->addr;
@@ -1281,7 +1378,7 @@ invalid:
}
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1313,69 +1410,96 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-int __kprobes register_kprobe(struct kprobe *p)
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
{
int ret = 0;
- struct kprobe *old_p;
- struct module *probed_mod;
- kprobe_opcode_t *addr;
+ unsigned long ftrace_addr;
- addr = kprobe_addr(p);
- if (IS_ERR(addr))
- return PTR_ERR(addr);
- p->addr = addr;
-
- ret = check_kprobe_rereg(p);
- if (ret)
- return ret;
+ /*
+ * If the address is located on a ftrace nop, set the
+ * breakpoint to the following instruction.
+ */
+ ftrace_addr = ftrace_location((unsigned long)p->addr);
+ if (ftrace_addr) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+ /* Given address is not on the instruction boundary */
+ if ((unsigned long)p->addr != ftrace_addr)
+ return -EILSEQ;
+ p->flags |= KPROBE_FLAG_FTRACE;
+#else /* !CONFIG_KPROBES_ON_FTRACE */
+ return -EINVAL;
+#endif
+ }
jump_label_lock();
preempt_disable();
- if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr) ||
- ftrace_text_reserved(p->addr, p->addr) ||
- jump_label_text_reserved(p->addr, p->addr))
- goto fail_with_jump_label;
- /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
- p->flags &= KPROBE_FLAG_DISABLED;
+ /* Ensure it is not in reserved area nor out of text */
+ if (!kernel_text_address((unsigned long) p->addr) ||
+ within_kprobe_blacklist((unsigned long) p->addr) ||
+ jump_label_text_reserved(p->addr, p->addr)) {
+ ret = -EINVAL;
+ goto out;
+ }
- /*
- * Check if are we probing a module.
- */
- probed_mod = __module_text_address((unsigned long) p->addr);
- if (probed_mod) {
- /* Return -ENOENT if fail. */
- ret = -ENOENT;
+ /* Check if are we probing a module */
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
- if (unlikely(!try_module_get(probed_mod)))
- goto fail_with_jump_label;
+ if (unlikely(!try_module_get(*probed_mod))) {
+ ret = -ENOENT;
+ goto out;
+ }
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
*/
- if (within_module_init((unsigned long)p->addr, probed_mod) &&
- probed_mod->state != MODULE_STATE_COMING) {
- module_put(probed_mod);
- goto fail_with_jump_label;
+ if (within_module_init((unsigned long)p->addr, *probed_mod) &&
+ (*probed_mod)->state != MODULE_STATE_COMING) {
+ module_put(*probed_mod);
+ *probed_mod = NULL;
+ ret = -ENOENT;
}
- /* ret will be updated by following code */
}
+out:
preempt_enable();
jump_label_unlock();
+ return ret;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret;
+ struct kprobe *old_p;
+ struct module *probed_mod;
+ kprobe_opcode_t *addr;
+
+ /* Adjust probe address from symbol */
+ addr = kprobe_addr(p);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+ p->addr = addr;
+
+ ret = check_kprobe_rereg(p);
+ if (ret)
+ return ret;
+
+ /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+ p->flags &= KPROBE_FLAG_DISABLED;
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
- mutex_lock(&kprobe_mutex);
- jump_label_lock(); /* needed to call jump_label_text_reserved() */
+ ret = check_kprobe_address_safe(p, &probed_mod);
+ if (ret)
+ return ret;
- get_online_cpus(); /* For avoiding text_mutex deadlock. */
- mutex_lock(&text_mutex);
+ mutex_lock(&kprobe_mutex);
old_p = get_kprobe(p->addr);
if (old_p) {
@@ -1384,7 +1508,9 @@ int __kprobes register_kprobe(struct kprobe *p)
goto out;
}
- ret = arch_prepare_kprobe(p);
+ mutex_lock(&text_mutex); /* Avoiding text modification */
+ ret = prepare_kprobe(p);
+ mutex_unlock(&text_mutex);
if (ret)
goto out;
@@ -1393,31 +1519,23 @@ int __kprobes register_kprobe(struct kprobe *p)
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
if (!kprobes_all_disarmed && !kprobe_disabled(p))
- __arm_kprobe(p);
+ arm_kprobe(p);
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
out:
- mutex_unlock(&text_mutex);
- put_online_cpus();
- jump_label_unlock();
mutex_unlock(&kprobe_mutex);
if (probed_mod)
module_put(probed_mod);
return ret;
-
-fail_with_jump_label:
- preempt_enable();
- jump_label_unlock();
- return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);
/* Check if all probes on the aggrprobe are disabled */
-static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1433,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
}
/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
-static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
@@ -1449,7 +1567,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- disarm_kprobe(orig_p);
+ disarm_kprobe(orig_p, true);
orig_p->flags |= KPROBE_FLAG_DISABLED;
}
}
@@ -1460,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
/*
* Unregister a kprobe without a scheduler synchronization.
*/
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+static int __unregister_kprobe_top(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1517,7 +1635,7 @@ disarmed:
return 0;
}
-static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
+static void __unregister_kprobe_bottom(struct kprobe *p)
{
struct kprobe *ap;
@@ -1533,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
/* Otherwise, do nothing. */
}
-int __kprobes register_kprobes(struct kprobe **kps, int num)
+int register_kprobes(struct kprobe **kps, int num)
{
int i, ret = 0;
@@ -1551,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(register_kprobes);
-void __kprobes unregister_kprobe(struct kprobe *p)
+void unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
-void __kprobes unregister_kprobes(struct kprobe **kps, int num)
+void unregister_kprobes(struct kprobe **kps, int num)
{
int i;
@@ -1586,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
-int __kprobes register_jprobes(struct jprobe **jps, int num)
+int register_jprobes(struct jprobe **jps, int num)
{
struct jprobe *jp;
int ret = 0, i;
@@ -1617,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
EXPORT_SYMBOL_GPL(register_jprobes);
-int __kprobes register_jprobe(struct jprobe *jp)
+int register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(register_jprobe);
-void __kprobes unregister_jprobe(struct jprobe *jp)
+void unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(unregister_jprobe);
-void __kprobes unregister_jprobes(struct jprobe **jps, int num)
+void unregister_jprobes(struct jprobe **jps, int num)
{
int i;
@@ -1654,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*/
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
- struct pt_regs *regs)
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
unsigned long hash, flags = 0;
@@ -1693,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
}
return 0;
}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-int __kprobes register_kretprobe(struct kretprobe *rp)
+int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
@@ -1747,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
}
EXPORT_SYMBOL_GPL(register_kretprobe);
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+int register_kretprobes(struct kretprobe **rps, int num)
{
int ret = 0, i;
@@ -1765,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
EXPORT_SYMBOL_GPL(register_kretprobes);
-void __kprobes unregister_kretprobe(struct kretprobe *rp)
+void unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
-void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+void unregister_kretprobes(struct kretprobe **rps, int num)
{
int i;
@@ -1794,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
-int __kprobes register_kretprobe(struct kretprobe *rp)
+int register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+int register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
-void __kprobes unregister_kretprobe(struct kretprobe *rp)
+void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
-void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
- struct pt_regs *regs)
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
return 0;
}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
#endif /* CONFIG_KRETPROBES */
/* Set the kprobe gone and remove its instruction buffer. */
-static void __kprobes kill_kprobe(struct kprobe *p)
+static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
@@ -1849,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
}
/* Disable one kprobe */
-int __kprobes disable_kprobe(struct kprobe *kp)
+int disable_kprobe(struct kprobe *kp)
{
int ret = 0;
@@ -1865,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
EXPORT_SYMBOL_GPL(disable_kprobe);
/* Enable one kprobe */
-int __kprobes enable_kprobe(struct kprobe *kp)
+int enable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
@@ -1898,20 +2016,56 @@ out:
}
EXPORT_SYMBOL_GPL(enable_kprobe);
-void __kprobes dump_kprobe(struct kprobe *kp)
+void dump_kprobe(struct kprobe *kp)
{
printk(KERN_WARNING "Dumping kprobe:\n");
printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
kp->symbol_name, kp->addr, kp->offset);
}
+NOKPROBE_SYMBOL(dump_kprobe);
+
+/*
+ * Lookup and populate the kprobe_blacklist.
+ *
+ * Unlike the kretprobe blacklist, we'll need to determine
+ * the range of addresses that belong to the said functions,
+ * since a kprobe need not necessarily be at the beginning
+ * of a function.
+ */
+static int __init populate_kprobe_blacklist(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *iter;
+ struct kprobe_blacklist_entry *ent;
+ unsigned long entry, offset = 0, size = 0;
+
+ for (iter = start; iter < end; iter++) {
+ entry = arch_deref_entry_point((void *)*iter);
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+ pr_err("Failed to find blacklist at %p\n",
+ (void *)entry);
+ continue;
+ }
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
+ INIT_LIST_HEAD(&ent->list);
+ list_add_tail(&ent->list, &kprobe_blacklist);
+ }
+ return 0;
+}
/* Module notifier call back, checking kprobes on the module */
-static int __kprobes kprobes_module_callback(struct notifier_block *nb,
- unsigned long val, void *data)
+static int kprobes_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
{
struct module *mod = data;
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
@@ -1928,7 +2082,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -1949,14 +2103,13 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
+/* Markers of _kprobe_blacklist section */
+extern unsigned long __start_kprobe_blacklist[];
+extern unsigned long __stop_kprobe_blacklist[];
+
static int __init init_kprobes(void)
{
int i, err = 0;
- unsigned long offset = 0, size = 0;
- char *modname, namebuf[128];
- const char *symbol_name;
- void *addr;
- struct kprobe_blackpoint *kb;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
@@ -1966,26 +2119,11 @@ static int __init init_kprobes(void)
raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
}
- /*
- * Lookup and populate the kprobe_blacklist.
- *
- * Unlike the kretprobe blacklist, we'll need to determine
- * the range of addresses that belong to the said functions,
- * since a kprobe need not necessarily be at the beginning
- * of a function.
- */
- for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
- kprobe_lookup_name(kb->name, addr);
- if (!addr)
- continue;
-
- kb->start_addr = (unsigned long)addr;
- symbol_name = kallsyms_lookup(kb->start_addr,
- &size, &offset, &modname, namebuf);
- if (!symbol_name)
- kb->range = 0;
- else
- kb->range = size;
+ err = populate_kprobe_blacklist(__start_kprobe_blacklist,
+ __stop_kprobe_blacklist);
+ if (err) {
+ pr_err("kprobes: failed to populate blacklist: %d\n", err);
+ pr_err("Please take care of using kprobes.\n");
}
if (kretprobe_blacklist_size) {
@@ -2025,7 +2163,7 @@ static int __init init_kprobes(void)
}
#ifdef CONFIG_DEBUG_FS
-static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
+static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
@@ -2047,18 +2185,19 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
if (!pp)
pp = p;
- seq_printf(pi, "%s%s%s\n",
+ seq_printf(pi, "%s%s%s%s\n",
(kprobe_gone(p) ? "[GONE]" : ""),
((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
- (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
+ (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
+ (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}
-static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
+static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}
-static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
+static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
if (*pos >= KPROBE_TABLE_SIZE)
@@ -2066,24 +2205,23 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
return pos;
}
-static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
+static void kprobe_seq_stop(struct seq_file *f, void *v)
{
/* Nothing to do */
}
-static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
+static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
- char *modname, namebuf[128];
+ char *modname, namebuf[KSYM_NAME_LEN];
head = &kprobe_table[i];
preempt_disable();
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (kprobe_aggrprobe(p)) {
@@ -2103,7 +2241,7 @@ static const struct seq_operations kprobes_seq_ops = {
.show = show_kprobe_addr
};
-static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
+static int kprobes_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &kprobes_seq_ops);
}
@@ -2115,10 +2253,49 @@ static const struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
-static void __kprobes arm_all_kprobes(void)
+/* kprobes/blacklist -- shows which functions can not be probed */
+static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
+{
+ return seq_list_start(&kprobe_blacklist, *pos);
+}
+
+static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &kprobe_blacklist, pos);
+}
+
+static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
+{
+ struct kprobe_blacklist_entry *ent =
+ list_entry(v, struct kprobe_blacklist_entry, list);
+
+ seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+ (void *)ent->end_addr, (void *)ent->start_addr);
+ return 0;
+}
+
+static const struct seq_operations kprobe_blacklist_seq_ops = {
+ .start = kprobe_blacklist_seq_start,
+ .next = kprobe_blacklist_seq_next,
+ .stop = kprobe_seq_stop, /* Reuse void function */
+ .show = kprobe_blacklist_seq_show,
+};
+
+static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &kprobe_blacklist_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_blacklist_ops = {
+ .open = kprobe_blacklist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void arm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2129,14 +2306,12 @@ static void __kprobes arm_all_kprobes(void)
goto already_enabled;
/* Arming kprobes doesn't optimize kprobe itself */
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
- __arm_kprobe(p);
+ arm_kprobe(p);
}
- mutex_unlock(&text_mutex);
kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
@@ -2146,10 +2321,9 @@ already_enabled:
return;
}
-static void __kprobes disarm_all_kprobes(void)
+static void disarm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2164,15 +2338,13 @@ static void __kprobes disarm_all_kprobes(void)
kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- __disarm_kprobe(p, false);
+ disarm_kprobe(p, false);
}
}
- mutex_unlock(&text_mutex);
mutex_unlock(&kprobe_mutex);
/* Wait for disarming all kprobes by optimizer */
@@ -2208,6 +2380,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
+ buf[buf_size] = '\0';
switch (buf[0]) {
case 'y':
case 'Y':
@@ -2219,6 +2392,8 @@ static ssize_t write_enabled_file_bool(struct file *file,
case '0':
disarm_all_kprobes();
break;
+ default:
+ return -EINVAL;
}
return count;
@@ -2230,7 +2405,7 @@ static const struct file_operations fops_kp = {
.llseek = default_llseek,
};
-static int __kprobes debugfs_kprobe_init(void)
+static int __init debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
unsigned int value = 1;
@@ -2241,19 +2416,24 @@ static int __kprobes debugfs_kprobe_init(void)
file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
- if (!file) {
- debugfs_remove(dir);
- return -ENOMEM;
- }
+ if (!file)
+ goto error;
file = debugfs_create_file("enabled", 0600, dir,
&value, &fops_kp);
- if (!file) {
- debugfs_remove(dir);
- return -ENOMEM;
- }
+ if (!file)
+ goto error;
+
+ file = debugfs_create_file("blacklist", 0444, dir, NULL,
+ &debugfs_kprobe_blacklist_ops);
+ if (!file)
+ goto error;
return 0;
+
+error:
+ debugfs_remove(dir);
+ return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf5..6683ccef9ff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -18,6 +18,9 @@
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
+#include <linux/compiler.h>
+
+#include <linux/rcupdate.h> /* rcu_expedited */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -26,7 +29,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
-#if defined(CONFIG_HOTPLUG)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -35,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
+#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -114,7 +117,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
unsigned long cnt;
int ret;
- if (strict_strtoul(buf, 0, &cnt))
+ if (kstrtoul(buf, 0, &cnt))
return -EINVAL;
ret = crash_shrink_memory(cnt);
@@ -127,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
{
return sprintf(buf, "%lx %x\n",
paddr_vmcoreinfo_note(),
- (unsigned int)vmcoreinfo_max_size);
+ (unsigned int)sizeof(vmcoreinfo_note));
}
KERNEL_ATTR_RO(vmcoreinfo);
@@ -141,11 +144,28 @@ static ssize_t fscaps_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(fscaps);
+int rcu_expedited;
+static ssize_t rcu_expedited_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", rcu_expedited);
+}
+static ssize_t rcu_expedited_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_expedited))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_expedited);
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
-extern const void __start_notes __attribute__((weak));
-extern const void __stop_notes __attribute__((weak));
+extern const void __start_notes __weak;
+extern const void __stop_notes __weak;
#define notes_size (&__stop_notes - &__start_notes)
static ssize_t notes_read(struct file *filp, struct kobject *kobj,
@@ -169,8 +189,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
-#if defined(CONFIG_HOTPLUG)
&uevent_seqnum_attr.attr,
+#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
@@ -182,6 +202,7 @@ static struct attribute * kernel_attrs[] = {
&kexec_crash_size_attr.attr,
&vmcoreinfo_attr.attr,
#endif
+ &rcu_expedited_attr.attr,
NULL
};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702..c2390f41307 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,8 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
+#include <linux/ptrace.h>
+#include <linux/uaccess.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -31,19 +33,41 @@ struct kthread_create_info
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
- struct completion done;
+ struct completion *done;
struct list_head list;
};
struct kthread {
- int should_stop;
+ unsigned long flags;
+ unsigned int cpu;
void *data;
+ struct completion parked;
struct completion exited;
};
-#define to_kthread(tsk) \
- container_of((tsk)->vfork_done, struct kthread, exited)
+enum KTHREAD_BITS {
+ KTHREAD_IS_PER_CPU = 0,
+ KTHREAD_SHOULD_STOP,
+ KTHREAD_SHOULD_PARK,
+ KTHREAD_IS_PARKED,
+};
+
+#define __to_kthread(vfork) \
+ container_of(vfork, struct kthread, exited)
+
+static inline struct kthread *to_kthread(struct task_struct *k)
+{
+ return __to_kthread(k->vfork_done);
+}
+
+static struct kthread *to_live_kthread(struct task_struct *k)
+{
+ struct completion *vfork = ACCESS_ONCE(k->vfork_done);
+ if (likely(vfork))
+ return __to_kthread(vfork);
+ return NULL;
+}
/**
* kthread_should_stop - should this kthread return now?
@@ -52,13 +76,29 @@ struct kthread {
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
-int kthread_should_stop(void)
+bool kthread_should_stop(void)
{
- return to_kthread(current)->should_stop;
+ return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);
/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true. You should then do the necessary
+ * cleanup and call kthread_parkme()
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
+/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
*
@@ -96,30 +136,76 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+/**
+ * probe_kthread_data - speculative version of kthread_data()
+ * @task: possible kthread task in question
+ *
+ * @task could be a kthread task. Return the data value specified when it
+ * was created if accessible. If @task isn't a kthread task or its data is
+ * inaccessible for any reason, %NULL is returned. This function requires
+ * that @task itself is safe to dereference.
+ */
+void *probe_kthread_data(struct task_struct *task)
+{
+ struct kthread *kthread = to_kthread(task);
+ void *data = NULL;
+
+ probe_kernel_read(&data, &kthread->data, sizeof(data));
+ return data;
+}
+
+static void __kthread_parkme(struct kthread *self)
+{
+ __set_current_state(TASK_PARKED);
+ while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+ if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+ complete(&self->parked);
+ schedule();
+ __set_current_state(TASK_PARKED);
+ }
+ clear_bit(KTHREAD_IS_PARKED, &self->flags);
+ __set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+ __kthread_parkme(to_kthread(current));
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
+ struct completion *done;
struct kthread self;
int ret;
- self.should_stop = 0;
+ self.flags = 0;
self.data = data;
init_completion(&self.exited);
+ init_completion(&self.parked);
current->vfork_done = &self.exited;
+ /* If user was SIGKILLed, I release the structure. */
+ done = xchg(&create->done, NULL);
+ if (!done) {
+ kfree(create);
+ do_exit(-EINTR);
+ }
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
- complete(&create->done);
+ complete(done);
schedule();
ret = -EINTR;
- if (!self.should_stop)
- ret = threadfn(data);
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+ __kthread_parkme(&self);
+ ret = threadfn(data);
+ }
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
@@ -131,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk)
if (tsk == kthreadd_task)
return tsk->pref_node_fork;
#endif
- return numa_node_id();
+ return NUMA_NO_NODE;
}
static void create_kthread(struct kthread_create_info *create)
@@ -144,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create)
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
+ /* If user was SIGKILLed, I release the structure. */
+ struct completion *done = xchg(&create->done, NULL);
+
+ if (!done) {
+ kfree(create);
+ return;
+ }
create->result = ERR_PTR(pid);
- complete(&create->done);
+ complete(done);
}
}
@@ -169,47 +262,81 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
- * Returns a task_struct or ERR_PTR(-ENOMEM).
+ * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
- void *data,
- int node,
+ void *data, int node,
const char namefmt[],
...)
{
- struct kthread_create_info create;
-
- create.threadfn = threadfn;
- create.data = data;
- create.node = node;
- init_completion(&create.done);
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct task_struct *task;
+ struct kthread_create_info *create = kmalloc(sizeof(*create),
+ GFP_KERNEL);
+
+ if (!create)
+ return ERR_PTR(-ENOMEM);
+ create->threadfn = threadfn;
+ create->data = data;
+ create->node = node;
+ create->done = &done;
spin_lock(&kthread_create_lock);
- list_add_tail(&create.list, &kthread_create_list);
+ list_add_tail(&create->list, &kthread_create_list);
spin_unlock(&kthread_create_lock);
wake_up_process(kthreadd_task);
- wait_for_completion(&create.done);
-
- if (!IS_ERR(create.result)) {
+ /*
+ * Wait for completion in killable state, for I might be chosen by
+ * the OOM killer while kthreadd is trying to allocate memory for
+ * new kernel thread.
+ */
+ if (unlikely(wait_for_completion_killable(&done))) {
+ /*
+ * If I was SIGKILLed before kthreadd (or new kernel thread)
+ * calls complete(), leave the cleanup of this structure to
+ * that thread.
+ */
+ if (xchg(&create->done, NULL))
+ return ERR_PTR(-EINTR);
+ /*
+ * kthreadd (or new kernel thread) will call complete()
+ * shortly.
+ */
+ wait_for_completion(&done);
+ }
+ task = create->result;
+ if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
- vsnprintf(create.result->comm, sizeof(create.result->comm),
- namefmt, args);
+ vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
va_end(args);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
- sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(create.result, cpu_all_mask);
+ sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(task, cpu_all_mask);
}
- return create.result;
+ kfree(create);
+ return task;
}
EXPORT_SYMBOL(kthread_create_on_node);
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, state)) {
+ WARN_ON(1);
+ return;
+ }
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_NO_SETAFFINITY;
+}
+
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
@@ -221,17 +348,99 @@ EXPORT_SYMBOL(kthread_create_on_node);
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
- if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
- WARN_ON(1);
- return;
+ __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(kthread_bind);
+
+/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound,
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data, unsigned int cpu,
+ const char *namefmt)
+{
+ struct task_struct *p;
+
+ p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
+ cpu);
+ if (IS_ERR(p))
+ return p;
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+ /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+ kthread_park(p);
+ return p;
+}
+
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+{
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+ wake_up_state(k, TASK_PARKED);
}
+}
- /* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
- p->flags |= PF_THREAD_BOUND;
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false, wakes it, and
+ * waits for it to return. If the thread is marked percpu then its
+ * bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+ struct kthread *kthread = to_live_kthread(k);
+
+ if (kthread)
+ __kthread_unpark(k, kthread);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to return. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+ struct kthread *kthread = to_live_kthread(k);
+ int ret = -ENOSYS;
+
+ if (kthread) {
+ if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
+ }
+ }
+ ret = 0;
+ }
+ return ret;
}
-EXPORT_SYMBOL(kthread_bind);
/**
* kthread_stop - stop a thread created by kthread_create().
@@ -254,20 +463,19 @@ int kthread_stop(struct task_struct *k)
int ret;
trace_sched_kthread_stop(k);
- get_task_struct(k);
- kthread = to_kthread(k);
- barrier(); /* it might have exited */
- if (k->vfork_done != NULL) {
- kthread->should_stop = 1;
+ get_task_struct(k);
+ kthread = to_live_kthread(k);
+ if (kthread) {
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ __kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
-
put_task_struct(k);
- trace_sched_kthread_stop_ret(ret);
+ trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
@@ -280,7 +488,7 @@ int kthreadd(void *unused)
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
- set_mems_allowed(node_states[N_HIGH_MEMORY]);
+ set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
@@ -360,16 +568,12 @@ repeat:
struct kthread_work, node);
list_del_init(&work->node);
}
+ worker->current_work = work;
spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
work->func(work);
- smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
- work->done_seq = work->queue_seq;
- smp_mb(); /* mb worker-b1 paired with flush-b0 */
- if (atomic_read(&work->flushing))
- wake_up_all(&work->done);
} else if (!freezing(current))
schedule();
@@ -378,6 +582,19 @@ repeat:
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/* insert @work before @pos in @worker */
+static void insert_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work,
+ struct list_head *pos)
+{
+ lockdep_assert_held(&worker->lock);
+
+ list_add_tail(&work->node, pos);
+ work->worker = worker;
+ if (likely(worker->task))
+ wake_up_process(worker->task);
+}
+
/**
* queue_kthread_work - queue a kthread_work
* @worker: target kthread_worker
@@ -395,10 +612,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
spin_lock_irqsave(&worker->lock, flags);
if (list_empty(&work->node)) {
- list_add_tail(&work->node, &worker->work_list);
- work->queue_seq++;
- if (likely(worker->task))
- wake_up_process(worker->task);
+ insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +620,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
}
EXPORT_SYMBOL_GPL(queue_kthread_work);
+struct kthread_flush_work {
+ struct kthread_work work;
+ struct completion done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+ struct kthread_flush_work *fwork =
+ container_of(work, struct kthread_flush_work, work);
+ complete(&fwork->done);
+}
+
/**
* flush_kthread_work - flush a kthread_work
* @work: work to flush
@@ -414,39 +640,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
*/
void flush_kthread_work(struct kthread_work *work)
{
- int seq = work->queue_seq;
-
- atomic_inc(&work->flushing);
+ struct kthread_flush_work fwork = {
+ KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+ COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+ };
+ struct kthread_worker *worker;
+ bool noop = false;
- /*
- * mb flush-b0 paired with worker-b1, to make sure either
- * worker sees the above increment or we see done_seq update.
- */
- smp_mb__after_atomic_inc();
+retry:
+ worker = work->worker;
+ if (!worker)
+ return;
- /* A - B <= 0 tests whether B is in front of A regardless of overflow */
- wait_event(work->done, seq - work->done_seq <= 0);
- atomic_dec(&work->flushing);
+ spin_lock_irq(&worker->lock);
+ if (work->worker != worker) {
+ spin_unlock_irq(&worker->lock);
+ goto retry;
+ }
- /*
- * rmb flush-b1 paired with worker-b0, to make sure our caller
- * sees every change made by work->func().
- */
- smp_mb__after_atomic_dec();
-}
-EXPORT_SYMBOL_GPL(flush_kthread_work);
+ if (!list_empty(&work->node))
+ insert_kthread_work(worker, &fwork.work, work->node.next);
+ else if (worker->current_work == work)
+ insert_kthread_work(worker, &fwork.work, worker->work_list.next);
+ else
+ noop = true;
-struct kthread_flush_work {
- struct kthread_work work;
- struct completion done;
-};
+ spin_unlock_irq(&worker->lock);
-static void kthread_flush_work_fn(struct kthread_work *work)
-{
- struct kthread_flush_work *fwork =
- container_of(work, struct kthread_flush_work, work);
- complete(&fwork->done);
+ if (!noop)
+ wait_for_completion(&fwork.done);
}
+EXPORT_SYMBOL_GPL(flush_kthread_work);
/**
* flush_kthread_worker - flush all current works on a kthread_worker
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a..a02812743a7 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
}
static void __sched
-account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+account_global_scheduler_latency(struct task_struct *tsk,
+ struct latency_record *lat)
{
int firstnonnull = MAXLR + 1;
int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
break;
seq_printf(m, " %ps", (void *)bt);
}
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
}
return 0;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 00000000000..8541bfdfd23
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,28 @@
+
+obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+endif
+
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
+obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_SMP) += lglock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
+obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
new file mode 100644
index 00000000000..86ae2aebf00
--- /dev/null
+++ b/kernel/locking/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+ LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+ arch_spinlock_t *lock;
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ lock = this_cpu_ptr(lg->lock);
+ arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+ arch_spinlock_t *lock;
+
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock = this_cpu_ptr(lg->lock);
+ arch_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+ arch_spinlock_t *lock;
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ lock = per_cpu_ptr(lg->lock, cpu);
+ arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+ arch_spinlock_t *lock;
+
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock = per_cpu_ptr(lg->lock, cpu);
+ arch_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+ int i;
+
+ preempt_disable();
+ lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ for_each_possible_cpu(i) {
+ arch_spinlock_t *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ arch_spin_lock(lock);
+ }
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+ int i;
+
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ for_each_possible_cpu(i) {
+ arch_spinlock_t *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ arch_spin_unlock(lock);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c
index 8889f7dd7c4..d24e4339b46 100644
--- a/kernel/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+
static int save_trace(struct stack_trace *trace)
{
trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
return 0;
@@ -584,6 +590,7 @@ static int very_verbose(struct lock_class *class)
/*
* Is this the address of a static object:
*/
+#ifdef __KERNEL__
static int static_obj(void *obj)
{
unsigned long start = (unsigned long) &_stext,
@@ -610,6 +617,7 @@ static int static_obj(void *obj)
*/
return is_module_address(addr) || is_module_percpu_address(addr);
}
+#endif
/*
* To make lock name printouts unique, we calculate a unique
@@ -763,8 +771,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
}
raw_local_irq_restore(flags);
- printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
return NULL;
}
@@ -834,8 +841,7 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
- printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
return NULL;
}
@@ -1228,7 +1234,7 @@ static int noop_count(struct lock_list *entry, void *data)
return 0;
}
-unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
struct lock_list *uninitialized_var(target_entry);
@@ -1254,7 +1260,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
return ret;
}
-unsigned long __lockdep_count_backward_deps(struct lock_list *this)
+static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
struct lock_list *uninitialized_var(target_entry);
@@ -1930,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
for (;;) {
int distance = curr->lockdep_depth - depth + 1;
- hlock = curr->held_locks + depth-1;
+ hlock = curr->held_locks + depth - 1;
/*
* Only non-recursive-read entries get new dependencies
* added:
*/
- if (hlock->read != 2) {
+ if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
distance, trylock_loop))
return 0;
@@ -2000,7 +2006,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- struct held_lock *hlock_curr, *hlock_next;
+ struct held_lock *hlock_curr;
int i, j;
/*
@@ -2048,8 +2054,7 @@ cache_hit:
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
return 0;
}
@@ -2057,12 +2062,10 @@ cache_hit:
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
- hlock_next = hlock;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock_next->irq_context)
+ if (hlock_curr->irq_context != hlock->irq_context)
break;
- hlock_next = hlock;
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
@@ -2095,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* (If lookup_chain_cache() returns with 1 it acquires
* graph_lock for us)
*/
- if (!hlock->trylock && (hlock->check == 2) &&
+ if (!hlock->trylock && hlock->check &&
lookup_chain_cache(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
@@ -2514,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
BUG_ON(usage_bit >= LOCK_USAGE_STATES);
- if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
+ if (!hlock->check)
continue;
if (!mark_lock(curr, hlock, usage_bit))
@@ -2554,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
debug_atomic_inc(hardirqs_on_events);
}
-void trace_hardirqs_on_caller(unsigned long ip)
+__visible void trace_hardirqs_on_caller(unsigned long ip)
{
time_hardirqs_on(CALLER_ADDR0, ip);
@@ -2607,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void trace_hardirqs_off_caller(unsigned long ip)
+__visible void trace_hardirqs_off_caller(unsigned long ip)
{
struct task_struct *curr = current;
@@ -2997,6 +3000,43 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+
+static int
+print_lock_nested_lock_not_held(struct task_struct *curr,
+ struct held_lock *hlock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("==================================\n");
+ printk("[ BUG: Nested lock was not taken ]\n");
+ print_kernel_ident();
+ printk("----------------------------------\n");
+
+ printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ printk("\nbut this task is not holding:\n");
+ printk("%s\n", hlock->nest_lock->name);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int __lock_is_held(struct lockdep_map *lock);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3015,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int class_idx;
u64 chain_key;
- if (!prove_locking)
- check = 1;
-
if (unlikely(!debug_locks))
return 0;
@@ -3029,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (lock->key == &__lockdep_no_validate__)
- check = 1;
+ if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ check = 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -3098,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->holdtime_stamp = lockstat_clock();
#endif
- if (check == 2 && !mark_irqflags(curr, hlock))
+ if (check && !mark_irqflags(curr, hlock))
return 0;
/* mark it as used: */
@@ -3139,6 +3176,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
chain_key = iterate_chain_key(chain_key, id);
+ if (nest_lock && !__lock_is_held(nest_lock))
+ return print_lock_nested_lock_not_held(curr, hlock, ip);
+
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
@@ -3151,9 +3191,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
- printk("BUG: MAX_LOCK_DEPTH too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
+ curr->lockdep_depth, MAX_LOCK_DEPTH);
+
+ lockdep_print_held_locks(current);
+ debug_show_all_locks();
dump_stack();
+
return 0;
}
@@ -3164,7 +3209,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
static int
-print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
unsigned long ip)
{
if (!debug_locks_off())
@@ -3207,7 +3252,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 0;
if (curr->lockdep_depth <= 0)
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
return 1;
}
@@ -3278,7 +3323,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
lockdep_init_map(lock, name, key, 0);
@@ -3345,7 +3390,7 @@ lock_release_non_nested(struct task_struct *curr,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
if (hlock->instance == lock)
@@ -4044,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
-static void print_held_locks_bug(struct task_struct *curr)
+static void print_held_locks_bug(void)
{
if (!debug_locks_off())
return;
@@ -4053,23 +4098,23 @@ static void print_held_locks_bug(struct task_struct *curr)
printk("\n");
printk("=====================================\n");
- printk("[ BUG: lock held at task exit time! ]\n");
+ printk("[ BUG: %s/%d still has locks held! ]\n",
+ current->comm, task_pid_nr(current));
print_kernel_ident();
printk("-------------------------------------\n");
- printk("%s/%d is exiting with locks still held!\n",
- curr->comm, task_pid_nr(curr));
- lockdep_print_held_locks(curr);
-
+ lockdep_print_held_locks(current);
printk("\nstack backtrace:\n");
dump_stack();
}
-void debug_check_no_locks_held(struct task_struct *task)
+void debug_check_no_locks_held(void)
{
- if (unlikely(task->lockdep_depth > 0))
- print_held_locks_bug(task);
+ if (unlikely(current->lockdep_depth > 0))
+ print_held_locks_bug();
}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
+#ifdef __KERNEL__
void debug_show_all_locks(void)
{
struct task_struct *g, *p;
@@ -4127,6 +4172,7 @@ retry:
read_unlock(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);
+#endif
/*
* Careful: only use this function if you are sure that
@@ -4142,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);
-void lockdep_sys_exit(void)
+asmlinkage __visible void lockdep_sys_exit(void)
{
struct task_struct *curr = current;
@@ -4176,7 +4222,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
printk("-------------------------------\n");
printk("%s:%d %s!\n", file, line, s);
printk("\nother info that might help us debug this:\n\n");
- printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
+ printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ !rcu_lockdep_current_cpu_online()
+ ? "RCU used illegally from offline CPU!\n"
+ : !rcu_is_watching()
+ ? "RCU used illegally from idle CPU!\n"
+ : "",
+ rcu_scheduler_active, debug_locks);
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
@@ -4196,7 +4248,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
* So complain bitterly if someone does call rcu_read_lock(),
* rcu_read_lock_bh() and so on from extended quiescent states.
*/
- if (rcu_is_cpu_idle())
+ if (!rcu_is_watching())
printk("RCU used illegally from extended quiescent state!\n");
lockdep_print_held_locks(curr);
diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8..51c4b24b632 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
-#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_ENTRIES 32768UL
-#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_LOCKDEP_CHAINS_BITS 16
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -65,7 +65,7 @@ enum {
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define MAX_STACK_TRACE_ENTRIES 524288UL
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 91c32a0b612..ef43ac4bafb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
static void print_name(struct seq_file *m, struct lock_class *class)
{
- char str[128];
+ char str[KSYM_NAME_LEN];
const char *name = class->name;
if (!name) {
@@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
+ seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
}
if (i) {
seq_puts(m, "\n");
- seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
+ seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
seq_puts(m, "\n");
}
}
static void seq_header(struct seq_file *m)
{
- seq_printf(m, "lock_stat version 0.3\n");
+ seq_puts(m, "lock_stat version 0.4\n");
if (unlikely(!debug_locks))
seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
- seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
- seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
+ seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+ seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
"%14s %14s\n",
"class name",
"con-bounces",
@@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m)
"waittime-min",
"waittime-max",
"waittime-total",
+ "waittime-avg",
"acq-bounces",
"acquisitions",
"holdtime-min",
"holdtime-max",
- "holdtime-total");
- seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
+ "holdtime-total",
+ "holdtime-avg");
+ seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
seq_printf(m, "\n");
}
diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84..995b0cc2b84 100644
--- a/kernel/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 00000000000..0955b885d0d
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,454 @@
+/*
+ * Module-based torture test facility for locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+torture_param(int, nwriters_stress, -1,
+ "Number of write-locking stress-test threads");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3,
+ "Number of jiffies between shuffles, 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "spin_lock";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type,
+ "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
+
+static atomic_t n_lock_torture_errors;
+
+static struct task_struct *stats_task;
+static struct task_struct **writer_tasks;
+
+static int nrealwriters_stress;
+static bool lock_is_write_held;
+
+struct lock_writer_stress_stats {
+ long n_write_lock_fail;
+ long n_write_lock_acquired;
+};
+static struct lock_writer_stress_stats *lwsa;
+
+#if defined(MODULE)
+#define LOCKTORTURE_RUNNABLE_INIT 1
+#else
+#define LOCKTORTURE_RUNNABLE_INIT 0
+#endif
+int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+module_param(locktorture_runnable, int, 0444);
+MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
+
+/* Forward reference. */
+static void lock_torture_cleanup(void);
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+struct lock_torture_ops {
+ void (*init)(void);
+ int (*writelock)(void);
+ void (*write_delay)(struct torture_random_state *trsp);
+ void (*writeunlock)(void);
+ unsigned long flags;
+ const char *name;
+};
+
+static struct lock_torture_ops *cur_ops;
+
+/*
+ * Definitions for lock torture testing.
+ */
+
+static int torture_lock_busted_write_lock(void)
+{
+ return 0; /* BUGGY, do not use in real life!!! */
+}
+
+static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_us = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_lock_busted_write_unlock(void)
+{
+ /* BUGGY, do not use in real life!!! */
+}
+
+static struct lock_torture_ops lock_busted_ops = {
+ .writelock = torture_lock_busted_write_lock,
+ .write_delay = torture_lock_busted_write_delay,
+ .writeunlock = torture_lock_busted_write_unlock,
+ .name = "lock_busted"
+};
+
+static DEFINE_SPINLOCK(torture_spinlock);
+
+static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+{
+ spin_lock(&torture_spinlock);
+ return 0;
+}
+
+static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_us = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+{
+ spin_unlock(&torture_spinlock);
+}
+
+static struct lock_torture_ops spin_lock_ops = {
+ .writelock = torture_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_spin_lock_write_unlock,
+ .name = "spin_lock"
+};
+
+static int torture_spin_lock_write_lock_irq(void)
+__acquires(torture_spinlock_irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&torture_spinlock, flags);
+ cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_lock_spin_write_unlock_irq(void)
+__releases(torture_spinlock)
+{
+ spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
+}
+
+static struct lock_torture_ops spin_lock_irq_ops = {
+ .writelock = torture_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_lock_spin_write_unlock_irq,
+ .name = "spin_lock_irq"
+};
+
+/*
+ * Lock torture writer kthread. Repeatedly acquires and releases
+ * the lock, checking for duplicate acquisitions.
+ */
+static int lock_torture_writer(void *arg)
+{
+ struct lock_writer_stress_stats *lwsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_writer task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+ cur_ops->writelock();
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_write_lock_fail++;
+ lock_is_write_held = 1;
+ lwsp->n_write_lock_acquired++;
+ cur_ops->write_delay(&rand);
+ lock_is_write_held = 0;
+ cur_ops->writeunlock();
+ stutter_wait("lock_torture_writer");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_writer");
+ return 0;
+}
+
+/*
+ * Create an lock-torture-statistics message in the specified buffer.
+ */
+static void lock_torture_printk(char *page)
+{
+ bool fail = 0;
+ int i;
+ long max = 0;
+ long min = lwsa[0].n_write_lock_acquired;
+ long long sum = 0;
+
+ for (i = 0; i < nrealwriters_stress; i++) {
+ if (lwsa[i].n_write_lock_fail)
+ fail = true;
+ sum += lwsa[i].n_write_lock_acquired;
+ if (max < lwsa[i].n_write_lock_fail)
+ max = lwsa[i].n_write_lock_fail;
+ if (min > lwsa[i].n_write_lock_fail)
+ min = lwsa[i].n_write_lock_fail;
+ }
+ page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page,
+ "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ sum, max, min, max / 2 > min ? "???" : "",
+ fail, fail ? "!!!" : "");
+ if (fail)
+ atomic_inc(&n_lock_torture_errors);
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only one
+ * call to this function at a given time!!! This is normally accomplished
+ * by relying on the module system to only have one copy of the module
+ * loaded, and then by giving the lock_torture_stats kthread full control
+ * (or the init/cleanup functions when lock_torture_stats thread is not
+ * running).
+ */
+static void lock_torture_stats_print(void)
+{
+ int size = nrealwriters_stress * 200 + 8192;
+ char *buf;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+ lock_torture_printk(buf);
+ pr_alert("%s", buf);
+ kfree(buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int lock_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("lock_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ lock_torture_stats_print();
+ torture_shutdown_absorb("lock_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_stats");
+ return 0;
+}
+
+static inline void
+lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
+ const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealwriters_stress, stat_interval, verbose,
+ shuffle_interval, stutter, shutdown_secs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void lock_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup())
+ return;
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters_stress; i++)
+ torture_stop_kthread(lock_torture_writer,
+ writer_tasks[i]);
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ }
+
+ torture_stop_kthread(lock_torture_stats, stats_task);
+ lock_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&n_lock_torture_errors))
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: LOCK_HOTPLUG");
+ else
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: SUCCESS");
+}
+
+static int __init lock_torture_init(void)
+{
+ int i;
+ int firsterr = 0;
+ static struct lock_torture_ops *torture_ops[] = {
+ &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
+ };
+
+ if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("lock-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("lock-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nwriters_stress >= 0)
+ nrealwriters_stress = nwriters_stress;
+ else
+ nrealwriters_stress = 2 * num_online_cpus();
+ lock_torture_print_module_parms(cur_ops, "Start of test");
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ lock_is_write_held = 0;
+ lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
+ if (lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters_stress; i++) {
+ lwsa[i].n_write_lock_fail = 0;
+ lwsa[i].n_write_lock_acquired = 0;
+ }
+
+ /* Start up the kthreads. */
+
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ,
+ onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shuffle_interval > 0) {
+ firsterr = torture_shuffle_init(shuffle_interval);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs,
+ lock_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter);
+ if (firsterr)
+ goto unwind;
+ }
+
+ writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters_stress; i++) {
+ firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
+ writer_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(lock_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ lock_torture_cleanup();
+ return firsterr;
+}
+
+module_init(lock_torture_init);
+module_exit(lock_torture_cleanup);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 00000000000..be9ee1559fc
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,210 @@
+
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_SMP
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ */
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
+
+ for (;;) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ break;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+
+ arch_mutex_cpu_relax();
+ }
+
+ return next;
+}
+
+bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ node->locked = 0;
+ node->next = NULL;
+ node->cpu = curr;
+
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
+ return true;
+
+ prev = decode_cpu(old);
+ node->prev = prev;
+ ACCESS_ONCE(prev->next) = node;
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ while (!smp_load_acquire(&node->locked)) {
+ /*
+ * If we need to reschedule bail... so we can block.
+ */
+ if (need_resched())
+ goto unqueue;
+
+ arch_mutex_cpu_relax();
+ }
+ return true;
+
+unqueue:
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ if (prev->next == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becomming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ arch_mutex_cpu_relax();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = ACCESS_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next(lock, node, prev);
+ if (!next)
+ return false;
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ ACCESS_ONCE(next->prev) = prev;
+ ACCESS_ONCE(prev->next) = next;
+
+ return false;
+}
+
+void osq_unlock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node, *next;
+ int curr = encode_cpu(smp_processor_id());
+
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+ return;
+
+ /*
+ * Second most likely case.
+ */
+ node = this_cpu_ptr(&osq_node);
+ next = xchg(&node->next, NULL);
+ if (next) {
+ ACCESS_ONCE(next->locked) = 1;
+ return;
+ }
+
+ next = osq_wait_next(lock, node, NULL);
+ if (next)
+ ACCESS_ONCE(next->locked) = 1;
+}
+
+#endif
+
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 00000000000..74356dc0ce2
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,130 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+#include <asm/mcs_spinlock.h>
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l) \
+do { \
+ while (!(smp_load_acquire(l))) \
+ arch_mutex_cpu_relax(); \
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section has been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+
+/*
+ * Note: the smp_load_acquire/smp_store_release pair is not
+ * sufficient to form a full memory barrier across
+ * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
+ * For applications that need a full barrier across multiple cpus
+ * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
+ * used after mcs_lock.
+ */
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static inline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /*
+ * Lock acquired, don't need to set node->locked to 1. Threads
+ * only spin on its own node->locked value for lock acquisition.
+ * However, since this thread can immediately acquire the lock
+ * and does not proceed to spin on its own node->locked, this
+ * value won't be used. If a debug mode is needed to
+ * audit lock status, then set node->locked value here.
+ */
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+
+ /* Wait until the lock holder passes the lock down. */
+ arch_mcs_spin_lock_contended(&node->locked);
+}
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+
+ /* Pass lock to next waiter. */
+ arch_mcs_spin_unlock_contended(&next->locked);
+}
+
+/*
+ * Cancellable version of the MCS lock above.
+ *
+ * Intended for adaptive spinning of sleeping locks:
+ * mutex_lock()/rwsem_down_{read,write}() etc.
+ */
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
+};
+
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
+
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f4..5cf6731b98e 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -71,13 +71,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void debug_mutex_unlock(struct mutex *lock)
{
- if (unlikely(!debug_locks))
- return;
+ if (likely(debug_locks)) {
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->owner != current);
- DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- mutex_clear_owner(lock);
+ if (!lock->owner)
+ DEBUG_LOCKS_WARN_ON(!lock->owner);
+ else
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
+
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ mutex_clear_owner(lock);
+ }
+
+ /*
+ * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
+ * mutexes so that we can do it here after we've verified state.
+ */
+ atomic_set(&lock->count, 1);
}
void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cf..0799fd3e4cf 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
new file mode 100644
index 00000000000..acca2c1a3c5
--- /dev/null
+++ b/kernel/locking/mutex.c
@@ -0,0 +1,930 @@
+/*
+ * kernel/locking/mutex.c
+ *
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
+ * David Howells for suggestions and improvements.
+ *
+ * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ * from the -rt tree, where it was originally implemented for rtmutexes
+ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ * and Sven Dietrich.
+ *
+ * Also see Documentation/mutex-design.txt.
+ */
+#include <linux/mutex.h>
+#include <linux/ww_mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include "mcs_spinlock.h"
+
+/*
+ * In the DEBUG case we are using the "NULL fastpath" for mutexes,
+ * which forces all calls into the slowpath:
+ */
+#ifdef CONFIG_DEBUG_MUTEXES
+# include "mutex-debug.h"
+# include <asm-generic/mutex-null.h>
+/*
+ * Must be 0 for the debug case so we do not do the unlock outside of the
+ * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
+ * case.
+ */
+# undef __mutex_slowpath_needs_to_unlock
+# define __mutex_slowpath_needs_to_unlock() 0
+#else
+# include "mutex.h"
+# include <asm/mutex.h>
+#endif
+
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
+
+void
+__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ atomic_set(&lock->count, 1);
+ spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+ mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ osq_lock_init(&lock->osq);
+#endif
+
+ debug_mutex_init(lock, name, key);
+}
+
+EXPORT_SYMBOL(__mutex_init);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * We split the mutex lock/unlock logic into separate fastpath and
+ * slowpath functions, to reduce the register pressure on the fastpath.
+ * We also put the fastpath first in the kernel image, to make sure the
+ * branch is predicted by the CPU as default-untaken.
+ */
+__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
+
+/**
+ * mutex_lock - acquire the mutex
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex exclusively for this task. If the mutex is not
+ * available right now, it will sleep until it can get it.
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. Recursive locking is not allowed. The task
+ * may not exit without first unlocking the mutex. Also, kernel
+ * memory where the mutex resides mutex must not be freed with
+ * the mutex still locked. The mutex must first be initialized
+ * (or statically defined) before it can be locked. memset()-ing
+ * the mutex to 0 is not allowed.
+ *
+ * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging. )
+ *
+ * This function is similar to (but not equivalent to) down().
+ */
+void __sched mutex_lock(struct mutex *lock)
+{
+ might_sleep();
+ /*
+ * The locking fastpath is the 1->0 transition from
+ * 'unlocked' into 'locked' state.
+ */
+ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ mutex_set_owner(lock);
+}
+
+EXPORT_SYMBOL(mutex_lock);
+#endif
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * In order to avoid a stampede of mutex spinners from acquiring the mutex
+ * more or less simultaneously, the spinners need to acquire a MCS lock
+ * first before spinning on the owner field.
+ *
+ */
+
+/*
+ * Mutex spinning code migrated from kernel/sched/core.c
+ */
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ struct task_struct *owner;
+ int retval = 1;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(lock->owner);
+ if (owner)
+ retval = owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+#endif
+
+__visible __used noinline
+void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+
+/**
+ * mutex_unlock - release the mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a not locked mutex is not allowed.
+ *
+ * This function is similar to (but not equivalent to) up().
+ */
+void __sched mutex_unlock(struct mutex *lock)
+{
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+#ifndef CONFIG_DEBUG_MUTEXES
+ /*
+ * When debugging is enabled we must not clear the owner before time,
+ * the slow path will always be taken, and that clears the owner field
+ * after verifying that it was indeed current.
+ */
+ mutex_clear_owner(lock);
+#endif
+ __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_unlock);
+
+/**
+ * ww_mutex_unlock - release the w/w mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously with any of the
+ * ww_mutex_lock* functions (with or without an acquire context). It is
+ * forbidden to release the locks after releasing the acquire context.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a unlocked mutex is not allowed.
+ */
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+ if (lock->ctx) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+
+#ifndef CONFIG_DEBUG_MUTEXES
+ /*
+ * When debugging is enabled we must not clear the owner before time,
+ * the slow path will always be taken, and that clears the owner field
+ * after verifying that it was indeed current.
+ */
+ mutex_clear_owner(&lock->base);
+#endif
+ __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+
+static inline int __sched
+__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+
+ if (!hold_ctx)
+ return 0;
+
+ if (unlikely(ctx == hold_ctx))
+ return -EALREADY;
+
+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+ ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
+/*
+ * after acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ unsigned long flags;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ lock->ctx = ctx;
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the atomic read is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Check if lock is contended, if not there is nobody to wake up
+ */
+ if (likely(atomic_read(&lock->base.count) == 0))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, wake up everyone in this case,
+ * so they can see the new lock->ctx.
+ */
+ spin_lock_mutex(&lock->base.wait_lock, flags);
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+ spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
+/*
+ * Lock a mutex (possibly interruptible), slowpath:
+ */
+static __always_inline int __sched
+__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct task_struct *task = current;
+ struct mutex_waiter waiter;
+ unsigned long flags;
+ int ret;
+
+ preempt_disable();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ /*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that there are no
+ * pending waiters and the lock owner is currently running on a
+ * (different) CPU.
+ *
+ * The rationale is that if the lock owner is running, it is likely to
+ * release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ */
+ if (!mutex_can_spin_on_owner(lock))
+ goto slowpath;
+
+ if (!osq_lock(&lock->osq))
+ goto slowpath;
+
+ for (;;) {
+ struct task_struct *owner;
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ */
+ if (ACCESS_ONCE(ww->ctx))
+ break;
+ }
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = ACCESS_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ if ((atomic_read(&lock->count) == 1) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
+ lock_acquired(&lock->dep_map, ip);
+ if (use_ww_ctx) {
+ struct ww_mutex *ww;
+ ww = container_of(lock, struct ww_mutex, base);
+
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ }
+
+ mutex_set_owner(lock);
+ osq_unlock(&lock->osq);
+ preempt_enable();
+ return 0;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ arch_mutex_cpu_relax();
+ }
+ osq_unlock(&lock->osq);
+slowpath:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched())
+ schedule_preempt_disabled();
+#endif
+ spin_lock_mutex(&lock->wait_lock, flags);
+
+ /* once more, can we acquire the lock? */
+ if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ goto skip_wait;
+
+ debug_mutex_lock_common(lock, &waiter);
+ debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+
+ /* add waiting tasks to the end of the waitqueue (FIFO): */
+ list_add_tail(&waiter.list, &lock->wait_list);
+ waiter.task = task;
+
+ lock_contended(&lock->dep_map, ip);
+
+ for (;;) {
+ /*
+ * Lets try to take the lock again - this is needed even if
+ * we get here for the first time (shortly after failing to
+ * acquire the lock), to make sure that we get a wakeup once
+ * it's unlocked. Later on, if we sleep, this is the
+ * operation that gives us the lock. We xchg it to -1, so
+ * that when we release the lock, we properly wake up the
+ * other waiters:
+ */
+ if (MUTEX_SHOW_NO_WAITER(lock) &&
+ (atomic_xchg(&lock->count, -1) == 1))
+ break;
+
+ /*
+ * got a signal? (This code gets eliminated in the
+ * TASK_UNINTERRUPTIBLE case.)
+ */
+ if (unlikely(signal_pending_state(state, task))) {
+ ret = -EINTR;
+ goto err;
+ }
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
+ if (ret)
+ goto err;
+ }
+
+ __set_task_state(task, state);
+
+ /* didn't get the lock, go to sleep: */
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ schedule_preempt_disabled();
+ spin_lock_mutex(&lock->wait_lock, flags);
+ }
+ mutex_remove_waiter(lock, &waiter, current_thread_info());
+ /* set it to 0 if there are no waiters left: */
+ if (likely(list_empty(&lock->wait_list)))
+ atomic_set(&lock->count, 0);
+ debug_mutex_free_waiter(&waiter);
+
+skip_wait:
+ /* got the lock - cleanup and rejoice! */
+ lock_acquired(&lock->dep_map, ip);
+ mutex_set_owner(lock);
+
+ if (use_ww_ctx) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct mutex_waiter *cur;
+
+ /*
+ * This branch gets optimized out for the common case,
+ * and is only important for ww_mutex_lock.
+ */
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ ww->ctx = ww_ctx;
+
+ /*
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
+ */
+ list_for_each_entry(cur, &lock->wait_list, list) {
+ debug_mutex_wake_waiter(lock, cur);
+ wake_up_process(cur->task);
+ }
+ }
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ preempt_enable();
+ return 0;
+
+err:
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ debug_mutex_free_waiter(&waiter);
+ mutex_release(&lock->dep_map, 1, ip);
+ preempt_enable();
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched
+mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ 0, nest, _RET_IP_, NULL, 0);
+}
+
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched
+mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_KILLABLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+static inline int
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+ unsigned tmp;
+
+ if (ctx->deadlock_inject_countdown-- == 0) {
+ tmp = ctx->deadlock_inject_interval;
+ if (tmp > UINT_MAX/4)
+ tmp = UINT_MAX;
+ else
+ tmp = tmp*2 + tmp + tmp/2;
+
+ ctx->deadlock_inject_interval = tmp;
+ ctx->deadlock_inject_countdown = tmp;
+ ctx->contending_lock = lock;
+
+ ww_mutex_unlock(lock);
+
+ return -EDEADLK;
+ }
+#endif
+
+ return 0;
+}
+
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+ 0, &ctx->dep_map, _RET_IP_, ctx, 1);
+ if (!ret && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+ 0, &ctx->dep_map, _RET_IP_, ctx, 1);
+
+ if (!ret && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+
+#endif
+
+/*
+ * Release the lock, slowpath:
+ */
+static inline void
+__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
+
+ /*
+ * some architectures leave the lock unlocked in the fastpath failure
+ * case, others need to leave it locked. In the later case we have to
+ * unlock it here
+ */
+ if (__mutex_slowpath_needs_to_unlock())
+ atomic_set(&lock->count, 1);
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
+
+ if (!list_empty(&lock->wait_list)) {
+ /* get the first entry from the wait-list: */
+ struct mutex_waiter *waiter =
+ list_entry(lock->wait_list.next,
+ struct mutex_waiter, list);
+
+ debug_mutex_wake_waiter(lock, waiter);
+
+ wake_up_process(waiter->task);
+ }
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+}
+
+/*
+ * Release the lock, slowpath:
+ */
+__visible void
+__mutex_unlock_slowpath(atomic_t *lock_count)
+{
+ __mutex_unlock_common_slowpath(lock_count, 1);
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Here come the less common (and hence less performance-critical) APIs:
+ * mutex_lock_interruptible() and mutex_trylock().
+ */
+static noinline int __sched
+__mutex_lock_killable_slowpath(struct mutex *lock);
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(struct mutex *lock);
+
+/**
+ * mutex_lock_interruptible - acquire the mutex, interruptible
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex like mutex_lock(), and return 0 if the mutex has
+ * been acquired or sleep until the mutex becomes available. If a
+ * signal arrives while waiting for the lock then this function
+ * returns -EINTR.
+ *
+ * This function is similar to (but not equivalent to) down_interruptible().
+ */
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_fastpath_lock_retval(&lock->count);
+ if (likely(!ret)) {
+ mutex_set_owner(lock);
+ return 0;
+ } else
+ return __mutex_lock_interruptible_slowpath(lock);
+}
+
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_fastpath_lock_retval(&lock->count);
+ if (likely(!ret)) {
+ mutex_set_owner(lock);
+ return 0;
+ } else
+ return __mutex_lock_killable_slowpath(lock);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+__visible void __sched
+__mutex_lock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
+ NULL, _RET_IP_, NULL, 0);
+}
+
+static noinline int __sched
+__mutex_lock_killable_slowpath(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0,
+ NULL, _RET_IP_, NULL, 0);
+}
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
+ NULL, _RET_IP_, NULL, 0);
+}
+
+static noinline int __sched
+__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
+ NULL, _RET_IP_, ctx, 1);
+}
+
+static noinline int __sched
+__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
+ NULL, _RET_IP_, ctx, 1);
+}
+
+#endif
+
+/*
+ * Spinlock based trylock, we take the spinlock and check whether we
+ * can get the lock:
+ */
+static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
+ int prev;
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+
+ prev = atomic_xchg(&lock->count, -1);
+ if (likely(prev == 1)) {
+ mutex_set_owner(lock);
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
+
+ /* Set it back to 0 if there are no waiters: */
+ if (likely(list_empty(&lock->wait_list)))
+ atomic_set(&lock->count, 0);
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+
+ return prev == 1;
+}
+
+/**
+ * mutex_trylock - try to acquire the mutex, without waiting
+ * @lock: the mutex to be acquired
+ *
+ * Try to acquire the mutex atomically. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated from the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
+ if (ret)
+ mutex_set_owner(lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+
+ ret = __mutex_fastpath_lock_retval(&lock->base.count);
+
+ if (likely(!ret)) {
+ ww_mutex_set_context_fastpath(lock, ctx);
+ mutex_set_owner(&lock->base);
+ } else
+ ret = __ww_mutex_lock_slowpath(lock, ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__ww_mutex_lock);
+
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+
+ ret = __mutex_fastpath_lock_retval(&lock->base.count);
+
+ if (likely(!ret)) {
+ ww_mutex_set_context_fastpath(lock, ctx);
+ mutex_set_owner(&lock->base);
+ } else
+ ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+
+#endif
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b1..4115fbf83b1 100644
--- a/kernel/mutex.h
+++ b/kernel/locking/mutex.h
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
new file mode 100644
index 00000000000..652a8ee8efe
--- /dev/null
+++ b/kernel/locking/percpu-rwsem.c
@@ -0,0 +1,165 @@
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/percpu.h>
+#include <linux/wait.h>
+#include <linux/lockdep.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+ const char *name, struct lock_class_key *rwsem_key)
+{
+ brw->fast_read_ctr = alloc_percpu(int);
+ if (unlikely(!brw->fast_read_ctr))
+ return -ENOMEM;
+
+ /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
+ __init_rwsem(&brw->rw_sem, name, rwsem_key);
+ atomic_set(&brw->write_ctr, 0);
+ atomic_set(&brw->slow_read_ctr, 0);
+ init_waitqueue_head(&brw->write_waitq);
+ return 0;
+}
+
+void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+{
+ free_percpu(brw->fast_read_ctr);
+ brw->fast_read_ctr = NULL; /* catch use after free bugs */
+}
+
+/*
+ * This is the fast-path for down_read/up_read, it only needs to ensure
+ * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
+ * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
+ * serialize with the preempt-disabled section below.
+ *
+ * The nontrivial part is that we should guarantee acquire/release semantics
+ * in case when
+ *
+ * R_W: down_write() comes after up_read(), the writer should see all
+ * changes done by the reader
+ * or
+ * W_R: down_read() comes after up_write(), the reader should see all
+ * changes done by the writer
+ *
+ * If this helper fails the callers rely on the normal rw_semaphore and
+ * atomic_dec_and_test(), so in this case we have the necessary barriers.
+ *
+ * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
+ * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
+ * reader inside the critical section. See the comments in down_write and
+ * up_write below.
+ */
+static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+{
+ bool success = false;
+
+ preempt_disable();
+ if (likely(!atomic_read(&brw->write_ctr))) {
+ __this_cpu_add(*brw->fast_read_ctr, val);
+ success = true;
+ }
+ preempt_enable();
+
+ return success;
+}
+
+/*
+ * Like the normal down_read() this is not recursive, the writer can
+ * come after the first percpu_down_read() and create the deadlock.
+ *
+ * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
+ * percpu_up_read() does rwsem_release(). This pairs with the usage
+ * of ->rw_sem in percpu_down/up_write().
+ */
+void percpu_down_read(struct percpu_rw_semaphore *brw)
+{
+ might_sleep();
+ if (likely(update_fast_ctr(brw, +1))) {
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ return;
+ }
+
+ down_read(&brw->rw_sem);
+ atomic_inc(&brw->slow_read_ctr);
+ /* avoid up_read()->rwsem_release() */
+ __up_read(&brw->rw_sem);
+}
+
+void percpu_up_read(struct percpu_rw_semaphore *brw)
+{
+ rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+
+ if (likely(update_fast_ctr(brw, -1)))
+ return;
+
+ /* false-positive is possible but harmless */
+ if (atomic_dec_and_test(&brw->slow_read_ctr))
+ wake_up_all(&brw->write_waitq);
+}
+
+static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+{
+ unsigned int sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ sum += per_cpu(*brw->fast_read_ctr, cpu);
+ per_cpu(*brw->fast_read_ctr, cpu) = 0;
+ }
+
+ return sum;
+}
+
+/*
+ * A writer increments ->write_ctr to force the readers to switch to the
+ * slow mode, note the atomic_read() check in update_fast_ctr().
+ *
+ * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
+ * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
+ * counter it represents the number of active readers.
+ *
+ * Finally the writer takes ->rw_sem for writing and blocks the new readers,
+ * then waits until the slow counter becomes zero.
+ */
+void percpu_down_write(struct percpu_rw_semaphore *brw)
+{
+ /* tell update_fast_ctr() there is a pending writer */
+ atomic_inc(&brw->write_ctr);
+ /*
+ * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
+ * so that update_fast_ctr() can't succeed.
+ *
+ * 2. Ensures we see the result of every previous this_cpu_add() in
+ * update_fast_ctr().
+ *
+ * 3. Ensures that if any reader has exited its critical section via
+ * fast-path, it executes a full memory barrier before we return.
+ * See R_W case in the comment above update_fast_ctr().
+ */
+ synchronize_sched_expedited();
+
+ /* exclude other writers, and block the new readers completely */
+ down_write(&brw->rw_sem);
+
+ /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
+ atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+
+ /* wait for all readers to complete their percpu_up_read() */
+ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+}
+
+void percpu_up_write(struct percpu_rw_semaphore *brw)
+{
+ /* release the lock, but the readers can't use the fast-path */
+ up_write(&brw->rw_sem);
+ /*
+ * Insert the barrier before the next fast-path in down_read,
+ * see W_R case in the comment above update_fast_ctr().
+ */
+ synchronize_sched_expedited();
+ /* the last writer unblocks update_fast_ctr() */
+ atomic_dec(&brw->write_ctr);
+}
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 00000000000..fb5b8ac411a
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/qrwlock.h>
+
+/**
+ * rspin_until_writer_unlock - inc reader count & spin until writer is gone
+ * @lock : Pointer to queue rwlock structure
+ * @writer: Current queue rwlock writer status byte
+ *
+ * In interrupt context or at the head of the queue, the reader will just
+ * increment the reader count & wait until the writer releases the lock.
+ */
+static __always_inline void
+rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+{
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ arch_mutex_cpu_relax();
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ }
+}
+
+/**
+ * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+void queue_read_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /*
+ * Readers come here when they cannot get the lock without waiting
+ */
+ if (unlikely(in_interrupt())) {
+ /*
+ * Readers in interrupt context will spin until the lock is
+ * available without waiting in the queue.
+ */
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ rspin_until_writer_unlock(lock, cnts);
+ return;
+ }
+ atomic_sub(_QR_BIAS, &lock->cnts);
+
+ /*
+ * Put the reader into the wait queue
+ */
+ arch_spin_lock(&lock->lock);
+
+ /*
+ * At the head of the wait queue now, wait until the writer state
+ * goes to 0 and then try to increment the reader count and get
+ * the lock. It is possible that an incoming writer may steal the
+ * lock in the interim, so it is necessary to check the writer byte
+ * to make sure that the write lock isn't taken.
+ */
+ while (atomic_read(&lock->cnts) & _QW_WMASK)
+ arch_mutex_cpu_relax();
+
+ cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ rspin_until_writer_unlock(lock, cnts);
+
+ /*
+ * Signal the next one in queue to become queue head
+ */
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_read_lock_slowpath);
+
+/**
+ * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+void queue_write_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /* Put the writer into the wait queue */
+ arch_spin_lock(&lock->lock);
+
+ /* Try to acquire the lock directly if no reader is present */
+ if (!atomic_read(&lock->cnts) &&
+ (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ goto unlock;
+
+ /*
+ * Set the waiting flag to notify readers that a writer is pending,
+ * or wait for a previous writer to go away.
+ */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if (!(cnts & _QW_WMASK) &&
+ (atomic_cmpxchg(&lock->cnts, cnts,
+ cnts | _QW_WAITING) == cnts))
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+
+ /* When no more readers, set the locked flag */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if ((cnts == _QW_WAITING) &&
+ (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+unlock:
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_write_lock_slowpath);
diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 16502d3a71c..49b2ed3dced 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -17,13 +17,14 @@
* See rt.c in preempt-rt for proper credits and further information
*/
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
-#include <linux/plist.h>
+#include <linux/rbtree.h>
#include <linux/fs.h>
#include <linux/debug_locks.h>
@@ -56,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
@@ -153,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
memset(waiter, 0x11, sizeof(*waiter));
- plist_node_init(&waiter->list_entry, MAX_PRIO);
- plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
waiter->deadlock_task_pid = NULL;
}
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
put_pid(waiter->deadlock_task_pid);
- DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
- DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
memset(waiter, 0x22, sizeof(*waiter));
}
diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d7..ab29b6a2266 100644
--- a/kernel/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
{
return (waiter != NULL);
}
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
index 98ec4947546..1d96dd0d93c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/locking/rtmutex-tester.c
@@ -10,9 +10,11 @@
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/stat.h>
#include "rtmutex.h"
@@ -365,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
return curr - buf;
}
-static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
static struct bus_type rttest_subsys = {
.name = "rttest",
diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c
index a242e691c99..fc605941b9b 100644
--- a/kernel/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -13,6 +13,8 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
@@ -81,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
owner = *p;
} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
}
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+
+ clear_rt_mutex_waiters(lock);
+ raw_spin_unlock(&lock->wait_lock);
+ /*
+ * If a new waiter comes in between the unlock and the cmpxchg
+ * we have two situations:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * cmpxchg(p, owner, 0) == owner
+ * mark_rt_mutex_waiters(lock);
+ * acquire(lock);
+ * or:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * mark_rt_mutex_waiters(lock);
+ *
+ * cmpxchg(p, owner, 0) != owner
+ * enqueue_waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * wake waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * acquire(lock);
+ */
+ return rt_mutex_cmpxchg(lock, owner, NULL);
+}
+
#else
# define rt_mutex_cmpxchg(l,c,n) (0)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -88,12 +131,120 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
}
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ lock->owner = NULL;
+ raw_spin_unlock(&lock->wait_lock);
+ return true;
+}
#endif
+static inline int
+rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio < right->prio)
+ return 1;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 1 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return (left->task->dl.deadline < right->task->dl.deadline);
+
+ return 0;
+}
+
+static void
+rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+ struct rb_node **link = &lock->waiters.rb_node;
+ struct rb_node *parent = NULL;
+ struct rt_mutex_waiter *entry;
+ int leftmost = 1;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
+ if (rt_mutex_waiter_less(waiter, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ lock->waiters_leftmost = &waiter->tree_entry;
+
+ rb_link_node(&waiter->tree_entry, parent, link);
+ rb_insert_color(&waiter->tree_entry, &lock->waiters);
+}
+
+static void
+rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+ if (RB_EMPTY_NODE(&waiter->tree_entry))
+ return;
+
+ if (lock->waiters_leftmost == &waiter->tree_entry)
+ lock->waiters_leftmost = rb_next(&waiter->tree_entry);
+
+ rb_erase(&waiter->tree_entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+}
+
+static void
+rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+ struct rb_node **link = &task->pi_waiters.rb_node;
+ struct rb_node *parent = NULL;
+ struct rt_mutex_waiter *entry;
+ int leftmost = 1;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
+ if (rt_mutex_waiter_less(waiter, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ task->pi_waiters_leftmost = &waiter->pi_tree_entry;
+
+ rb_link_node(&waiter->pi_tree_entry, parent, link);
+ rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+}
+
+static void
+rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+ if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ return;
+
+ if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
+ task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
+
+ rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+}
+
/*
- * Calculate task priority from the waiter list priority
+ * Calculate task priority from the waiter tree priority
*
- * Return task->normal_prio when the waiter list is empty or when
+ * Return task->normal_prio when the waiter tree is empty or when
* the waiter is not allowed to do priority boosting
*/
int rt_mutex_getprio(struct task_struct *task)
@@ -101,10 +252,30 @@ int rt_mutex_getprio(struct task_struct *task)
if (likely(!task_has_pi_waiters(task)))
return task->normal_prio;
- return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+ return min(task_top_pi_waiter(task)->prio,
task->normal_prio);
}
+struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return NULL;
+
+ return task_top_pi_waiter(task)->task;
+}
+
+/*
+ * Called by sched_setscheduler() to check whether the priority change
+ * is overruled by a possible priority boosting.
+ */
+int rt_mutex_check_prio(struct task_struct *task, int newprio)
+{
+ if (!task_has_pi_waiters(task))
+ return 0;
+
+ return task_top_pi_waiter(task)->task->prio <= newprio;
+}
+
/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
@@ -114,7 +285,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
- if (task->prio != prio)
+ if (task->prio != prio || dl_prio(prio))
rt_mutex_setprio(task, prio);
}
@@ -141,14 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
*/
int max_lock_depth = 1024;
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+ return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+}
+
/*
* Adjust the priority chain. Also used for deadlock detection.
* Decreases task's usage by one - may thus free the task.
+ *
+ * @task: the task owning the mutex (owner) for which a chain walk is
+ * probably needed
+ * @deadlock_detect: do we have to carry out deadlock detection?
+ * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
+ * things for a task that has just got its priority adjusted, and
+ * is waiting on a mutex)
+ * @next_lock: the mutex on which the owner of @orig_lock was blocked before
+ * we dropped its pi_lock. Is never dereferenced, only used for
+ * comparison to detect lock chain changes.
+ * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
+ * its priority to the mutex owner (can be NULL in the case
+ * depicted above or if the top waiter is gone away and we are
+ * actually deboosting the owner)
+ * @top_task: the current top waiter
+ *
* Returns 0 or -EDEADLK.
*/
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int deadlock_detect,
struct rt_mutex *orig_lock,
+ struct rt_mutex *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
@@ -182,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
put_task_struct(task);
- return deadlock_detect ? -EDEADLK : 0;
+ return -EDEADLK;
}
retry:
/*
@@ -207,19 +400,38 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * We dropped all locks after taking a refcount on @task, so
+ * the task might have moved on in the lock chain or even left
+ * the chain completely and blocks now on an unrelated lock or
+ * on @orig_lock.
+ *
+ * We stored the lock on which @task was blocked in @next_lock,
+ * so we can detect the chain change.
+ */
+ if (next_lock != waiter->lock)
+ goto out_unlock_pi;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
*/
- if (top_waiter && (!task_has_pi_waiters(task) ||
- top_waiter != task_top_pi_waiter(task)))
- goto out_unlock_pi;
+ if (top_waiter) {
+ if (!task_has_pi_waiters(task))
+ goto out_unlock_pi;
+ /*
+ * If deadlock detection is off, we stop here if we
+ * are not the top pi waiter of the task.
+ */
+ if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
+ goto out_unlock_pi;
+ }
/*
* When deadlock detection is off then we check, if further
* priority adjustment is necessary.
*/
- if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+ if (!detect_deadlock && waiter->prio == task->prio)
goto out_unlock_pi;
lock = waiter->lock;
@@ -229,20 +441,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto retry;
}
- /* Deadlock detection */
+ /*
+ * Deadlock detection. If the lock is the same as the original
+ * lock which caused us to walk the lock chain or if the
+ * current lock is owned by the task which initiated the chain
+ * walk, we detected a deadlock.
+ */
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
- ret = deadlock_detect ? -EDEADLK : 0;
+ ret = -EDEADLK;
goto out_unlock_pi;
}
top_waiter = rt_mutex_top_waiter(lock);
/* Requeue the waiter */
- plist_del(&waiter->list_entry, &lock->wait_list);
- waiter->list_entry.prio = task->prio;
- plist_add(&waiter->list_entry, &lock->wait_list);
+ rt_mutex_dequeue(lock, waiter);
+ waiter->prio = task->prio;
+ rt_mutex_enqueue(lock, waiter);
/* Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -266,25 +483,38 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (waiter == rt_mutex_top_waiter(lock)) {
/* Boost the owner */
- plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
- waiter->pi_list_entry.prio = waiter->list_entry.prio;
- plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ rt_mutex_dequeue_pi(task, top_waiter);
+ rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
} else if (top_waiter == waiter) {
/* Deboost the owner */
- plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+ rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
- waiter->pi_list_entry.prio = waiter->list_entry.prio;
- plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
}
+ /*
+ * Check whether the task which owns the current lock is pi
+ * blocked itself. If yes we store a pointer to the lock for
+ * the lock chain change detection above. After we dropped
+ * task->pi_lock next_lock cannot be dereferenced anymore.
+ */
+ next_lock = task_blocked_on_lock(task);
+
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
top_waiter = rt_mutex_top_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
+ /*
+ * We reached the end of the lock chain. Stop right here. No
+ * point to go back just to figure that out.
+ */
+ if (!next_lock)
+ goto out_put_task;
+
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -341,7 +571,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* 3) it is top waiter
*/
if (rt_mutex_has_waiters(lock)) {
- if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
+ if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
if (!waiter || waiter != rt_mutex_top_waiter(lock))
return 0;
}
@@ -355,7 +585,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/* remove the queued waiter. */
if (waiter) {
- plist_del(&waiter->list_entry, &lock->wait_list);
+ rt_mutex_dequeue(lock, waiter);
task->pi_blocked_on = NULL;
}
@@ -365,8 +595,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock)) {
top = rt_mutex_top_waiter(lock);
- top->pi_list_entry.prio = top->list_entry.prio;
- plist_add(&top->pi_list_entry, &task->pi_waiters);
+ rt_mutex_enqueue_pi(task, top);
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
@@ -395,20 +624,32 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- unsigned long flags;
+ struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ unsigned long flags;
+
+ /*
+ * Early deadlock detection. We really don't want the task to
+ * enqueue on itself just to untangle the mess later. It's not
+ * only an optimization. We drop the locks, so another waiter
+ * can come in before the chain walk detects the deadlock. So
+ * the other will detect the deadlock and return -EDEADLOCK,
+ * which is wrong, as the other waiter is not in a deadlock
+ * situation.
+ */
+ if (owner == task)
+ return -EDEADLK;
raw_spin_lock_irqsave(&task->pi_lock, flags);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
- plist_node_init(&waiter->list_entry, task->prio);
- plist_node_init(&waiter->pi_list_entry, task->prio);
+ waiter->prio = task->prio;
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
top_waiter = rt_mutex_top_waiter(lock);
- plist_add(&waiter->list_entry, &lock->wait_list);
+ rt_mutex_enqueue(lock, waiter);
task->pi_blocked_on = waiter;
@@ -417,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (!owner)
return 0;
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
- plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
- plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ rt_mutex_dequeue_pi(owner, top_waiter);
+ rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
- else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
chain_walk = 1;
+ }
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- if (!chain_walk)
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ /*
+ * Even if full deadlock detection is on, if the owner is not
+ * blocked itself, we can avoid finding this out in the chain
+ * walk.
+ */
+ if (!chain_walk || !next_lock)
return 0;
/*
@@ -442,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
- task);
+ res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
+ next_lock, waiter, task);
raw_spin_lock(&lock->wait_lock);
@@ -453,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
- * Remove the top waiter from the current tasks waiter list and wake it up.
+ * Remove the top waiter from the current tasks pi waiter list and
+ * wake it up.
*
* Called with lock->wait_lock held.
*/
@@ -472,12 +722,25 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* boosted mode and go back to normal after releasing
* lock->wait_lock.
*/
- plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+ rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_set_owner(lock, NULL);
+ /*
+ * As we are waking up the top waiter, and the waiter stays
+ * queued on the lock until it gets the lock, this lock
+ * obviously has waiters. Just set the bit here and this has
+ * the added benefit of forcing all new tasks into the
+ * slow path making sure no task of lower priority than
+ * the top waiter can steal this lock.
+ */
+ lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ /*
+ * It's safe to dereference waiter as it cannot go away as
+ * long as we hold lock->wait_lock. The waiter task needs to
+ * acquire it in order to dequeue the waiter.
+ */
wake_up_process(waiter->task);
}
@@ -492,11 +755,11 @@ static void remove_waiter(struct rt_mutex *lock,
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex *next_lock = NULL;
unsigned long flags;
- int chain_walk = 0;
raw_spin_lock_irqsave(&current->pi_lock, flags);
- plist_del(&waiter->list_entry, &lock->wait_list);
+ rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
@@ -507,25 +770,23 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_lock_irqsave(&owner->pi_lock, flags);
- plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+ rt_mutex_dequeue_pi(owner, waiter);
if (rt_mutex_has_waiters(lock)) {
struct rt_mutex_waiter *next;
next = rt_mutex_top_waiter(lock);
- plist_add(&next->pi_list_entry, &owner->pi_waiters);
+ rt_mutex_enqueue_pi(owner, next);
}
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on)
- chain_walk = 1;
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
}
- WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
-
- if (!chain_walk)
+ if (!next_lock)
return;
/* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -533,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+ rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
raw_spin_lock(&lock->wait_lock);
}
@@ -546,21 +807,24 @@ static void remove_waiter(struct rt_mutex *lock,
void rt_mutex_adjust_pi(struct task_struct *task)
{
struct rt_mutex_waiter *waiter;
+ struct rt_mutex *next_lock;
unsigned long flags;
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || waiter->list_entry.prio == task->prio) {
+ if (!waiter || (waiter->prio == task->prio &&
+ !dl_prio(task->prio))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
-
+ next_lock = waiter->lock;
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
- rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+
+ rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
}
/**
@@ -612,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
+{
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ /*
+ * Yell lowdly and stop the task right here.
+ */
+ rt_mutex_print_deadlock(w);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+}
+
/*
* Slow path lock function:
*/
@@ -624,6 +908,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
+ RB_CLEAR_NODE(&waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&waiter.tree_entry);
raw_spin_lock(&lock->wait_lock);
@@ -649,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(TASK_RUNNING);
- if (unlikely(ret))
+ if (unlikely(ret)) {
remove_waiter(lock, &waiter);
+ rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -706,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
rt_mutex_deadlock_account_unlock(current);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
- return;
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock(&lock->wait_lock);
}
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ */
wakeup_next_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -890,7 +1215,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
- plist_head_init(&lock->wait_list);
+ lock->waiters = RB_ROOT;
+ lock->waiters_leftmost = NULL;
debug_rt_mutex_init(lock, name);
}
@@ -958,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return 1;
}
- ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
if (ret && !rt_mutex_owner(lock)) {
/*
diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421..f6a1f3c133b 100644
--- a/kernel/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
#define debug_rt_mutex_detect_deadlock(w,d) (d)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ WARN(1, "rtmutex deadlock detected\n");
+}
diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261..7431a9c86f3 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @list_entry: pi node to enqueue into the mutex waiters list
- * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
+ * @tree_entry: pi node to enqueue into the mutex waiters tree
+ * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
*/
struct rt_mutex_waiter {
- struct plist_node list_entry;
- struct plist_node pi_list_entry;
+ struct rb_node tree_entry;
+ struct rb_node pi_tree_entry;
struct task_struct *task;
struct rt_mutex *lock;
#ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -54,14 +54,15 @@ struct rt_mutex_waiter {
struct pid *deadlock_task_pid;
struct rt_mutex *deadlock_lock;
#endif
+ int prio;
};
/*
- * Various helpers to access the waiters-plist:
+ * Various helpers to access the waiters-tree:
*/
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
- return !plist_head_empty(&lock->wait_list);
+ return !RB_EMPTY_ROOT(&lock->waiters);
}
static inline struct rt_mutex_waiter *
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *w;
- w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
- list_entry);
+ w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
+ tree_entry);
BUG_ON(w->lock != lock);
return w;
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
static inline int task_has_pi_waiters(struct task_struct *p)
{
- return !plist_head_empty(&p->pi_waiters);
+ return !RB_EMPTY_ROOT(&p->pi_waiters);
}
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
- return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
- pi_list_entry);
+ return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
+ pi_tree_entry);
}
/*
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
new file mode 100644
index 00000000000..2c93571162c
--- /dev/null
+++ b/kernel/locking/rwsem-spinlock.c
@@ -0,0 +1,296 @@
+/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
+ * generic spinlock implementation
+ *
+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ */
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+};
+
+int rwsem_is_locked(struct rw_semaphore *sem)
+{
+ int ret = 1;
+ unsigned long flags;
+
+ if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
+ ret = (sem->count != 0);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rwsem_is_locked);
+
+/*
+ * initialise the semaphore
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ sem->count = 0;
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here, then:
+ * - the 'active count' _reached_ zero
+ * - the 'waiting count' is non-zero
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only woken if wakewrite is non-zero
+ */
+static inline struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
+{
+ struct rwsem_waiter *waiter;
+ struct task_struct *tsk;
+ int woken;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wakewrite)
+ /* Wake up a writer. Note that we do not grant it the
+ * lock - it will have to acquire it when it runs. */
+ wake_up_process(waiter->task);
+ goto out;
+ }
+
+ /* grant an infinite number of read locks to the front of the queue */
+ woken = 0;
+ do {
+ struct list_head *next = waiter->list.next;
+
+ list_del(&waiter->list);
+ tsk = waiter->task;
+ smp_mb();
+ waiter->task = NULL;
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ woken++;
+ if (next == &sem->wait_list)
+ break;
+ waiter = list_entry(next, struct rwsem_waiter, list);
+ } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
+
+ sem->count += woken;
+
+ out:
+ return sem;
+}
+
+/*
+ * wake a single writer
+ */
+static inline struct rw_semaphore *
+__rwsem_wake_one_writer(struct rw_semaphore *sem)
+{
+ struct rwsem_waiter *waiter;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+ wake_up_process(waiter->task);
+
+ return sem;
+}
+
+/*
+ * get a read lock on the semaphore
+ */
+void __sched __down_read(struct rw_semaphore *sem)
+{
+ struct rwsem_waiter waiter;
+ struct task_struct *tsk;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->count++;
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ goto out;
+ }
+
+ tsk = current;
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+
+ /* set up my own style of waitqueue */
+ waiter.task = tsk;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ get_task_struct(tsk);
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we don't need to touch the semaphore struct anymore */
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ /* wait to be given the lock */
+ for (;;) {
+ if (!waiter.task)
+ break;
+ schedule();
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ }
+
+ tsk->state = TASK_RUNNING;
+ out:
+ ;
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int __down_read_trylock(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ int ret = 0;
+
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->count++;
+ ret = 1;
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * get a write lock on the semaphore
+ */
+void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ struct rwsem_waiter waiter;
+ struct task_struct *tsk;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* set up my own style of waitqueue */
+ tsk = current;
+ waiter.task = tsk;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* wait for someone to release the lock */
+ for (;;) {
+ /*
+ * That is the key to support write lock stealing: allows the
+ * task already on CPU to get the lock soon rather than put
+ * itself into sleep and waiting for system woke it or someone
+ * else in the head of the wait list up.
+ */
+ if (sem->count == 0)
+ break;
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ schedule();
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+ }
+ /* got the lock */
+ sem->count = -1;
+ list_del(&waiter.list);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+void __sched __down_write(struct rw_semaphore *sem)
+{
+ __down_write_nested(sem, 0);
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int __down_write_trylock(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count == 0) {
+ /* got the lock */
+ sem->count = -1;
+ ret = 1;
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * release a read lock on the semaphore
+ */
+void __up_read(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (--sem->count == 0 && !list_empty(&sem->wait_list))
+ sem = __rwsem_wake_one_writer(sem);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * release a write lock on the semaphore
+ */
+void __up_write(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ sem->count = 0;
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, 1);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - just wake up any readers at the front of the queue
+ */
+void __downgrade_write(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ sem->count = 1;
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, 0);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
new file mode 100644
index 00000000000..a2391ac135c
--- /dev/null
+++ b/kernel/locking/rwsem-xadd.c
@@ -0,0 +1,513 @@
+/* rwsem.c: R/W semaphores: contention handling functions
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ */
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched/rt.h>
+
+#include "mcs_spinlock.h"
+
+/*
+ * Guide to the rw_semaphore's count field for common values.
+ * (32-bit case illustrated, similar for 64-bit)
+ *
+ * 0x0000000X (1) X readers active or attempting lock, no writer waiting
+ * X = #active_readers + #readers attempting to lock
+ * (X*ACTIVE_BIAS)
+ *
+ * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
+ * attempting to read lock or write lock.
+ *
+ * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
+ * X = #active readers + # readers attempting lock
+ * (X*ACTIVE_BIAS + WAITING_BIAS)
+ * (2) 1 writer attempting lock, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ * (3) 1 writer active, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
+ * (WAITING_BIAS + ACTIVE_BIAS)
+ * (2) 1 writer active or attempting lock, no waiters for lock
+ * (ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0000 (1) There are writers or readers queued but none active
+ * or in the process of attempting lock.
+ * (WAITING_BIAS)
+ * Note: writer can attempt to steal lock for this count by adding
+ * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
+ *
+ * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
+ * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
+ *
+ * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
+ * the count becomes more than 0 for successful lock acquisition,
+ * i.e. the case where there are only readers or nobody has lock.
+ * (1st and 2nd case above).
+ *
+ * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
+ * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
+ * acquisition (i.e. nobody else has lock or attempts lock). If
+ * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
+ * are only waiters but none active (5th case above), and attempt to
+ * steal the lock.
+ *
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ sem->count = RWSEM_UNLOCKED_VALUE;
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ sem->owner = NULL;
+ osq_lock_init(&sem->osq);
+#endif
+}
+
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+};
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then:
+ * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
+ * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
+ * - there must be someone on the queue
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only woken if downgrading is false
+ */
+static struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+{
+ struct rwsem_waiter *waiter;
+ struct task_struct *tsk;
+ struct list_head *next;
+ long oldcount, woken, loop, adjustment;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY)
+ /* Wake writer at the front of the queue, but do not
+ * grant it the lock yet as we want other writers
+ * to be able to steal it. Readers, on the other hand,
+ * will block as they will notice the queued writer.
+ */
+ wake_up_process(waiter->task);
+ goto out;
+ }
+
+ /* Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ adjustment = 0;
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ adjustment = RWSEM_ACTIVE_READ_BIAS;
+ try_reader_grant:
+ oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+ if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
+ /* A writer stole the lock. Undo our reader grant. */
+ if (rwsem_atomic_update(-adjustment, sem) &
+ RWSEM_ACTIVE_MASK)
+ goto out;
+ /* Last active locker left. Retry waking readers. */
+ goto try_reader_grant;
+ }
+ }
+
+ /* Grant an infinite number of read locks to the readers at the front
+ * of the queue. Note we increment the 'active part' of the count by
+ * the number of readers before waking any processes up.
+ */
+ woken = 0;
+ do {
+ woken++;
+
+ if (waiter->list.next == &sem->wait_list)
+ break;
+
+ waiter = list_entry(waiter->list.next,
+ struct rwsem_waiter, list);
+
+ } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
+
+ adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+ if (waiter->type != RWSEM_WAITING_FOR_WRITE)
+ /* hit end of list above */
+ adjustment -= RWSEM_WAITING_BIAS;
+
+ if (adjustment)
+ rwsem_atomic_add(adjustment, sem);
+
+ next = sem->wait_list.next;
+ loop = woken;
+ do {
+ waiter = list_entry(next, struct rwsem_waiter, list);
+ next = waiter->list.next;
+ tsk = waiter->task;
+ smp_mb();
+ waiter->task = NULL;
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ } while (--loop);
+
+ sem->wait_list.next = next;
+ next->prev = &sem->wait_list;
+
+ out:
+ return sem;
+}
+
+/*
+ * Wait for the read lock to be granted
+ */
+__visible
+struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
+ struct rwsem_waiter waiter;
+ struct task_struct *tsk = current;
+
+ /* set up my own style of waitqueue */
+ waiter.task = tsk;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ get_task_struct(tsk);
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list))
+ adjustment += RWSEM_WAITING_BIAS;
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ count = rwsem_atomic_update(adjustment, sem);
+
+ /* If there are no active locks, wake the front queued process(es).
+ *
+ * If there are no writers and we are first in the queue,
+ * wake our own waiter to join the existing active readers !
+ */
+ if (count == RWSEM_WAITING_BIAS ||
+ (count > RWSEM_WAITING_BIAS &&
+ adjustment != -RWSEM_ACTIVE_READ_BIAS))
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* wait to be given the lock */
+ while (true) {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!waiter.task)
+ break;
+ schedule();
+ }
+
+ tsk->state = TASK_RUNNING;
+
+ return sem;
+}
+
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ if (!(count & RWSEM_ACTIVE_MASK)) {
+ /* try acquiring the write lock */
+ if (sem->count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return true;
+ }
+ }
+ return false;
+}
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long old, count = ACCESS_ONCE(sem->count);
+
+ while (true) {
+ if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+ return false;
+
+ old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ if (old == count)
+ return true;
+
+ count = old;
+ }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool on_cpu = false;
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner)
+ on_cpu = owner->on_cpu;
+ rcu_read_unlock();
+
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath, then there is a possibility reader(s) may have the lock.
+ * To be safe, avoid spinning in these situations.
+ */
+ return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ if (sem->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * sem->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(sem, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() or when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when sem->owner is NULL.
+ */
+ return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ arch_mutex_cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+__visible
+struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+ long count;
+ bool waiting = true; /* any queued threads before us */
+ struct rwsem_waiter waiter;
+
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ return sem;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
+ if (list_empty(&sem->wait_list))
+ waiting = false;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ if (waiting) {
+ count = ACCESS_ONCE(sem->count);
+
+ /*
+ * If there were already threads queued before us and there are
+ * no active writers, the lock must be read owned; so we try to
+ * wake any read locks that were queued ahead of us.
+ */
+ if (count > RWSEM_WAITING_BIAS)
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+ } else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+
+ /* wait until we successfully acquire the lock */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (true) {
+ if (rwsem_try_write_lock(count, sem))
+ break;
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* Block until there are no active lockers. */
+ do {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ list_del(&waiter.list);
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ return sem;
+}
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+__visible
+struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* do nothing if list empty */
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return sem;
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+__visible
+struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* do nothing if list empty */
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return sem;
+}
+
+EXPORT_SYMBOL(rwsem_down_read_failed);
+EXPORT_SYMBOL(rwsem_down_write_failed);
+EXPORT_SYMBOL(rwsem_wake);
+EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c
index b152f74f02d..e2d3bc7f03b 100644
--- a/kernel/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -10,9 +10,29 @@
#include <linux/export.h>
#include <linux/rwsem.h>
-#include <asm/system.h>
#include <linux/atomic.h>
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
/*
* lock for reading
*/
@@ -49,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write);
@@ -60,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
return ret;
}
@@ -86,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_clear_owner(sem);
__up_write(sem);
}
@@ -100,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
+ rwsem_clear_owner(sem);
__downgrade_write(sem);
}
@@ -117,16 +143,44 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_nested);
+void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+{
+ might_sleep();
+ rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(_down_write_nest_lock);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write_nested);
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
#endif
diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c
index 60636a4e25c..6815171a4ff 100644
--- a/kernel/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
* down_trylock - try to acquire the semaphore, without waiting
* @sem: the semaphore to be acquired
*
- * Try to acquire the semaphore atomically. Returns 0 if the mutex has
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
* been acquired successfully or 1 if it it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up);
struct semaphore_waiter {
struct list_head list;
struct task_struct *task;
- int up;
+ bool up;
};
/*
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
- waiter.up = 0;
+ waiter.up = false;
for (;;) {
if (signal_pending_state(state, task))
goto interrupted;
- if (timeout <= 0)
+ if (unlikely(timeout <= 0))
goto timed_out;
__set_task_state(task, state);
raw_spin_unlock_irq(&sem->lock);
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem)
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
- waiter->up = 1;
+ waiter->up = true;
wake_up_process(waiter->task);
}
diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c
index 84c7d96918b..4b082b5cac9 100644
--- a/kernel/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -34,6 +34,20 @@
#else
#define raw_read_can_lock(l) read_can_lock(l)
#define raw_write_can_lock(l) write_can_lock(l)
+
+/*
+ * Some architectures can relax in favour of the CPU owning the lock.
+ */
+#ifndef arch_read_relax
+# define arch_read_relax(l) cpu_relax()
+#endif
+#ifndef arch_write_relax
+# define arch_write_relax(l) cpu_relax()
+#endif
+#ifndef arch_spin_relax
+# define arch_spin_relax(l) cpu_relax()
+#endif
+
/*
* We build the __lock_function inlines here. They are too large for
* inlining all over the place, but here is only one user per function
@@ -163,7 +177,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
-#ifndef CONFIG_INLINE_SPIN_UNLOCK
+#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
new file mode 100644
index 00000000000..0374a596cff
--- /dev/null
+++ b/kernel/locking/spinlock_debug.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ *
+ * This file contains the spinlock/rwlock implementations for
+ * DEBUG_SPINLOCK.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/nmi.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+
+void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ lock->magic = SPINLOCK_MAGIC;
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+EXPORT_SYMBOL(__raw_spin_lock_init);
+
+void __rwlock_init(rwlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
+ lock->magic = RWLOCK_MAGIC;
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+EXPORT_SYMBOL(__rwlock_init);
+
+static void spin_dump(raw_spinlock_t *lock, const char *msg)
+{
+ struct task_struct *owner = NULL;
+
+ if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
+ owner = lock->owner;
+ printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
+ msg, raw_smp_processor_id(),
+ current->comm, task_pid_nr(current));
+ printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
+ ".owner_cpu: %d\n",
+ lock, lock->magic,
+ owner ? owner->comm : "<none>",
+ owner ? task_pid_nr(owner) : -1,
+ lock->owner_cpu);
+ dump_stack();
+}
+
+static void spin_bug(raw_spinlock_t *lock, const char *msg)
+{
+ if (!debug_locks_off())
+ return;
+
+ spin_dump(lock, msg);
+}
+
+#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
+
+static inline void
+debug_spin_lock_before(raw_spinlock_t *lock)
+{
+ SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(lock->owner == current, lock, "recursion");
+ SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ lock, "cpu recursion");
+}
+
+static inline void debug_spin_lock_after(raw_spinlock_t *lock)
+{
+ lock->owner_cpu = raw_smp_processor_id();
+ lock->owner = current;
+}
+
+static inline void debug_spin_unlock(raw_spinlock_t *lock)
+{
+ SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
+ SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
+ SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+ lock, "wrong CPU");
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+static void __spin_lock_debug(raw_spinlock_t *lock)
+{
+ u64 i;
+ u64 loops = loops_per_jiffy * HZ;
+
+ for (i = 0; i < loops; i++) {
+ if (arch_spin_trylock(&lock->raw_lock))
+ return;
+ __delay(1);
+ }
+ /* lockup suspected: */
+ spin_dump(lock, "lockup suspected");
+#ifdef CONFIG_SMP
+ trigger_all_cpu_backtrace();
+#endif
+
+ /*
+ * The trylock above was causing a livelock. Give the lower level arch
+ * specific lock code a chance to acquire the lock. We have already
+ * printed a warning/backtrace at this point. The non-debug arch
+ * specific code might actually succeed in acquiring the lock. If it is
+ * not successful, the end-result is the same - there is no forward
+ * progress.
+ */
+ arch_spin_lock(&lock->raw_lock);
+}
+
+void do_raw_spin_lock(raw_spinlock_t *lock)
+{
+ debug_spin_lock_before(lock);
+ if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
+ __spin_lock_debug(lock);
+ debug_spin_lock_after(lock);
+}
+
+int do_raw_spin_trylock(raw_spinlock_t *lock)
+{
+ int ret = arch_spin_trylock(&lock->raw_lock);
+
+ if (ret)
+ debug_spin_lock_after(lock);
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_spin_unlock(raw_spinlock_t *lock)
+{
+ debug_spin_unlock(lock);
+ arch_spin_unlock(&lock->raw_lock);
+}
+
+static void rwlock_bug(rwlock_t *lock, const char *msg)
+{
+ if (!debug_locks_off())
+ return;
+
+ printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
+ msg, raw_smp_processor_id(), current->comm,
+ task_pid_nr(current), lock);
+ dump_stack();
+}
+
+#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
+
+#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
+static void __read_lock_debug(rwlock_t *lock)
+{
+ u64 i;
+ u64 loops = loops_per_jiffy * HZ;
+ int print_once = 1;
+
+ for (;;) {
+ for (i = 0; i < loops; i++) {
+ if (arch_read_trylock(&lock->raw_lock))
+ return;
+ __delay(1);
+ }
+ /* lockup suspected: */
+ if (print_once) {
+ print_once = 0;
+ printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
+ "%s/%d, %p\n",
+ raw_smp_processor_id(), current->comm,
+ current->pid, lock);
+ dump_stack();
+ }
+ }
+}
+#endif
+
+void do_raw_read_lock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ arch_read_lock(&lock->raw_lock);
+}
+
+int do_raw_read_trylock(rwlock_t *lock)
+{
+ int ret = arch_read_trylock(&lock->raw_lock);
+
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_read_unlock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ arch_read_unlock(&lock->raw_lock);
+}
+
+static inline void debug_write_lock_before(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
+ RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ lock, "cpu recursion");
+}
+
+static inline void debug_write_lock_after(rwlock_t *lock)
+{
+ lock->owner_cpu = raw_smp_processor_id();
+ lock->owner = current;
+}
+
+static inline void debug_write_unlock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
+ RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+ lock, "wrong CPU");
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+#if 0 /* This can cause lockups */
+static void __write_lock_debug(rwlock_t *lock)
+{
+ u64 i;
+ u64 loops = loops_per_jiffy * HZ;
+ int print_once = 1;
+
+ for (;;) {
+ for (i = 0; i < loops; i++) {
+ if (arch_write_trylock(&lock->raw_lock))
+ return;
+ __delay(1);
+ }
+ /* lockup suspected: */
+ if (print_once) {
+ print_once = 0;
+ printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
+ "%s/%d, %p\n",
+ raw_smp_processor_id(), current->comm,
+ current->pid, lock);
+ dump_stack();
+ }
+ }
+}
+#endif
+
+void do_raw_write_lock(rwlock_t *lock)
+{
+ debug_write_lock_before(lock);
+ arch_write_lock(&lock->raw_lock);
+ debug_write_lock_after(lock);
+}
+
+int do_raw_write_trylock(rwlock_t *lock)
+{
+ int ret = arch_write_trylock(&lock->raw_lock);
+
+ if (ret)
+ debug_write_lock_after(lock);
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_write_unlock(rwlock_t *lock)
+{
+ debug_write_unlock(lock);
+ arch_write_unlock(&lock->raw_lock);
+}
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 00000000000..915e123a430
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,12 @@
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd3..81e727cf6df 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
#include <linux/ftrace_event.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
@@ -28,6 +29,7 @@
#include <linux/vmalloc.h>
#include <linux/elf.h>
#include <linux/proc_fs.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -58,6 +60,9 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
+#include <linux/fips.h>
+#include <uapi/linux/module.h>
+#include "module-internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -102,9 +107,48 @@ static LIST_HEAD(modules);
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
+#ifdef CONFIG_MODULE_SIG
+#ifdef CONFIG_MODULE_SIG_FORCE
+static bool sig_enforce = true;
+#else
+static bool sig_enforce = false;
+
+static int param_set_bool_enable_only(const char *val,
+ const struct kernel_param *kp)
+{
+ int err;
+ bool test;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &test;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!test && sig_enforce)
+ return -EROFS;
+
+ if (test)
+ sig_enforce = true;
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_bool_enable_only = {
+ .flags = KERNEL_PARAM_FL_NOARG,
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+#define param_check_bool_enable_only param_check_bool
+
+module_param(sig_enforce, bool_enable_only, 0644);
+#endif /* !CONFIG_MODULE_SIG_FORCE */
+#endif /* CONFIG_MODULE_SIG */
/* Block module loading/unloading? */
int modules_disabled = 0;
+core_param(nomodule, modules_disabled, bint, 0);
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -135,6 +179,7 @@ struct load_info {
unsigned long symoffs, stroffs;
struct _ddebug *debug;
unsigned int num_debug;
+ bool sig_ok;
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -144,6 +189,7 @@ struct load_info {
ongoing or failed initialization etc. */
static inline int strong_try_module_get(struct module *mod)
{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
if (mod && mod->state == MODULE_STATE_COMING)
return -EBUSY;
if (try_module_get(mod))
@@ -152,9 +198,10 @@ static inline int strong_try_module_get(struct module *mod)
return -ENOENT;
}
-static inline void add_taint_module(struct module *mod, unsigned flag)
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
{
- add_taint(flag);
+ add_taint(flag, lockdep_ok);
mod->taints |= (1U << flag);
}
@@ -299,6 +346,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
#endif
};
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
return true;
}
@@ -328,26 +378,21 @@ static bool check_symbol(const struct symsearch *syms,
if (syms->licence == GPL_ONLY)
return false;
if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
- printk(KERN_WARNING "Symbol %s is being used "
- "by a non-GPL module, which will not "
- "be allowed in the future\n", fsa->name);
- printk(KERN_WARNING "Please see the file "
- "Documentation/feature-removal-schedule.txt "
- "in the kernel source tree for more details.\n");
+ pr_warn("Symbol %s is being used by a non-GPL module, "
+ "which will not be allowed in the future\n",
+ fsa->name);
}
}
#ifdef CONFIG_UNUSED_SYMBOLS
if (syms->unused && fsa->warn) {
- printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
- "however this module is using it.\n", fsa->name);
- printk(KERN_WARNING
- "This symbol will go away in the future.\n");
- printk(KERN_WARNING
- "Please evalute if this is the right api to use and if "
- "it really is, submit a report the linux kernel "
- "mailinglist together with submitting your code for "
- "inclusion.\n");
+ pr_warn("Symbol %s is marked as UNUSED, however this module is "
+ "using it.\n", fsa->name);
+ pr_warn("This symbol will go away in the future.\n");
+ pr_warn("Please evalute if this is the right api to use and if "
+ "it really is, submit a report the linux kernel "
+ "mailinglist together with submitting your code for "
+ "inclusion.\n");
}
#endif
@@ -409,16 +454,24 @@ const struct kernel_symbol *find_symbol(const char *name,
EXPORT_SYMBOL_GPL(find_symbol);
/* Search for module by name: must hold module_mutex. */
-struct module *find_module(const char *name)
+static struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
{
struct module *mod;
list_for_each_entry(mod, &modules, list) {
- if (strcmp(mod->name, name) == 0)
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
return mod;
}
return NULL;
}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, strlen(name), false);
+}
EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -428,23 +481,27 @@ static inline void __percpu *mod_percpu(struct module *mod)
return mod->percpu;
}
-static int percpu_modalloc(struct module *mod,
- unsigned long size, unsigned long align)
+static int percpu_modalloc(struct module *mod, struct load_info *info)
{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ unsigned long align = pcpusec->sh_addralign;
+
+ if (!pcpusec->sh_size)
+ return 0;
+
if (align > PAGE_SIZE) {
- printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
- mod->name, align, PAGE_SIZE);
+ pr_warn("%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
align = PAGE_SIZE;
}
- mod->percpu = __alloc_reserved_percpu(size, align);
+ mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
if (!mod->percpu) {
- printk(KERN_WARNING
- "%s: Could not allocate %lu bytes percpu data\n",
- mod->name, size);
+ pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
return -ENOMEM;
}
- mod->percpu_size = size;
+ mod->percpu_size = pcpusec->sh_size;
return 0;
}
@@ -484,6 +541,8 @@ bool is_module_percpu_address(unsigned long addr)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (!mod->percpu_size)
continue;
for_each_possible_cpu(cpu) {
@@ -507,10 +566,12 @@ static inline void __percpu *mod_percpu(struct module *mod)
{
return NULL;
}
-static inline int percpu_modalloc(struct module *mod,
- unsigned long size, unsigned long align)
+static int percpu_modalloc(struct module *mod, struct load_info *info)
{
- return -ENOMEM;
+ /* UP modules shouldn't have this section: ENOMEM isn't quite right */
+ if (info->sechdrs[info->index.pcpu].sh_size != 0)
+ return -ENOMEM;
+ return 0;
}
static inline void percpu_modfree(struct module *mod)
{
@@ -540,7 +601,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
struct module_kobject *mk, char *buffer) \
{ \
- return sprintf(buffer, "%s\n", mk->mod->field); \
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
} \
static int modinfo_##field##_exists(struct module *mod) \
{ \
@@ -579,9 +640,7 @@ static int module_unload_init(struct module *mod)
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- __this_cpu_write(mod->refptr->incs, 1);
- /* Backwards compatibility macros put refcount during init. */
- mod->waiter = current;
+ raw_cpu_write(mod->refptr->incs, 1);
return 0;
}
@@ -615,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b)
pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
if (!use) {
- printk(KERN_WARNING "%s: out of memory loading\n", a->name);
+ pr_warn("%s: out of memory loading\n", a->name);
return -ENOMEM;
}
@@ -672,7 +731,7 @@ static inline int try_force_unload(unsigned int flags)
{
int ret = (flags & O_TRUNC);
if (ret)
- add_taint(TAINT_FORCED_RMMOD);
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
return ret;
}
#else
@@ -707,16 +766,9 @@ static int __try_stop_module(void *_sref)
static int try_stop_module(struct module *mod, int flags, int *forced)
{
- if (flags & O_NONBLOCK) {
- struct stopref sref = { mod, flags, forced };
+ struct stopref sref = { mod, flags, forced };
- return stop_machine(__try_stop_module, &sref, NULL);
- } else {
- /* We don't need to stop the machine for this. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- return 0;
- }
+ return stop_machine(__try_stop_module, &sref, NULL);
}
unsigned long module_refcount(struct module *mod)
@@ -749,21 +801,6 @@ EXPORT_SYMBOL(module_refcount);
/* This exists whether we can unload or not */
static void free_module(struct module *mod);
-static void wait_for_zero_refcount(struct module *mod)
-{
- /* Since we might sleep for some time, release the mutex first */
- mutex_unlock(&module_mutex);
- for (;;) {
- pr_debug("Looking at refcount...\n");
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (module_refcount(mod) == 0)
- break;
- schedule();
- }
- current->state = TASK_RUNNING;
- mutex_lock(&module_mutex);
-}
-
SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
unsigned int, flags)
{
@@ -795,8 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
/* Doing init or already dying? */
if (mod->state != MODULE_STATE_LIVE) {
- /* FIXME: if (force), slam module count and wake up
- waiter --RR */
+ /* FIXME: if (force), slam module count damn the torpedoes */
pr_debug("%s already dying\n", mod->name);
ret = -EBUSY;
goto out;
@@ -812,18 +848,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
}
}
- /* Set this up before setting mod->state */
- mod->waiter = current;
-
/* Stop the machine so refcounts can't move and disable module. */
ret = try_stop_module(mod, flags, &forced);
if (ret != 0)
goto out;
- /* Never wait if forced. */
- if (!forced && module_refcount(mod) != 0)
- wait_for_zero_refcount(mod);
-
mutex_unlock(&module_mutex);
/* Final destruction now no one is using it. */
if (mod->exit != NULL)
@@ -903,6 +932,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
static struct module_attribute modinfo_refcnt =
__ATTR(refcnt, 0444, show_refcnt, NULL);
+void __module_get(struct module *module)
+{
+ if (module) {
+ preempt_disable();
+ __this_cpu_inc(module->refptr->incs);
+ trace_module_get(module, _RET_IP_);
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+ bool ret = true;
+
+ if (module) {
+ preempt_disable();
+
+ if (likely(module_is_live(module))) {
+ __this_cpu_inc(module->refptr->incs);
+ trace_module_get(module, _RET_IP_);
+ } else
+ ret = false;
+
+ preempt_enable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
void module_put(struct module *module)
{
if (module) {
@@ -911,9 +970,6 @@ void module_put(struct module *module)
__this_cpu_inc(module->refptr->decs);
trace_module_put(module, _RET_IP_);
- /* Maybe they're waiting for us to drop reference? */
- if (unlikely(!module_is_live(module)))
- wake_up_process(module->waiter);
preempt_enable();
}
}
@@ -954,9 +1010,11 @@ static size_t module_flags_taint(struct module *mod, char *buf)
buf[l++] = 'F';
if (mod->taints & (1 << TAINT_CRAP))
buf[l++] = 'C';
+ if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
+ buf[l++] = 'E';
/*
* TAINT_FORCED_RMMOD: could be added.
- * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
* apply to modules.
*/
return l;
@@ -977,6 +1035,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
case MODULE_STATE_GOING:
state = "going";
break;
+ default:
+ BUG();
}
return sprintf(buffer, "%s\n", state);
}
@@ -1049,9 +1109,8 @@ static int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
- printk(KERN_WARNING "%s: %s: kernel tainted.\n",
- mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
return 0;
#else
return -ENOEXEC;
@@ -1103,8 +1162,7 @@ static int check_version(Elf_Shdr *sechdrs,
goto bad_version;
}
- printk(KERN_WARNING "%s: no symbol version for %s\n",
- mod->name, symname);
+ pr_warn("%s: no symbol version for %s\n", mod->name, symname);
return 0;
bad_version:
@@ -1121,10 +1179,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
/* Since this should be found in kernel (which can't be removed),
* no locking is necessary. */
- if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
&crc, true, false))
BUG();
- return check_version(sechdrs, versindex, "module_layout", mod, crc,
+ return check_version(sechdrs, versindex,
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc,
NULL);
}
@@ -1212,8 +1271,8 @@ resolve_symbol_wait(struct module *mod,
!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
|| PTR_ERR(ksym) != -EBUSY,
30 * HZ) <= 0) {
- printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
- mod->name, owner);
+ pr_warn("%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
}
return ksym;
}
@@ -1515,21 +1574,28 @@ static void module_remove_modinfo_attrs(struct module *mod)
kfree(mod->modinfo_attrs);
}
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
static int mod_sysfs_init(struct module *mod)
{
int err;
struct kobject *kobj;
if (!module_sysfs_initialized) {
- printk(KERN_ERR "%s: module sysfs not initialized\n",
- mod->name);
+ pr_err("%s: module sysfs not initialized\n", mod->name);
err = -EINVAL;
goto out;
}
kobj = kset_find_obj(module_kset, mod->name);
if (kobj) {
- printk(KERN_ERR "%s: module is already loaded\n", mod->name);
+ pr_err("%s: module is already loaded\n", mod->name);
kobject_put(kobj);
err = -EINVAL;
goto out;
@@ -1542,7 +1608,7 @@ static int mod_sysfs_init(struct module *mod)
err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
"%s", mod->name);
if (err)
- kobject_put(&mod->mkobj.kobj);
+ mod_kobject_put(mod);
/* delay uevent until full sysfs population */
out:
@@ -1586,7 +1652,7 @@ out_unreg_param:
out_unreg_holders:
kobject_put(mod->holders_dir);
out_unreg:
- kobject_put(&mod->mkobj.kobj);
+ mod_kobject_put(mod);
out:
return err;
}
@@ -1595,7 +1661,7 @@ static void mod_sysfs_fini(struct module *mod)
{
remove_notes_attrs(mod);
remove_sect_attrs(mod);
- kobject_put(&mod->mkobj.kobj);
+ mod_kobject_put(mod);
}
#else /* !CONFIG_SYSFS */
@@ -1715,6 +1781,8 @@ void set_all_modules_text_rw(void)
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((mod->module_core) && (mod->core_text_size)) {
set_page_attributes(mod->module_core,
mod->module_core + mod->core_text_size,
@@ -1736,6 +1804,8 @@ void set_all_modules_text_ro(void)
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((mod->module_core) && (mod->core_text_size)) {
set_page_attributes(mod->module_core,
mod->module_core + mod->core_text_size,
@@ -1769,12 +1839,12 @@ static void free_module(struct module *mod)
{
trace_module_free(mod);
- /* Delete from various lists */
- mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
- mutex_unlock(&module_mutex);
mod_sysfs_teardown(mod);
+ /* We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed. */
+ mod->state = MODULE_STATE_UNFORMED;
+
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1787,6 +1857,11 @@ static void free_module(struct module *mod)
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ stop_machine(__unlink_module, mod, NULL);
+ mutex_unlock(&module_mutex);
+
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
module_free(mod, mod->module_init);
@@ -1847,8 +1922,7 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(s->name, &owner, NULL, true, false)) {
- printk(KERN_ERR
- "%s: exports duplicate symbol %s"
+ pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, s->name, module_name(owner));
return -ENOEXEC;
@@ -1873,6 +1947,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
switch (sym[i].st_shndx) {
case SHN_COMMON:
+ /* Ignore common symbols */
+ if (!strncmp(name, "__gnu_lto", 9))
+ break;
+
/* We compiled with -fno-common. These are not
supposed to happen. */
pr_debug("Common symbol: %s\n", name);
@@ -1899,8 +1977,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
- printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
- mod->name, name, PTR_ERR(ksym));
+ pr_warn("%s: Unknown symbol %s (err %li)\n",
+ mod->name, name, PTR_ERR(ksym));
ret = PTR_ERR(ksym) ?: -ENOENT;
break;
@@ -1918,26 +1996,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
return ret;
}
-int __weak apply_relocate(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- pr_err("module %s: REL relocation unsupported\n", me->name);
- return -ENOEXEC;
-}
-
-int __weak apply_relocate_add(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- pr_err("module %s: RELA relocation unsupported\n", me->name);
- return -ENOEXEC;
-}
-
static int apply_relocations(struct module *mod, const struct load_info *info)
{
unsigned int i;
@@ -2074,9 +2132,10 @@ static void set_license(struct module *mod, const char *license)
if (!license_is_gpl_compatible(license)) {
if (!test_taint(TAINT_PROPRIETARY_MODULE))
- printk(KERN_WARNING "%s: module license '%s' taints "
- "kernel.\n", mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ pr_warn("%s: module license '%s' taints kernel.\n",
+ mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
}
}
@@ -2231,7 +2290,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
Elf_Shdr *symsect = info->sechdrs + info->index.sym;
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
@@ -2243,11 +2302,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
nsrc = symsect->sh_size / sizeof(*src);
/* Compute total space required for the core symbols' strtab. */
- for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
- if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
- strtab_size += strlen(&info->strtab[src->st_name]) + 1;
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 ||
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name])+1;
ndst++;
}
+ }
/* Append room for core symbols at end of core part. */
info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
@@ -2281,15 +2342,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_symtab = dst = mod->module_core + info->symoffs;
mod->core_strtab = s = mod->module_core + info->stroffs;
src = mod->symtab;
- *dst = *src;
- *s++ = 0;
- for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
- if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
- continue;
-
- dst[ndst] = *src;
- dst[ndst++].st_name = s - mod->core_strtab;
- s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
+ for (ndst = i = 0; i < mod->num_symtab; i++) {
+ if (i == 0 ||
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_strtab;
+ s += strlcpy(s, &mod->strtab[src[i].st_name],
+ KSYM_NAME_LEN) + 1;
+ }
}
mod->core_num_syms = ndst;
}
@@ -2309,8 +2369,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
return;
#ifdef CONFIG_DYNAMIC_DEBUG
if (ddebug_add_module(debug, num, debug->modname))
- printk(KERN_ERR "dynamic debug error adding module: %s\n",
- debug->modname);
+ pr_err("dynamic debug error adding module: %s\n",
+ debug->modname);
#endif
}
@@ -2322,7 +2382,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
void * __weak module_alloc(unsigned long size)
{
- return size == 0 ? NULL : vmalloc_exec(size);
+ return vmalloc_exec(size);
}
static void *module_alloc_update_bounds(unsigned long size)
@@ -2351,10 +2411,10 @@ static void kmemleak_load_module(const struct module *mod,
kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
for (i = 1; i < info->hdr->e_shnum; i++) {
- const char *name = info->secstrings + info->sechdrs[i].sh_name;
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
- if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
+ /* Scan all writable sections that's not executable */
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+ !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+ (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
continue;
kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
@@ -2368,48 +2428,142 @@ static inline void kmemleak_load_module(const struct module *mod,
}
#endif
+#ifdef CONFIG_MODULE_SIG
+static int module_sig_check(struct load_info *info)
+{
+ int err = -ENOKEY;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const void *mod = info->hdr;
+
+ if (info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, &info->len);
+ }
+
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+
+ /* Not having a signature is only an error if we're strict. */
+ if (err < 0 && fips_enabled)
+ panic("Module verification failed with error %d in FIPS mode\n",
+ err);
+ if (err == -ENOKEY && !sig_enforce)
+ err = 0;
+
+ return err;
+}
+#else /* !CONFIG_MODULE_SIG */
+static int module_sig_check(struct load_info *info)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
+static int elf_header_check(struct load_info *info)
+{
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
+ || info->hdr->e_type != ET_REL
+ || !elf_check_arch(info->hdr)
+ || info->hdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff))
+ return -ENOEXEC;
+
+ return 0;
+}
+
/* Sets info->hdr and info->len. */
-static int copy_and_check(struct load_info *info,
- const void __user *umod, unsigned long len,
- const char __user *uargs)
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
{
int err;
- Elf_Ehdr *hdr;
- if (len < sizeof(*hdr))
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
+ err = security_kernel_module_from_file(NULL);
+ if (err)
+ return err;
+
/* Suck in entire file: we'll want most of it. */
- /* vmalloc barfs on "unusual" numbers. Check here */
- if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
+ info->hdr = vmalloc(info->len);
+ if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(hdr, umod, len) != 0) {
- err = -EFAULT;
- goto free_hdr;
+ if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ vfree(info->hdr);
+ return -EFAULT;
}
- /* Sanity checks against insmoding binaries or wrong arch,
- weird elf version */
- if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
- || hdr->e_type != ET_REL
- || !elf_check_arch(hdr)
- || hdr->e_shentsize != sizeof(Elf_Shdr)) {
- err = -ENOEXEC;
- goto free_hdr;
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_fd(int fd, struct load_info *info)
+{
+ struct fd f = fdget(fd);
+ int err;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ if (!f.file)
+ return -ENOEXEC;
+
+ err = security_kernel_module_from_file(f.file);
+ if (err)
+ goto out;
+
+ err = vfs_getattr(&f.file->f_path, &stat);
+ if (err)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ err = -EFBIG;
+ goto out;
}
- if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
- err = -ENOEXEC;
- goto free_hdr;
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ err = -EINVAL;
+ goto out;
}
- info->hdr = hdr;
- info->len = len;
- return 0;
+ info->hdr = vmalloc(stat.size);
+ if (!info->hdr) {
+ err = -ENOMEM;
+ goto out;
+ }
-free_hdr:
- vfree(hdr);
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(info->hdr);
+ err = bytes;
+ goto out;
+ }
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+ info->len = pos;
+
+out:
+ fdput(f);
return err;
}
@@ -2418,7 +2572,7 @@ static void free_copy(struct load_info *info)
vfree(info->hdr);
}
-static int rewrite_section_headers(struct load_info *info)
+static int rewrite_section_headers(struct load_info *info, int flags)
{
unsigned int i;
@@ -2429,8 +2583,7 @@ static int rewrite_section_headers(struct load_info *info)
Elf_Shdr *shdr = &info->sechdrs[i];
if (shdr->sh_type != SHT_NOBITS
&& info->len < shdr->sh_offset + shdr->sh_size) {
- printk(KERN_ERR "Module len %lu truncated\n",
- info->len);
+ pr_err("Module len %lu truncated\n", info->len);
return -ENOEXEC;
}
@@ -2446,7 +2599,10 @@ static int rewrite_section_headers(struct load_info *info)
}
/* Track but don't keep modinfo and version sections. */
- info->index.vers = find_sec(info, "__versions");
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
info->index.info = find_sec(info, ".modinfo");
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2461,7 +2617,7 @@ static int rewrite_section_headers(struct load_info *info)
* Return the temporary module pointer (we'll replace it with the final
* one when we move the module sections around).
*/
-static struct module *setup_load_info(struct load_info *info)
+static struct module *setup_load_info(struct load_info *info, int flags)
{
unsigned int i;
int err;
@@ -2472,7 +2628,7 @@ static struct module *setup_load_info(struct load_info *info)
info->secstrings = (void *)info->hdr
+ info->sechdrs[info->hdr->e_shstrndx].sh_offset;
- err = rewrite_section_headers(info);
+ err = rewrite_section_headers(info, flags);
if (err)
return ERR_PTR(err);
@@ -2489,15 +2645,14 @@ static struct module *setup_load_info(struct load_info *info)
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
- printk(KERN_WARNING "No module found in object\n");
+ pr_warn("No module found in object\n");
return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
if (info->index.sym == 0) {
- printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
- mod->name);
+ pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
return ERR_PTR(-ENOEXEC);
}
@@ -2510,30 +2665,32 @@ static struct module *setup_load_info(struct load_info *info)
return mod;
}
-static int check_modinfo(struct module *mod, struct load_info *info)
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
const char *modmagic = get_modinfo(info, "vermagic");
int err;
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "bad vermagic");
if (err)
return err;
} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
- printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
+ pr_err("%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
return -ENOEXEC;
}
if (!get_modinfo(info, "intree"))
- add_taint_module(mod, TAINT_OOT_MODULE);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP);
- printk(KERN_WARNING "%s: module is from the staging directory,"
- " the quality is unknown, you have been warned.\n",
- mod->name);
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
}
/* Set up license info based on the info section */
@@ -2542,7 +2699,7 @@ static int check_modinfo(struct module *mod, struct load_info *info)
return 0;
}
-static void find_module_sections(struct module *mod, struct load_info *info)
+static int find_module_sections(struct module *mod, struct load_info *info)
{
mod->kp = section_objs(info, "__param",
sizeof(*mod->kp), &mod->num_kp);
@@ -2572,6 +2729,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
#ifdef CONFIG_CONSTRUCTORS
mod->ctors = section_objs(info, ".ctors",
sizeof(*mod->ctors), &mod->num_ctors);
+ if (!mod->ctors)
+ mod->ctors = section_objs(info, ".init_array",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ else if (find_sec(info, ".init_array")) {
+ /*
+ * This shouldn't happen with same compiler and binutils
+ * building all parts of the module.
+ */
+ printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
+ mod->name);
+ return -EINVAL;
+ }
#endif
#ifdef CONFIG_TRACEPOINTS
@@ -2588,24 +2757,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
- /*
- * This section contains pointers to allocated objects in the trace
- * code and not scanning it leads to false positives.
- */
- kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
- mod->num_trace_events, GFP_KERNEL);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
sizeof(*mod->trace_bprintk_fmt_start),
&mod->num_trace_bprintk_fmt);
- /*
- * This section contains pointers to allocated objects in the trace
- * code and not scanning it leads to false positives.
- */
- kmemleak_scan_area(mod->trace_bprintk_fmt_start,
- sizeof(*mod->trace_bprintk_fmt_start) *
- mod->num_trace_bprintk_fmt, GFP_KERNEL);
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
/* sechdrs[0].sh_size is always zero */
@@ -2618,11 +2774,12 @@ static void find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->extable), &mod->num_exentries);
if (section_addr(info, "__obsparm"))
- printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
- mod->name);
+ pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
info->debug = section_objs(info, "__verbose",
sizeof(*info->debug), &info->num_debug);
+
+ return 0;
}
static int move_module(struct module *mod, struct load_info *info)
@@ -2644,20 +2801,23 @@ static int move_module(struct module *mod, struct load_info *info)
memset(ptr, 0, mod->core_size);
mod->module_core = ptr;
- ptr = module_alloc_update_bounds(mod->init_size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
- */
- kmemleak_ignore(ptr);
- if (!ptr && mod->init_size) {
- module_free(mod, mod->module_core);
- return -ENOMEM;
- }
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ if (mod->init_size) {
+ ptr = module_alloc_update_bounds(mod->init_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. This block doesn't need to be
+ * scanned as it contains data and code that will be freed
+ * after the module is initialized.
+ */
+ kmemleak_ignore(ptr);
+ if (!ptr) {
+ module_free(mod, mod->module_core);
+ return -ENOMEM;
+ }
+ memset(ptr, 0, mod->init_size);
+ mod->module_init = ptr;
+ } else
+ mod->module_init = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -2693,11 +2853,17 @@ static int check_module_license_and_versions(struct module *mod)
* using GPL-only symbols it needs.
*/
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
/* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
@@ -2746,18 +2912,17 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
-static struct module *layout_and_allocate(struct load_info *info)
+static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
- Elf_Shdr *pcpusec;
int err;
- mod = setup_load_info(info);
+ mod = setup_load_info(info, flags);
if (IS_ERR(mod))
return mod;
- err = check_modinfo(mod, info);
+ err = check_modinfo(mod, info, flags);
if (err)
return ERR_PTR(err);
@@ -2765,17 +2930,10 @@ static struct module *layout_and_allocate(struct load_info *info)
err = module_frob_arch_sections(info->hdr, info->sechdrs,
info->secstrings, mod);
if (err < 0)
- goto out;
+ return ERR_PTR(err);
- pcpusec = &info->sechdrs[info->index.pcpu];
- if (pcpusec->sh_size) {
- /* We have a special allocation for this section. */
- err = percpu_modalloc(mod,
- pcpusec->sh_size, pcpusec->sh_addralign);
- if (err)
- goto out;
- pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
- }
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
@@ -2786,17 +2944,12 @@ static struct module *layout_and_allocate(struct load_info *info)
/* Allocate and move to the final place */
err = move_module(mod, info);
if (err)
- goto free_percpu;
+ return ERR_PTR(err);
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
return mod;
-
-free_percpu:
- percpu_modfree(mod);
-out:
- return ERR_PTR(err);
}
/* mod is no longer valid after this! */
@@ -2830,57 +2983,280 @@ static int post_relocation(struct module *mod, const struct load_info *info)
return module_finalize(info->hdr, info->sechdrs, mod);
}
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* This is where the real work happens */
+static int do_init_module(struct module *mod)
+{
+ int ret = 0;
+
+ /*
+ * We want to find out whether @mod uses async during init. Clear
+ * PF_USED_ASYNC. async_schedule*() will set it.
+ */
+ current->flags &= ~PF_USED_ASYNC;
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ /* Init routine failed: abort. Try to protect us from
+ buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
+ }
+ if (ret > 0) {
+ pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ "follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
+ __func__, mod->name, ret, __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock. For example, a newly
+ * detected block device can trigger request_module() of the
+ * default iosched from async probing task. Once userland helper
+ * reaches here, async_synchronize_full() will wait on the async
+ * task waiting on request_module() and deadlock.
+ *
+ * This deadlock is avoided by perfomring async_synchronize_full()
+ * iff module init queued any async jobs. This isn't a full
+ * solution as it will deadlock the same if module loading from
+ * async jobs nests more than once; however, due to the various
+ * constraints, this hack seems to be the best option for now.
+ * Please refer to the following thread for details.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/1420814
+ */
+ if (current->flags & PF_USED_ASYNC)
+ async_synchronize_full();
+
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ mod->num_symtab = mod->core_num_syms;
+ mod->symtab = mod->core_symtab;
+ mod->strtab = mod->core_strtab;
+#endif
+ unset_module_init_ro_nx(mod);
+ module_free(mod, mod->module_init);
+ mod->module_init = NULL;
+ mod->init_size = 0;
+ mod->init_ro_size = 0;
+ mod->init_text_size = 0;
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ return 0;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+ struct module *old;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+again:
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name), true);
+ if (old != NULL) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto out_unlocked;
+ goto again;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ list_add_rcu(&mod->list, &modules);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+out_unlocked:
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* This relies on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
+ /* Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us. */
+ mod->state = MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+ return 0;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int unknown_module_param_cb(char *param, char *val, const char *modname)
+{
+ /* Check for magic 'dyndbg' arg */
+ int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ if (ret != 0)
+ pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ return 0;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
-static struct module *load_module(void __user *umod,
- unsigned long len,
- const char __user *uargs)
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
{
- struct load_info info = { NULL, };
struct module *mod;
long err;
+ char *after_dashes;
- pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
+ err = module_sig_check(info);
+ if (err)
+ goto free_copy;
- /* Copy in the blobs from userspace, check they are vaguely sane. */
- err = copy_and_check(&info, umod, len, uargs);
+ err = elf_header_check(info);
if (err)
- return ERR_PTR(err);
+ goto free_copy;
/* Figure out module layout, and allocate all the memory. */
- mod = layout_and_allocate(&info);
+ mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
+ goto free_module;
+
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = percpu_modalloc(mod, info);
+ if (err)
+ goto unlink_mod;
+
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
- goto free_module;
+ goto unlink_mod;
/* Now we've got everything in the final locations, we can
* find optional sections. */
- find_module_sections(mod, &info);
+ err = find_module_sections(mod, info);
+ if (err)
+ goto free_unload;
err = check_module_license_and_versions(mod);
if (err)
goto free_unload;
/* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, &info);
+ setup_modinfo(mod, info);
/* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(mod, &info);
+ err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;
- err = apply_relocations(mod, &info);
+ err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;
- err = post_relocation(mod, &info);
+ err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;
@@ -2893,61 +3269,47 @@ static struct module *load_module(void __user *umod,
goto free_arch_cleanup;
}
- /* Mark state as coming so strong_try_module_get() ignores us. */
- mod->state = MODULE_STATE_COMING;
-
- /* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. No one should access us, since
- * strong_try_module_get() will fail.
- * lockdep/oops can run asynchronous, so use the RCU list insertion
- * function to insert in a way safe to concurrent readers.
- * The mutex protects against concurrent writers.
- */
- mutex_lock(&module_mutex);
- if (find_module(mod->name)) {
- err = -EEXIST;
- goto unlock;
- }
+ dynamic_debug_setup(info->debug, info->num_debug);
- /* This has to be done once we're sure module name is unique. */
- dynamic_debug_setup(info.debug, info.num_debug);
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
- /* Find duplicate symbols */
- err = verify_export_symbols(mod);
- if (err < 0)
- goto ddebug;
-
- module_bug_finalize(info.hdr, info.sechdrs, mod);
- list_add_rcu(&mod->list, &modules);
- mutex_unlock(&module_mutex);
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
+ goto ddebug_cleanup;
/* Module is ready to execute: parsing args may do that. */
- err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
- if (err < 0)
- goto unlink;
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
+ goto bug_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
/* Link in to syfs. */
- err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
- goto unlink;
+ goto bug_cleanup;
/* Get rid of temporary copy. */
- free_copy(&info);
+ free_copy(info);
/* Done! */
trace_module_load(mod);
- return mod;
- unlink:
+ return do_init_module(mod);
+
+ bug_cleanup:
+ /* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
module_bug_cleanup(mod);
-
- ddebug:
- dynamic_debug_remove(info.debug);
- unlock:
mutex_unlock(&module_mutex);
+ ddebug_cleanup:
+ dynamic_debug_remove(info->debug);
synchronize_sched();
kfree(mod->args);
free_arch_cleanup:
@@ -2956,107 +3318,59 @@ static struct module *load_module(void __user *umod,
free_modinfo(mod);
free_unload:
module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ wake_up_all(&module_wq);
+ mutex_unlock(&module_mutex);
free_module:
- module_deallocate(mod, &info);
+ module_deallocate(mod, info);
free_copy:
- free_copy(&info);
- return ERR_PTR(err);
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
- unsigned long i;
-
- for (i = 0; i < mod->num_ctors; i++)
- mod->ctors[i]();
-#endif
+ free_copy(info);
+ return err;
}
-/* This is where the real work happens */
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
{
- struct module *mod;
- int ret = 0;
+ int err;
+ struct load_info info = { };
- /* Must have permission */
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
+ err = may_init_module();
+ if (err)
+ return err;
- /* Do all the hard work */
- mod = load_module(umod, len, uargs);
- if (IS_ERR(mod))
- return PTR_ERR(mod);
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
+ err = copy_module_from_user(umod, len, &info);
+ if (err)
+ return err;
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
+ return load_module(&info, uargs, 0);
+}
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err;
+ struct load_info info = { };
- do_mod_ctors(mod);
- /* Start the module */
- if (mod->init != NULL)
- ret = do_one_initcall(mod->init);
- if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up(&module_wq);
- return ret;
- }
- if (ret > 0) {
- printk(KERN_WARNING
-"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
-"%s: loading module anyway...\n",
- __func__, mod->name, ret,
- __func__);
- dump_stack();
- }
+ err = may_init_module();
+ if (err)
+ return err;
- /* Now it's a first class citizen! Wake up anyone waiting for it. */
- mod->state = MODULE_STATE_LIVE;
- wake_up(&module_wq);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_LIVE, mod);
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
- /* We need to finish all async code before the module init sequence is done */
- async_synchronize_full();
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC))
+ return -EINVAL;
- mutex_lock(&module_mutex);
- /* Drop initial reference. */
- module_put(mod);
- trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
-#endif
- unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
- mutex_unlock(&module_mutex);
+ err = copy_module_from_fd(fd, &info);
+ if (err)
+ return err;
- return 0;
+ return load_module(&info, uargs, flags);
}
static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3132,6 +3446,8 @@ const char *module_address_lookup(unsigned long addr,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
if (modname)
@@ -3155,6 +3471,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
const char *sym;
@@ -3179,6 +3497,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
const char *sym;
@@ -3206,6 +3526,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
@@ -3243,14 +3565,15 @@ unsigned long module_kallsyms_lookup_name(const char *name)
/* Don't lock: we're in enough trouble already. */
preempt_disable();
if ((colon = strchr(name, ':')) != NULL) {
- *colon = '\0';
- if ((mod = find_module(name)) != NULL)
+ if ((mod = find_module_all(name, colon - name, false)) != NULL)
ret = mod_find_symname(mod, colon+1);
- *colon = ':';
} else {
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((ret = mod_find_symname(mod, name)) != 0)
break;
+ }
}
preempt_enable();
return ret;
@@ -3265,6 +3588,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
int ret;
list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
for (i = 0; i < mod->num_symtab; i++) {
ret = fn(data, mod->strtab + mod->symtab[i].st_name,
mod, mod->symtab[i].st_value);
@@ -3280,6 +3605,7 @@ static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
if (mod->taints ||
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
@@ -3321,6 +3647,10 @@ static int m_show(struct seq_file *m, void *p)
struct module *mod = list_entry(p, struct module, list);
char buf[8];
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
seq_printf(m, "%s %u",
mod->name, mod->init_size + mod->core_size);
print_unload_info(m, mod);
@@ -3381,6 +3711,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (mod->num_exentries == 0)
continue;
@@ -3429,10 +3761,13 @@ struct module *__module_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_core(addr, mod)
|| within_module_init(addr, mod))
return mod;
+ }
return NULL;
}
EXPORT_SYMBOL_GPL(__module_address);
@@ -3485,12 +3820,15 @@ void print_modules(void)
printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list)
- printk(" %s%s", mod->name, module_flags(mod, buf));
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf));
+ }
preempt_enable();
if (last_unloaded_module[0])
- printk(" [last unloaded: %s]", last_unloaded_module);
- printk("\n");
+ pr_cont(" [last unloaded: %s]", last_unloaded_module);
+ pr_cont("\n");
}
#ifdef CONFIG_MODVERSIONS
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 00000000000..be5b8fac4bd
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,250 @@
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <crypto/public_key.h>
+#include <crypto/hash.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+#include "module-internal.h"
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ * - Signer's name
+ * - Key identifier
+ * - Signature data
+ * - Information block
+ */
+struct module_signature {
+ u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
+ u8 hash; /* Digest algorithm [enum hash_algo] */
+ u8 id_type; /* Key identifier type [enum pkey_id_type] */
+ u8 signer_len; /* Length of signer's name */
+ u8 key_id_len; /* Length of key identifier */
+ u8 __pad[3];
+ __be32 sig_len; /* Length of signature data */
+};
+
+/*
+ * Digest the module contents.
+ */
+static struct public_key_signature *mod_make_digest(enum hash_algo hash,
+ const void *mod,
+ unsigned long modlen)
+{
+ struct public_key_signature *pks;
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ size_t digest_size, desc_size;
+ int ret;
+
+ pr_devel("==>%s()\n", __func__);
+
+ /* Allocate the hashing algorithm we're going to need and find out how
+ * big the hash operational data will be.
+ */
+ tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+ if (IS_ERR(tfm))
+ return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ digest_size = crypto_shash_digestsize(tfm);
+
+ /* We allocate the hash operational data storage on the end of our
+ * context data and the digest output buffer on the end of that.
+ */
+ ret = -ENOMEM;
+ pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+ if (!pks)
+ goto error_no_pks;
+
+ pks->pkey_hash_algo = hash;
+ pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+ pks->digest_size = digest_size;
+
+ desc = (void *)pks + sizeof(*pks);
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto error;
+
+ ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
+ if (ret < 0)
+ goto error;
+
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = ok\n", __func__);
+ return pks;
+
+error:
+ kfree(pks);
+error_no_pks:
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Extract an MPI array from the signature data. This represents the actual
+ * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
+ * size of the MPI in bytes.
+ *
+ * RSA signatures only have one MPI, so currently we only read one.
+ */
+static int mod_extract_mpi_array(struct public_key_signature *pks,
+ const void *data, size_t len)
+{
+ size_t nbytes;
+ MPI mpi;
+
+ if (len < 3)
+ return -EBADMSG;
+ nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
+ data += 2;
+ len -= 2;
+ if (len != nbytes)
+ return -EBADMSG;
+
+ mpi = mpi_read_raw_data(data, nbytes);
+ if (!mpi)
+ return -ENOMEM;
+ pks->mpi[0] = mpi;
+ pks->nr_mpi = 1;
+ return 0;
+}
+
+/*
+ * Request an asymmetric key.
+ */
+static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
+ const u8 *key_id, size_t key_id_len)
+{
+ key_ref_t key;
+ size_t i;
+ char *id, *q;
+
+ pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
+
+ /* Construct an identifier. */
+ id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
+ if (!id)
+ return ERR_PTR(-ENOKEY);
+
+ memcpy(id, signer, signer_len);
+
+ q = id + signer_len;
+ *q++ = ':';
+ *q++ = ' ';
+ for (i = 0; i < key_id_len; i++) {
+ *q++ = hex_asc[*key_id >> 4];
+ *q++ = hex_asc[*key_id++ & 0x0f];
+ }
+
+ *q = 0;
+
+ pr_debug("Look up: \"%s\"\n", id);
+
+ key = keyring_search(make_key_ref(system_trusted_keyring, 1),
+ &key_type_asymmetric, id);
+ if (IS_ERR(key))
+ pr_warn("Request for unknown module key '%s' err %ld\n",
+ id, PTR_ERR(key));
+ kfree(id);
+
+ if (IS_ERR(key)) {
+ switch (PTR_ERR(key)) {
+ /* Hide some search errors */
+ case -EACCES:
+ case -ENOTDIR:
+ case -EAGAIN:
+ return ERR_PTR(-ENOKEY);
+ default:
+ return ERR_CAST(key);
+ }
+ }
+
+ pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
+ return key_ref_to_ptr(key);
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, unsigned long *_modlen)
+{
+ struct public_key_signature *pks;
+ struct module_signature ms;
+ struct key *key;
+ const void *sig;
+ size_t modlen = *_modlen, sig_len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+ modlen -= sizeof(ms);
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ if (sig_len >= modlen)
+ return -EBADMSG;
+ modlen -= sig_len;
+ if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
+ return -EBADMSG;
+ modlen -= (size_t)ms.signer_len + ms.key_id_len;
+
+ *_modlen = modlen;
+ sig = mod + modlen;
+
+ /* For the moment, only support RSA and X.509 identifiers */
+ if (ms.algo != PKEY_ALGO_RSA ||
+ ms.id_type != PKEY_ID_X509)
+ return -ENOPKG;
+
+ if (ms.hash >= PKEY_HASH__LAST ||
+ !hash_algo_name[ms.hash])
+ return -ENOPKG;
+
+ key = request_asymmetric_key(sig, ms.signer_len,
+ sig + ms.signer_len, ms.key_id_len);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ pks = mod_make_digest(ms.hash, mod, modlen);
+ if (IS_ERR(pks)) {
+ ret = PTR_ERR(pks);
+ goto error_put_key;
+ }
+
+ ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
+ sig_len);
+ if (ret < 0)
+ goto error_free_pks;
+
+ ret = verify_signature(key, pks);
+ pr_devel("verify_signature() = %d\n", ret);
+
+error_free_pks:
+ mpi_free(pks->rsa.s);
+ kfree(pks);
+error_put_key:
+ key_put(key);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ret;
+}
diff --git a/kernel/mutex.c b/kernel/mutex.c
deleted file mode 100644
index 89096dd8786..00000000000
--- a/kernel/mutex.c
+++ /dev/null
@@ -1,500 +0,0 @@
-/*
- * kernel/mutex.c
- *
- * Mutexes: blocking mutual exclusion locks
- *
- * Started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
- * David Howells for suggestions and improvements.
- *
- * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
- * from the -rt tree, where it was originally implemented for rtmutexes
- * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
- * and Sven Dietrich.
- *
- * Also see Documentation/mutex-design.txt.
- */
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/export.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/debug_locks.h>
-
-/*
- * In the DEBUG case we are using the "NULL fastpath" for mutexes,
- * which forces all calls into the slowpath:
- */
-#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
-# include <asm-generic/mutex-null.h>
-#else
-# include "mutex.h"
-# include <asm/mutex.h>
-#endif
-
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
-{
- atomic_set(&lock->count, 1);
- spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
- mutex_clear_owner(lock);
-
- debug_mutex_init(lock, name, key);
-}
-
-EXPORT_SYMBOL(__mutex_init);
-
-#ifndef CONFIG_DEBUG_LOCK_ALLOC
-/*
- * We split the mutex lock/unlock logic into separate fastpath and
- * slowpath functions, to reduce the register pressure on the fastpath.
- * We also put the fastpath first in the kernel image, to make sure the
- * branch is predicted by the CPU as default-untaken.
- */
-static __used noinline void __sched
-__mutex_lock_slowpath(atomic_t *lock_count);
-
-/**
- * mutex_lock - acquire the mutex
- * @lock: the mutex to be acquired
- *
- * Lock the mutex exclusively for this task. If the mutex is not
- * available right now, it will sleep until it can get it.
- *
- * The mutex must later on be released by the same task that
- * acquired it. Recursive locking is not allowed. The task
- * may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
- * the mutex still locked. The mutex must first be initialized
- * (or statically defined) before it can be locked. memset()-ing
- * the mutex to 0 is not allowed.
- *
- * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
- * checks that will enforce the restrictions and will also do
- * deadlock debugging. )
- *
- * This function is similar to (but not equivalent to) down().
- */
-void __sched mutex_lock(struct mutex *lock)
-{
- might_sleep();
- /*
- * The locking fastpath is the 1->0 transition from
- * 'unlocked' into 'locked' state.
- */
- __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
- mutex_set_owner(lock);
-}
-
-EXPORT_SYMBOL(mutex_lock);
-#endif
-
-static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
-
-/**
- * mutex_unlock - release the mutex
- * @lock: the mutex to be released
- *
- * Unlock a mutex that has been locked by this task previously.
- *
- * This function must not be used in interrupt context. Unlocking
- * of a not locked mutex is not allowed.
- *
- * This function is similar to (but not equivalent to) up().
- */
-void __sched mutex_unlock(struct mutex *lock)
-{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
-#ifndef CONFIG_DEBUG_MUTEXES
- /*
- * When debugging is enabled we must not clear the owner before time,
- * the slow path will always be taken, and that clears the owner field
- * after verifying that it was indeed current.
- */
- mutex_clear_owner(lock);
-#endif
- __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
-}
-
-EXPORT_SYMBOL(mutex_unlock);
-
-/*
- * Lock a mutex (possibly interruptible), slowpath:
- */
-static inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
- struct lockdep_map *nest_lock, unsigned long ip)
-{
- struct task_struct *task = current;
- struct mutex_waiter waiter;
- unsigned long flags;
-
- preempt_disable();
- mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
-
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- /*
- * Optimistic spinning.
- *
- * We try to spin for acquisition when we find that there are no
- * pending waiters and the lock owner is currently running on a
- * (different) CPU.
- *
- * The rationale is that if the lock owner is running, it is likely to
- * release the lock soon.
- *
- * Since this needs the lock owner, and this mutex implementation
- * doesn't track the owner atomically in the lock field, we need to
- * track it non-atomically.
- *
- * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
- * to serialize everything.
- */
-
- for (;;) {
- struct task_struct *owner;
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
- break;
-
- if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
- lock_acquired(&lock->dep_map, ip);
- mutex_set_owner(lock);
- preempt_enable();
- return 0;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- arch_mutex_cpu_relax();
- }
-#endif
- spin_lock_mutex(&lock->wait_lock, flags);
-
- debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
-
- /* add waiting tasks to the end of the waitqueue (FIFO): */
- list_add_tail(&waiter.list, &lock->wait_list);
- waiter.task = task;
-
- if (atomic_xchg(&lock->count, -1) == 1)
- goto done;
-
- lock_contended(&lock->dep_map, ip);
-
- for (;;) {
- /*
- * Lets try to take the lock again - this is needed even if
- * we get here for the first time (shortly after failing to
- * acquire the lock), to make sure that we get a wakeup once
- * it's unlocked. Later on, if we sleep, this is the
- * operation that gives us the lock. We xchg it to -1, so
- * that when we release the lock, we properly wake up the
- * other waiters:
- */
- if (atomic_xchg(&lock->count, -1) == 1)
- break;
-
- /*
- * got a signal? (This code gets eliminated in the
- * TASK_UNINTERRUPTIBLE case.)
- */
- if (unlikely(signal_pending_state(state, task))) {
- mutex_remove_waiter(lock, &waiter,
- task_thread_info(task));
- mutex_release(&lock->dep_map, 1, ip);
- spin_unlock_mutex(&lock->wait_lock, flags);
-
- debug_mutex_free_waiter(&waiter);
- preempt_enable();
- return -EINTR;
- }
- __set_task_state(task, state);
-
- /* didn't get the lock, go to sleep: */
- spin_unlock_mutex(&lock->wait_lock, flags);
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- spin_lock_mutex(&lock->wait_lock, flags);
- }
-
-done:
- lock_acquired(&lock->dep_map, ip);
- /* got the lock - rejoice! */
- mutex_remove_waiter(lock, &waiter, current_thread_info());
- mutex_set_owner(lock);
-
- /* set it to 0 if there are no waiters left: */
- if (likely(list_empty(&lock->wait_list)))
- atomic_set(&lock->count, 0);
-
- spin_unlock_mutex(&lock->wait_lock, flags);
-
- debug_mutex_free_waiter(&waiter);
- preempt_enable();
-
- return 0;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void __sched
-mutex_lock_nested(struct mutex *lock, unsigned int subclass)
-{
- might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
-}
-
-EXPORT_SYMBOL_GPL(mutex_lock_nested);
-
-void __sched
-_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
-{
- might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
-}
-
-EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
-
-int __sched
-mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
-{
- might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
-}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
-
-int __sched
-mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
-{
- might_sleep();
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, NULL, _RET_IP_);
-}
-
-EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
-#endif
-
-/*
- * Release the lock, slowpath:
- */
-static inline void
-__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
- unsigned long flags;
-
- spin_lock_mutex(&lock->wait_lock, flags);
- mutex_release(&lock->dep_map, nested, _RET_IP_);
- debug_mutex_unlock(lock);
-
- /*
- * some architectures leave the lock unlocked in the fastpath failure
- * case, others need to leave it locked. In the later case we have to
- * unlock it here
- */
- if (__mutex_slowpath_needs_to_unlock())
- atomic_set(&lock->count, 1);
-
- if (!list_empty(&lock->wait_list)) {
- /* get the first entry from the wait-list: */
- struct mutex_waiter *waiter =
- list_entry(lock->wait_list.next,
- struct mutex_waiter, list);
-
- debug_mutex_wake_waiter(lock, waiter);
-
- wake_up_process(waiter->task);
- }
-
- spin_unlock_mutex(&lock->wait_lock, flags);
-}
-
-/*
- * Release the lock, slowpath:
- */
-static __used noinline void
-__mutex_unlock_slowpath(atomic_t *lock_count)
-{
- __mutex_unlock_common_slowpath(lock_count, 1);
-}
-
-#ifndef CONFIG_DEBUG_LOCK_ALLOC
-/*
- * Here come the less common (and hence less performance-critical) APIs:
- * mutex_lock_interruptible() and mutex_trylock().
- */
-static noinline int __sched
-__mutex_lock_killable_slowpath(atomic_t *lock_count);
-
-static noinline int __sched
-__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
-
-/**
- * mutex_lock_interruptible - acquire the mutex, interruptible
- * @lock: the mutex to be acquired
- *
- * Lock the mutex like mutex_lock(), and return 0 if the mutex has
- * been acquired or sleep until the mutex becomes available. If a
- * signal arrives while waiting for the lock then this function
- * returns -EINTR.
- *
- * This function is similar to (but not equivalent to) down_interruptible().
- */
-int __sched mutex_lock_interruptible(struct mutex *lock)
-{
- int ret;
-
- might_sleep();
- ret = __mutex_fastpath_lock_retval
- (&lock->count, __mutex_lock_interruptible_slowpath);
- if (!ret)
- mutex_set_owner(lock);
-
- return ret;
-}
-
-EXPORT_SYMBOL(mutex_lock_interruptible);
-
-int __sched mutex_lock_killable(struct mutex *lock)
-{
- int ret;
-
- might_sleep();
- ret = __mutex_fastpath_lock_retval
- (&lock->count, __mutex_lock_killable_slowpath);
- if (!ret)
- mutex_set_owner(lock);
-
- return ret;
-}
-EXPORT_SYMBOL(mutex_lock_killable);
-
-static __used noinline void __sched
-__mutex_lock_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
-
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
-}
-
-static noinline int __sched
-__mutex_lock_killable_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
-
- return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
-}
-
-static noinline int __sched
-__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
-
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
-}
-#endif
-
-/*
- * Spinlock based trylock, we take the spinlock and check whether we
- * can get the lock:
- */
-static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
- unsigned long flags;
- int prev;
-
- spin_lock_mutex(&lock->wait_lock, flags);
-
- prev = atomic_xchg(&lock->count, -1);
- if (likely(prev == 1)) {
- mutex_set_owner(lock);
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- }
-
- /* Set it back to 0 if there are no waiters: */
- if (likely(list_empty(&lock->wait_list)))
- atomic_set(&lock->count, 0);
-
- spin_unlock_mutex(&lock->wait_lock, flags);
-
- return prev == 1;
-}
-
-/**
- * mutex_trylock - try to acquire the mutex, without waiting
- * @lock: the mutex to be acquired
- *
- * Try to acquire the mutex atomically. Returns 1 if the mutex
- * has been acquired successfully, and 0 on contention.
- *
- * NOTE: this function follows the spin_trylock() convention, so
- * it is negated from the down_trylock() return values! Be careful
- * about this when converting semaphore users to mutexes.
- *
- * This function must not be used in interrupt context. The
- * mutex must be released by the same task that acquired it.
- */
-int __sched mutex_trylock(struct mutex *lock)
-{
- int ret;
-
- ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
- if (ret)
- mutex_set_owner(lock);
-
- return ret;
-}
-EXPORT_SYMBOL(mutex_trylock);
-
-/**
- * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
- * @cnt: the atomic which we are to dec
- * @lock: the mutex to return holding if we dec to 0
- *
- * return true and hold lock if we dec to 0, return false otherwise
- */
-int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
-{
- /* dec if we can't possibly hit 0 */
- if (atomic_add_unless(cnt, -1, 1))
- return 0;
- /* we might hit 0, so take the lock */
- mutex_lock(lock);
- if (!atomic_dec_and_test(cnt)) {
- /* when we actually did the dec, we didn't hit 0 */
- mutex_unlock(lock);
- return 0;
- }
- /* we hit 0, and we hold the lock */
- return 1;
-}
-EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7..4803da6eab6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+static int notifier_call_chain(struct notifier_block **nl,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
}
return ret;
}
+NOKPROBE_SYMBOL(notifier_call_chain);
/*
* Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
@@ -309,7 +312,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
* racy then it does not matter what the result of the test
* is, we re-check the list after having taken the lock anyway:
*/
- if (rcu_dereference_raw(nh->head)) {
+ if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
nr_calls);
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
static ATOMIC_NOTIFIER_HEAD(die_chain);
-int notrace __kprobes notify_die(enum die_val val, const char *str,
+int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
};
return atomic_notifier_call_chain(&die_chain, val, &args);
}
+NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b576f7f14bc..8e7811086b8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,22 +22,22 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
- .uts_ns = &init_uts_ns,
+ .count = ATOMIC_INIT(1),
+ .uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
- .ipc_ns = &init_ipc_ns,
+ .ipc_ns = &init_ipc_ns,
#endif
- .mnt_ns = NULL,
- .pid_ns = &init_pid_ns,
+ .mnt_ns = NULL,
+ .pid_ns_for_children = &init_pid_ns,
#ifdef CONFIG_NET
- .net_ns = &init_net,
+ .net_ns = &init_net,
#endif
};
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
* leave it to the caller to do proper locking and attach it to task.
*/
static struct nsproxy *create_new_namespaces(unsigned long flags,
- struct task_struct *tsk, struct fs_struct *new_fs)
+ struct task_struct *tsk, struct user_namespace *user_ns,
+ struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
int err;
@@ -66,31 +67,32 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
if (!new_nsp)
return ERR_PTR(-ENOMEM);
- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
}
- new_nsp->uts_ns = copy_utsname(flags, tsk);
+ new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
- new_nsp->ipc_ns = copy_ipcs(flags, tsk);
+ new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
}
- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
- if (IS_ERR(new_nsp->pid_ns)) {
- err = PTR_ERR(new_nsp->pid_ns);
+ new_nsp->pid_ns_for_children =
+ copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
+ if (IS_ERR(new_nsp->pid_ns_for_children)) {
+ err = PTR_ERR(new_nsp->pid_ns_for_children);
goto out_pid;
}
- new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
+ new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
@@ -99,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;
out_net:
- if (new_nsp->pid_ns)
- put_pid_ns(new_nsp->pid_ns);
+ if (new_nsp->pid_ns_for_children)
+ put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
if (new_nsp->ipc_ns)
put_ipc_ns(new_nsp->ipc_ns);
@@ -122,23 +124,18 @@ out_ns:
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
+ struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
- int err = 0;
- if (!old_ns)
+ if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWPID | CLONE_NEWNET)))) {
+ get_nsproxy(old_ns);
return 0;
-
- get_nsproxy(old_ns);
-
- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET)))
- return 0;
-
- if (!capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out;
}
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
/*
* CLONE_NEWIPC must detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -146,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
* means share undolist with parent, so we must forbid using
* it along with CLONE_NEWIPC.
*/
- if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
- err = -EINVAL;
- goto out;
- }
+ if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
+ (CLONE_NEWIPC | CLONE_SYSVSEM))
+ return -EINVAL;
- new_ns = create_new_namespaces(flags, tsk, tsk->fs);
- if (IS_ERR(new_ns)) {
- err = PTR_ERR(new_ns);
- goto out;
- }
+ new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
+ if (IS_ERR(new_ns))
+ return PTR_ERR(new_ns);
tsk->nsproxy = new_ns;
-
-out:
- put_nsproxy(old_ns);
- return err;
+ return 0;
}
void free_nsproxy(struct nsproxy *ns)
@@ -172,8 +163,8 @@ void free_nsproxy(struct nsproxy *ns)
put_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
put_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns)
- put_pid_ns(ns->pid_ns);
+ if (ns->pid_ns_for_children)
+ put_pid_ns(ns->pid_ns_for_children);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
@@ -183,19 +174,21 @@ void free_nsproxy(struct nsproxy *ns)
* On success, returns the new nsproxy.
*/
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
- struct nsproxy **new_nsp, struct fs_struct *new_fs)
+ struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
+ struct user_namespace *user_ns;
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET)))
+ CLONE_NEWNET | CLONE_NEWPID)))
return 0;
- if (!capable(CAP_SYS_ADMIN))
+ user_ns = new_cred ? new_cred->user_ns : current_user_ns();
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = create_new_namespaces(unshare_flags, current,
- new_fs ? new_fs : current->fs);
+ *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
goto out;
@@ -237,24 +230,21 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_inode *ei;
+ struct proc_ns *ei;
struct file *file;
int err;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
file = proc_ns_fget(fd);
if (IS_ERR(file))
return PTR_ERR(file);
err = -EINVAL;
- ei = PROC_I(file->f_dentry->d_inode);
+ ei = get_proc_ns(file_inode(file));
ops = ei->ns_ops;
if (nstype && (ops->type != nstype))
goto out;
- new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new_nsproxy)) {
err = PTR_ERR(new_nsproxy);
goto out;
diff --git a/kernel/padata.c b/kernel/padata.c
index b4525993151..161402f0b51 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,6 +1,8 @@
/*
* padata.c - generic interface to process data streams in parallel
*
+ * See Documentation/padata.txt for an api documentation.
+ *
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
*
@@ -29,7 +31,6 @@
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
-#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
#define MAX_OBJ_NUM 1000
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +44,18 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
return target_cpu;
}
-static int padata_cpu_hash(struct padata_priv *padata)
+static int padata_cpu_hash(struct parallel_data *pd)
{
+ unsigned int seq_nr;
int cpu_index;
- struct parallel_data *pd;
-
- pd = padata->pd;
/*
* Hash the sequence numbers to the cpus by taking
* seq_nr mod. number of cpus in use.
*/
- cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
+
+ seq_nr = atomic_inc_return(&pd->seq_nr);
+ cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
return padata_index_to_cpu(pd, cpu_index);
}
@@ -111,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst,
rcu_read_lock_bh();
- pd = rcu_dereference(pinst->pd);
+ pd = rcu_dereference_bh(pinst->pd);
err = -EINVAL;
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
@@ -132,12 +133,7 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->pd = pd;
padata->cb_cpu = cb_cpu;
- if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
- atomic_set(&pd->seq_nr, -1);
-
- padata->seq_nr = atomic_inc_return(&pd->seq_nr);
-
- target_cpu = padata_cpu_hash(padata);
+ target_cpu = padata_cpu_hash(pd);
queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
@@ -173,8 +169,8 @@ EXPORT_SYMBOL(padata_do_parallel);
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
int cpu, num_cpus;
- int next_nr, next_index;
- struct padata_parallel_queue *queue, *next_queue;
+ unsigned int next_nr, next_index;
+ struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
@@ -189,14 +185,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
- if (unlikely(next_nr > pd->max_seq_nr)) {
- next_nr = next_nr - pd->max_seq_nr - 1;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
- next_queue = per_cpu_ptr(pd->pqueue, cpu);
- pd->processed = 0;
- }
-
padata = NULL;
reorder = &next_queue->reorder;
@@ -205,8 +193,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
padata = list_entry(reorder->list.next,
struct padata_priv, list);
- BUG_ON(next_nr != padata->seq_nr);
-
spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
@@ -217,8 +203,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
goto out;
}
- queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
- if (queue->cpu_index == next_queue->cpu_index) {
+ if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
goto out;
}
@@ -230,6 +215,7 @@ out:
static void padata_reorder(struct parallel_data *pd)
{
+ int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)
return;
}
- squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
+ cb_cpu = padata->cb_cpu;
+ squeue = per_cpu_ptr(pd->squeue, cb_cpu);
spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
spin_unlock(&squeue->serial.lock);
- queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
+ queue_work_on(cb_cpu, pinst->wq, &squeue->work);
}
spin_unlock_bh(&pd->lock);
@@ -367,13 +354,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
return -ENOMEM;
- cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
+ cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
free_cpumask_var(pd->cpumask.cbcpu);
return -ENOMEM;
}
- cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
+ cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
return 0;
}
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)
/* Initialize all percpu queues used by parallel workers */
static void padata_init_pqueues(struct parallel_data *pd)
{
- int cpu_index, num_cpus, cpu;
+ int cpu_index, cpu;
struct padata_parallel_queue *pqueue;
cpu_index = 0;
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
INIT_WORK(&pqueue->work, padata_parallel_worker);
atomic_set(&pqueue->num_obj, 0);
}
-
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
- pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
}
/* Allocate and initialize the internal cpumask dependend resources. */
@@ -580,7 +564,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
static bool padata_validate_cpumask(struct padata_instance *pinst,
const struct cpumask *cpumask)
{
- if (!cpumask_intersects(cpumask, cpu_active_mask)) {
+ if (!cpumask_intersects(cpumask, cpu_online_mask)) {
pinst->flags |= PADATA_INVALID;
return false;
}
@@ -694,7 +678,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd;
- if (cpumask_test_cpu(cpu, cpu_active_mask)) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask)) {
pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
pinst->cpumask.cbcpu);
if (!pd)
@@ -762,6 +746,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
return -ENOMEM;
padata_replace(pinst, pd);
+
+ cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
+ cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
}
return 0;
@@ -858,6 +845,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
@@ -869,6 +858,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
@@ -877,22 +868,6 @@ static int padata_cpu_callback(struct notifier_block *nfb,
if (err)
return notifier_from_errno(err);
break;
-
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- __padata_remove_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
-
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- __padata_add_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
}
return NOTIFY_OK;
@@ -1098,18 +1073,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
pinst->flags = 0;
-#ifdef CONFIG_HOTPLUG_CPU
- pinst->cpu_notifier.notifier_call = padata_cpu_callback;
- pinst->cpu_notifier.priority = 0;
- register_hotcpu_notifier(&pinst->cpu_notifier);
-#endif
-
put_online_cpus();
BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
+#ifdef CONFIG_HOTPLUG_CPU
+ pinst->cpu_notifier.notifier_call = padata_cpu_callback;
+ pinst->cpu_notifier.priority = 0;
+ register_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
return pinst;
err_free_masks:
diff --git a/kernel/panic.c b/kernel/panic.c
index 80aed44e345..62e16cef9cc 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -15,6 +15,7 @@
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/random.h>
+#include <linux/ftrace.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
@@ -22,18 +23,18 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
-#include <linux/dmi.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
-int panic_on_oops;
+int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
+static bool crash_kexec_post_notifiers;
-int panic_timeout;
+int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -75,6 +76,14 @@ void panic(const char *fmt, ...)
int state = 0;
/*
+ * Disable local interrupts. This will prevent panic_smp_self_stop
+ * from deadlocking the first cpu that invokes the panic, since
+ * there is nothing to prevent an interrupt handler (that runs
+ * after the panic_lock is acquired) from invoking panic again.
+ */
+ local_irq_disable();
+
+ /*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
@@ -92,23 +101,23 @@ void panic(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+ pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!oops_in_progress)
+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
dump_stack();
#endif
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
- * Do we want to call this before we try to display a message?
+ * If we want to run this after calling panic_notifiers, pass
+ * the "crash_kexec_post_notifiers" option to the kernel.
*/
- crash_kexec(NULL);
-
- kmsg_dump(KMSG_DUMP_PANIC);
+ if (!crash_kexec_post_notifiers)
+ crash_kexec(NULL);
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -117,8 +126,23 @@ void panic(const char *fmt, ...)
*/
smp_send_stop();
+ /*
+ * Run any panic handlers, including those that might need to
+ * add information to the kmsg dump output.
+ */
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+ kmsg_dump(KMSG_DUMP_PANIC);
+
+ /*
+ * If you doubt kdump always works fine in any situation,
+ * "crash_kexec_post_notifiers" offers you a chance to run
+ * panic_notifiers and dumping kmsg before kdump.
+ * Note: since some panic_notifiers can make crashed kernel
+ * more unstable, it can increase risks of the kdump failure too.
+ */
+ crash_kexec(NULL);
+
bust_spinlocks(0);
if (!panic_blink)
@@ -129,7 +153,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -153,7 +177,7 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
- printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
+ pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
@@ -164,6 +188,7 @@ void panic(const char *fmt, ...)
disabled_wait(caller);
}
#endif
+ pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -187,7 +212,7 @@ struct tnt {
static const struct tnt tnts[] = {
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_UNSAFE_SMP, 'S', ' ' },
+ { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
@@ -198,6 +223,7 @@ static const struct tnt tnts[] = {
{ TAINT_CRAP, 'C', ' ' },
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
{ TAINT_OOT_MODULE, 'O', ' ' },
+ { TAINT_UNSIGNED_MODULE, 'E', ' ' },
};
/**
@@ -216,12 +242,13 @@ static const struct tnt tnts[] = {
* 'C' - modules from drivers/staging are loaded.
* 'I' - Working around severe firmware bug.
* 'O' - Out-of-tree module has been loaded.
+ * 'E' - Unsigned module has been loaded.
*
* The string is overwritten by the next call to print_tainted().
*/
const char *print_tainted(void)
{
- static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
+ static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
if (tainted_mask) {
char *s;
@@ -251,26 +278,18 @@ unsigned long get_taint(void)
return tainted_mask;
}
-void add_taint(unsigned flag)
+/**
+ * add_taint: add a taint flag if not already set.
+ * @flag: one of the TAINT_* constants.
+ * @lockdep_ok: whether lock debugging is still OK.
+ *
+ * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for
+ * some notewortht-but-not-corrupting cases, it can be set to true.
+ */
+void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
- /*
- * Can't trust the integrity of the kernel anymore.
- * We don't call directly debug_locks_off() because the issue
- * is not necessarily serious enough to set oops_in_progress to 1
- * Also we want to keep up lockdep for staging/out-of-tree
- * development and post-warning case.
- */
- switch (flag) {
- case TAINT_CRAP:
- case TAINT_OOT_MODULE:
- case TAINT_WARN:
- case TAINT_FIRMWARE_WORKAROUND:
- break;
-
- default:
- if (__debug_locks_off())
- printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
- }
+ if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
+ pr_warn("Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
@@ -375,8 +394,7 @@ late_initcall(init_oops_id);
void print_oops_end_marker(void)
{
init_oops_id();
- printk(KERN_WARNING "---[ end trace %016llx ]---\n",
- (unsigned long long)oops_id);
+ pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
}
/*
@@ -399,13 +417,11 @@ struct slowpath_args {
static void warn_slowpath_common(const char *file, int line, void *caller,
unsigned taint, struct slowpath_args *args)
{
- const char *board;
+ disable_trace_on_warning();
- printk(KERN_WARNING "------------[ cut here ]------------\n");
- printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (board)
- printk(KERN_WARNING "Hardware name: %s\n", board);
+ pr_warn("------------[ cut here ]------------\n");
+ pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
+ raw_smp_processor_id(), current->pid, file, line, caller);
if (args)
vprintk(args->fmt, args->args);
@@ -413,7 +429,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
print_modules();
dump_stack();
print_oops_end_marker();
- add_taint(taint);
+ /* Just a warning, don't kill lockdep. */
+ add_taint(taint, LOCKDEP_STILL_OK);
}
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -455,7 +472,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
-void __stack_chk_fail(void)
+__visible void __stack_chk_fail(void)
{
panic("stack-protector: Kernel stack is corrupted in: %p\n",
__builtin_return_address(0));
@@ -467,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+static int __init setup_crash_kexec_post_notifiers(char *s)
+{
+ crash_kexec_post_notifiers = true;
+ return 0;
+}
+early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+
static int __init oops_setup(char *s)
{
if (!s)
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1f..1e52ca233fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
@@ -86,9 +85,13 @@ bool parameq(const char *a, const char *b)
static int parse_one(char *param,
char *val,
+ const char *doing,
const struct kernel_param *params,
unsigned num_params,
- int (*handle_unknown)(char *param, char *val))
+ s16 min_level,
+ s16 max_level,
+ int (*handle_unknown)(char *param, char *val,
+ const char *doing))
{
unsigned int i;
int err;
@@ -96,12 +99,15 @@ static int parse_one(char *param,
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
+ if (params[i].level < min_level
+ || params[i].level > max_level)
+ return 0;
/* No one handled NULL, so do it here. */
- if (!val && params[i].ops->set != param_set_bool
- && params[i].ops->set != param_set_bint)
+ if (!val &&
+ !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
return -EINVAL;
- pr_debug("They are equal! Calling %p\n",
- params[i].ops->set);
+ pr_debug("handling %s with %p\n", param,
+ params[i].ops->set);
mutex_lock(&param_lock);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
@@ -110,11 +116,11 @@ static int parse_one(char *param,
}
if (handle_unknown) {
- pr_debug("Unknown argument: calling %p\n", handle_unknown);
- return handle_unknown(param, val);
+ pr_debug("doing %s: %s='%s'\n", doing, param, val);
+ return handle_unknown(param, val, doing);
}
- pr_debug("Unknown argument `%s'\n", param);
+ pr_debug("Unknown argument '%s'\n", param);
return -ENOENT;
}
@@ -171,70 +177,68 @@ static char *next_arg(char *args, char **param, char **val)
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
-int parse_args(const char *name,
- char *args,
- const struct kernel_param *params,
- unsigned num,
- int (*unknown)(char *param, char *val))
+char *parse_args(const char *doing,
+ char *args,
+ const struct kernel_param *params,
+ unsigned num,
+ s16 min_level,
+ s16 max_level,
+ int (*unknown)(char *param, char *val, const char *doing))
{
char *param, *val;
- pr_debug("Parsing ARGS: %s\n", args);
-
/* Chew leading spaces */
args = skip_spaces(args);
+ if (*args)
+ pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
+
while (*args) {
int ret;
int irq_was_disabled;
args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0)
+ return args;
irq_was_disabled = irqs_disabled();
- ret = parse_one(param, val, params, num, unknown);
- if (irq_was_disabled && !irqs_disabled()) {
- printk(KERN_WARNING "parse_args(): option '%s' enabled "
- "irq's!\n", param);
- }
+ ret = parse_one(param, val, doing, params, num,
+ min_level, max_level, unknown);
+ if (irq_was_disabled && !irqs_disabled())
+ pr_warn("%s: option '%s' enabled irq's!\n",
+ doing, param);
+
switch (ret) {
case -ENOENT:
- printk(KERN_ERR "%s: Unknown parameter `%s'\n",
- name, param);
- return ret;
+ pr_err("%s: Unknown parameter `%s'\n", doing, param);
+ return ERR_PTR(ret);
case -ENOSPC:
- printk(KERN_ERR
- "%s: `%s' too large for parameter `%s'\n",
- name, val ?: "", param);
- return ret;
+ pr_err("%s: `%s' too large for parameter `%s'\n",
+ doing, val ?: "", param);
+ return ERR_PTR(ret);
case 0:
break;
default:
- printk(KERN_ERR
- "%s: `%s' invalid for parameter `%s'\n",
- name, val ?: "", param);
- return ret;
+ pr_err("%s: `%s' invalid for parameter `%s'\n",
+ doing, val ?: "", param);
+ return ERR_PTR(ret);
}
}
/* All parsed OK. */
- return 0;
+ return NULL;
}
/* Lazy bastard, eh? */
-#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
+#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \
int param_set_##name(const char *val, const struct kernel_param *kp) \
{ \
- tmptype l; \
- int ret; \
- \
- ret = strtolfn(val, 0, &l); \
- if (ret < 0 || ((type)l != l)) \
- return ret < 0 ? ret : -EINVAL; \
- *((type *)kp->arg) = l; \
- return 0; \
+ return strtolfn(val, 0, (type *)kp->arg); \
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return sprintf(buffer, format, *((type *)kp->arg)); \
+ return scnprintf(buffer, PAGE_SIZE, format, \
+ *((type *)kp->arg)); \
} \
struct kernel_param_ops param_ops_##name = { \
.set = param_set_##name, \
@@ -245,19 +249,18 @@ int parse_args(const char *name,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
if (strlen(val) > 1024) {
- printk(KERN_ERR "%s: string parameter too long\n",
- kp->name);
+ pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
@@ -279,7 +282,7 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
@@ -298,39 +301,23 @@ EXPORT_SYMBOL(param_ops_charp);
/* Actually could be a bool or an int, for historical reasons. */
int param_set_bool(const char *val, const struct kernel_param *kp)
{
- bool v;
- int ret;
-
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
- ret = strtobool(val, &v);
- if (ret)
- return ret;
-
- if (kp->flags & KPARAM_ISBOOL)
- *(bool *)kp->arg = v;
- else
- *(int *)kp->arg = v;
- return 0;
+ return strtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
- bool val;
- if (kp->flags & KPARAM_ISBOOL)
- val = *(bool *)kp->arg;
- else
- val = *(int *)kp->arg;
-
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", val ? 'Y' : 'N');
+ return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
struct kernel_param_ops param_ops_bool = {
+ .flags = KERNEL_PARAM_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
@@ -344,7 +331,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
struct kernel_param dummy;
dummy.arg = &boolval;
- dummy.flags = KPARAM_ISBOOL;
ret = param_set_bool(val, &dummy);
if (ret == 0)
*(bool *)kp->arg = !boolval;
@@ -373,7 +359,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
/* Match bool exactly, by re-using it. */
boolkp = *kp;
boolkp.arg = &v;
- boolkp.flags |= KPARAM_ISBOOL;
ret = param_set_bool(val, &boolkp);
if (ret == 0)
@@ -383,6 +368,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
EXPORT_SYMBOL(param_set_bint);
struct kernel_param_ops param_ops_bint = {
+ .flags = KERNEL_PARAM_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
};
@@ -394,7 +380,7 @@ static int param_array(const char *name,
unsigned int min, unsigned int max,
void *elem, int elemsize,
int (*set)(const char *, const struct kernel_param *kp),
- u16 flags,
+ s16 level,
unsigned int *num)
{
int ret;
@@ -404,7 +390,7 @@ static int param_array(const char *name,
/* Get the name right for errors. */
kp.name = name;
kp.arg = elem;
- kp.flags = flags;
+ kp.level = level;
*num = 0;
/* We expect a comma-separated list of values. */
@@ -412,8 +398,7 @@ static int param_array(const char *name,
int len;
if (*num == max) {
- printk(KERN_ERR "%s: can only take %i arguments\n",
- name, max);
+ pr_err("%s: can only take %i arguments\n", name, max);
return -EINVAL;
}
len = strcspn(val, ",");
@@ -432,8 +417,7 @@ static int param_array(const char *name,
} while (save == ',');
if (*num < min) {
- printk(KERN_ERR "%s: needs at least %i arguments\n",
- name, min);
+ pr_err("%s: needs at least %i arguments\n", name, min);
return -EINVAL;
}
return 0;
@@ -445,7 +429,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
unsigned int temp_num;
return param_array(kp->name, val, 1, arr->max, arr->elem,
- arr->elemsize, arr->ops->set, kp->flags,
+ arr->elemsize, arr->ops->set, kp->level,
arr->num ?: &temp_num);
}
@@ -492,7 +476,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
const struct kparam_string *kps = kp->str;
if (strlen(val)+1 > kps->maxlen) {
- printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+ pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
@@ -628,10 +612,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
GFP_KERNEL);
if (!new) {
- kfree(mk->mp);
+ kfree(attrs);
err = -ENOMEM;
goto fail;
}
+ /* Despite looking like the typical realloc() bug, this is safe.
+ * We *want* the old 'attrs' to be freed either way, and we'll store
+ * the new one in the success case. */
attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
if (!attrs) {
err = -ENOMEM;
@@ -762,11 +749,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
#endif
if (err) {
kobject_put(&mk->kobj);
- printk(KERN_ERR
- "Module '%s' failed add to sysfs, error number %d\n",
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
name, err);
- printk(KERN_ERR
- "The system will be unstable now.\n");
return NULL;
}
@@ -802,7 +786,7 @@ static void __init kernel_add_sysfs_param(const char *name,
}
/*
- * param_sysfs_builtin - add contents in /sys/parameters for built-in modules
+ * param_sysfs_builtin - add sysfs parameters for built-in modules
*
* Add module_parameters to sysfs for "modules" built into the kernel.
*
@@ -842,7 +826,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
struct module_version_attribute *vattr =
container_of(mattr, struct module_version_attribute, mattr);
- return sprintf(buf, "%s\n", vattr->version);
+ return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
}
extern const struct module_version_attribute *__start___modver[];
@@ -927,7 +911,14 @@ static const struct kset_uevent_ops module_uevent_ops = {
struct kset *module_kset;
int module_sysfs_initialized;
+static void module_kobj_release(struct kobject *kobj)
+{
+ struct module_kobject *mk = to_module_kobject(kobj);
+ complete(mk->kobj_completion);
+}
+
struct kobj_type module_ktype = {
+ .release = module_kobj_release,
.sysfs_ops = &module_sysfs_ops,
};
diff --git a/kernel/pid.c b/kernel/pid.c
index ce8e00deacc..9b9a2669814 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
- * (C) 2002-2003 William Irwin, IBM
- * (C) 2004 William Irwin, Oracle
+ * (C) 2002-2003 Nadia Yvette Chambers, IBM
+ * (C) 2004 Nadia Yvette Chambers, Oracle
* (C) 2002-2004 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
@@ -36,6 +36,8 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/proc_ns.h>
+#include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -50,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#define BITS_PER_PAGE (PAGE_SIZE*8)
-#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-
static inline int mk_pid(struct pid_namespace *pid_ns,
struct pidmap *map, int off)
{
@@ -76,26 +75,14 @@ struct pid_namespace init_pid_ns = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
.last_pid = 0,
+ .nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
+ .user_ns = &init_user_ns,
+ .proc_inum = PROC_PID_INIT_INO,
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-int is_container_init(struct task_struct *tsk)
-{
- int ret = 0;
- struct pid *pid;
-
- rcu_read_lock();
- pid = task_pid(tsk);
- if (pid != NULL && pid->numbers[pid->level].nr == 1)
- ret = 1;
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL(is_container_init);
-
/*
* Note: disable interrupts while the pidmap_lock is held as an
* interrupt might come in and do read_lock(&tasklist_lock).
@@ -195,15 +182,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
break;
}
if (likely(atomic_read(&map->nr_free))) {
- do {
+ for ( ; ; ) {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
set_last_pid(pid_ns, last, pid);
return pid;
}
offset = find_next_offset(map, offset);
+ if (offset >= BITS_PER_PAGE)
+ break;
pid = mk_pid(pid_ns, map, offset);
- } while (offset < BITS_PER_PAGE && pid < pid_max);
+ if (pid >= pid_max)
+ break;
+ }
}
if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
@@ -269,8 +260,29 @@ void free_pid(struct pid *pid)
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- hlist_del_rcu(&pid->numbers[i].pid_chain);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+ hlist_del_rcu(&upid->pid_chain);
+ switch(--ns->nr_hashed) {
+ case 2:
+ case 1:
+ /* When all that is left in the pid namespace
+ * is the reaper wake up the reaper. The reaper
+ * may be sleeping in zap_pid_ns_processes().
+ */
+ wake_up_process(ns->child_reaper);
+ break;
+ case PIDNS_HASH_ADDING:
+ /* Handle a fork failure of the first process */
+ WARN_ON(ns->child_reaper);
+ ns->nr_hashed = 0;
+ /* fall through */
+ case 0:
+ schedule_work(&ns->proc_work);
+ break;
+ }
+ }
spin_unlock_irqrestore(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
@@ -292,6 +304,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
goto out;
tmp = ns;
+ pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
@@ -302,22 +315,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = tmp->parent;
}
+ if (unlikely(is_child_reaper(pid))) {
+ if (pid_ns_prepare_proc(ns))
+ goto out_free;
+ }
+
get_pid_ns(ns);
- pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- for ( ; upid >= pid->numbers; --upid)
+ if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ goto out_unlock;
+ for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ upid->ns->nr_hashed++;
+ }
spin_unlock_irq(&pidmap_lock);
out:
return pid;
+out_unlock:
+ spin_unlock_irq(&pidmap_lock);
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
@@ -327,12 +350,18 @@ out_free:
goto out;
}
+void disable_pid_allocation(struct pid_namespace *ns)
+{
+ spin_lock_irq(&pidmap_lock);
+ ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ spin_unlock_irq(&pidmap_lock);
+}
+
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct hlist_node *elem;
struct upid *pnr;
- hlist_for_each_entry_rcu(pnr, elem,
+ hlist_for_each_entry_rcu(pnr,
&pid_hash[pid_hashfn(nr, ns)], pid_chain)
if (pnr->nr == nr && pnr->ns == ns)
return container_of(pnr, struct pid,
@@ -344,21 +373,17 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
- return find_pid_ns(nr, current->nsproxy->pid_ns);
+ return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
-void attach_pid(struct task_struct *task, enum pid_type type,
- struct pid *pid)
+void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid_link *link;
-
- link = &task->pids[type];
- link->pid = pid;
- hlist_add_head_rcu(&link->node, &pid->tasks[type]);
+ struct pid_link *link = &task->pids[type];
+ hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
}
static void __change_pid(struct task_struct *task, enum pid_type type,
@@ -390,7 +415,7 @@ void change_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
__change_pid(task, type, pid);
- attach_pid(task, type, pid);
+ attach_pid(task, type);
}
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
@@ -428,7 +453,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -479,10 +504,11 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
}
return nr;
}
+EXPORT_SYMBOL_GPL(pid_nr_ns);
pid_t pid_vnr(struct pid *pid)
{
- return pid_nr_ns(pid, current->nsproxy->pid_ns);
+ return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
@@ -493,7 +519,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
@@ -543,12 +569,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- int i, pidhash_size;
+ unsigned int i, pidhash_size;
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL,
- &pidhash_shift, NULL, 4096);
- pidhash_size = 1 << pidhash_shift;
+ &pidhash_shift, NULL,
+ 0, 4096);
+ pidhash_size = 1U << pidhash_shift;
for (i = 0; i < pidhash_size; i++)
INIT_HLIST_HEAD(&pid_hash[i]);
@@ -556,6 +583,9 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
+ /* Veryify no one has done anything silly */
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046..db95d8eb761 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,13 +10,14 @@
#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
-#include <linux/proc_fs.h>
-
-#define BITS_PER_PAGE (PAGE_SIZE*8)
+#include <linux/proc_ns.h>
+#include <linux/reboot.h>
+#include <linux/export.h>
struct pid_cache {
int nr_ids;
@@ -69,12 +70,29 @@ err_alloc:
return NULL;
}
-static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
+static void proc_cleanup_work(struct work_struct *work)
+{
+ struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+ pid_ns_release_proc(ns);
+}
+
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+ struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i, err = -ENOMEM;
+ int i;
+ int err;
+
+ if (level > MAX_PID_NS_LEVEL) {
+ err = -EINVAL;
+ goto out;
+ }
+ err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
goto out;
@@ -87,9 +105,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
if (ns->pid_cachep == NULL)
goto out_free_map;
+ err = proc_alloc_inum(&ns->proc_inum);
+ if (err)
+ goto out_free_map;
+
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
+ ns->user_ns = get_user_ns(user_ns);
+ ns->nr_hashed = PIDNS_HASH_ADDING;
+ INIT_WORK(&ns->proc_work, proc_cleanup_work);
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -97,14 +122,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
- err = pid_ns_prepare_proc(ns);
- if (err)
- goto out_put_parent_pid_ns;
-
return ns;
-out_put_parent_pid_ns:
- put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
@@ -113,42 +132,68 @@ out:
return ERR_PTR(err);
}
+static void delayed_free_pidns(struct rcu_head *p)
+{
+ kmem_cache_free(pid_ns_cachep,
+ container_of(p, struct pid_namespace, rcu));
+}
+
static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
+ proc_free_inum(ns->proc_inum);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
- kmem_cache_free(pid_ns_cachep, ns);
+ put_user_ns(ns->user_ns);
+ call_rcu(&ns->rcu, delayed_free_pidns);
}
-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns);
- if (flags & (CLONE_THREAD|CLONE_PARENT))
+ if (task_active_pid_ns(current) != old_ns)
return ERR_PTR(-EINVAL);
- return create_pid_namespace(old_ns);
+ return create_pid_namespace(user_ns, old_ns);
}
-void free_pid_ns(struct kref *kref)
+static void free_pid_ns(struct kref *kref)
{
- struct pid_namespace *ns, *parent;
+ struct pid_namespace *ns;
ns = container_of(kref, struct pid_namespace, kref);
-
- parent = ns->parent;
destroy_pid_namespace(ns);
+}
+
+void put_pid_ns(struct pid_namespace *ns)
+{
+ struct pid_namespace *parent;
- if (parent != NULL)
- put_pid_ns(parent);
+ while (ns != &init_pid_ns) {
+ parent = ns->parent;
+ if (!kref_put(&ns->kref, free_pid_ns))
+ break;
+ ns = parent;
+ }
}
+EXPORT_SYMBOL_GPL(put_pid_ns);
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
int rc;
- struct task_struct *task;
+ struct task_struct *task, *me = current;
+ int init_pids = thread_group_leader(me) ? 1 : 2;
+
+ /* Don't allow any more processes into the pid namespace */
+ disable_pid_allocation(pid_ns);
+
+ /* Ignore SIGCHLD causing any terminated children to autoreap */
+ spin_lock_irq(&me->sighand->siglock);
+ me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
+ spin_unlock_irq(&me->sighand->siglock);
/*
* The last thread in the cgroup-init thread group is terminating.
@@ -168,13 +213,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
while (nr > 0) {
rcu_read_lock();
- /*
- * Any nested-container's init processes won't ignore the
- * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
- */
task = pid_task(find_vpid(nr), PIDTYPE_PID);
- if (task)
- send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
+ if (task && !__fatal_signal_pending(task))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
rcu_read_unlock();
@@ -182,21 +223,39 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
read_unlock(&tasklist_lock);
+ /* Firstly reap the EXIT_ZOMBIE children we may have. */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
+ /*
+ * sys_wait4() above can't reap the TASK_DEAD children.
+ * Make sure they all go away, see free_pid().
+ */
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (pid_ns->nr_hashed == init_pids)
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ if (pid_ns->reboot)
+ current->signal->group_exit_code = pid_ns->reboot;
+
acct_exit_ns(pid_ns);
return;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
- if (write && !capable(CAP_SYS_ADMIN))
+ if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -205,26 +264,126 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &current->nsproxy->pid_ns->last_pid;
- return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ tmp.data = &pid_ns->last_pid;
+ return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}
+extern int pid_max;
+static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
+ .extra1 = &zero,
+ .extra2 = &pid_max,
},
{ }
};
-
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
+{
+ if (pid_ns == &init_pid_ns)
+ return 0;
+
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART2:
+ case LINUX_REBOOT_CMD_RESTART:
+ pid_ns->reboot = SIGHUP;
+ break;
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ case LINUX_REBOOT_CMD_HALT:
+ pid_ns->reboot = SIGINT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ read_lock(&tasklist_lock);
+ force_sig(SIGKILL, pid_ns->child_reaper);
+ read_unlock(&tasklist_lock);
+
+ do_exit(0);
+
+ /* Not reached */
+ return 0;
+}
+
+static void *pidns_get(struct task_struct *task)
+{
+ struct pid_namespace *ns;
+
+ rcu_read_lock();
+ ns = task_active_pid_ns(task);
+ if (ns)
+ get_pid_ns(ns);
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void pidns_put(void *ns)
+{
+ put_pid_ns(ns);
+}
+
+static int pidns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *ancestor, *new = ns;
+
+ if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Only allow entering the current active pid namespace
+ * or a child of the current active pid namespace.
+ *
+ * This is required for fork to return a usable pid value and
+ * this maintains the property that processes and their
+ * children can not escape their current pid namespace.
+ */
+ if (new->level < active->level)
+ return -EINVAL;
+
+ ancestor = new;
+ while (ancestor->level > active->level)
+ ancestor = ancestor->parent;
+ if (ancestor != active)
+ return -EINVAL;
+
+ put_pid_ns(nsproxy->pid_ns_for_children);
+ nsproxy->pid_ns_for_children = get_pid_ns(new);
+ return 0;
+}
+
+static unsigned int pidns_inum(void *ns)
+{
+ struct pid_namespace *pid_ns = ns;
+ return pid_ns->proc_inum;
+}
+
+const struct proc_ns_operations pidns_operations = {
+ .name = "pid",
+ .type = CLONE_NEWPID,
+ .get = pidns_get,
+ .put = pidns_put,
+ .install = pidns_install,
+ .inum = pidns_inum,
+};
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
+#endif
return 0;
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa2..3b8946416a5 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,9 @@
#include <asm/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
+#include <linux/random.h>
+#include <linux/tick.h>
+#include <linux/workqueue.h>
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
@@ -48,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
return error;
}
-static inline union cpu_time_count
+static inline unsigned long long
timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
{
- union cpu_time_count ret;
- ret.sched = 0; /* high half always zero when .cpu used */
+ unsigned long long ret;
+
+ ret = 0; /* high half always zero when .cpu used */
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+ ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
} else {
- ret.cpu = timespec_to_cputime(tp);
+ ret = cputime_to_expires(timespec_to_cputime(tp));
}
return ret;
}
static void sample_to_timespec(const clockid_t which_clock,
- union cpu_time_count cpu,
+ unsigned long long expires,
struct timespec *tp)
{
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
- *tp = ns_to_timespec(cpu.sched);
+ *tp = ns_to_timespec(expires);
else
- cputime_to_timespec(cpu.cpu, tp);
-}
-
-static inline int cpu_time_before(const clockid_t which_clock,
- union cpu_time_count now,
- union cpu_time_count then)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- return now.sched < then.sched;
- } else {
- return now.cpu < then.cpu;
- }
-}
-static inline void cpu_time_add(const clockid_t which_clock,
- union cpu_time_count *acc,
- union cpu_time_count val)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- acc->sched += val.sched;
- } else {
- acc->cpu += val.cpu;
- }
-}
-static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
- union cpu_time_count a,
- union cpu_time_count b)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- a.sched -= b.sched;
- } else {
- a.cpu -= b.cpu;
- }
- return a;
+ cputime_to_timespec((__force cputime_t)expires, tp);
}
/*
@@ -108,57 +80,64 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
* given the current clock sample.
*/
static void bump_cpu_timer(struct k_itimer *timer,
- union cpu_time_count now)
+ unsigned long long now)
{
int i;
+ unsigned long long delta, incr;
- if (timer->it.cpu.incr.sched == 0)
+ if (timer->it.cpu.incr == 0)
return;
- if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
- unsigned long long delta, incr;
+ if (now < timer->it.cpu.expires)
+ return;
- if (now.sched < timer->it.cpu.expires.sched)
- return;
- incr = timer->it.cpu.incr.sched;
- delta = now.sched + incr - timer->it.cpu.expires.sched;
- /* Don't use (incr*2 < delta), incr*2 might overflow. */
- for (i = 0; incr < delta - incr; i++)
- incr = incr << 1;
- for (; i >= 0; incr >>= 1, i--) {
- if (delta < incr)
- continue;
- timer->it.cpu.expires.sched += incr;
- timer->it_overrun += 1 << i;
- delta -= incr;
- }
- } else {
- cputime_t delta, incr;
+ incr = timer->it.cpu.incr;
+ delta = now + incr - timer->it.cpu.expires;
- if (now.cpu < timer->it.cpu.expires.cpu)
- return;
- incr = timer->it.cpu.incr.cpu;
- delta = now.cpu + incr - timer->it.cpu.expires.cpu;
- /* Don't use (incr*2 < delta), incr*2 might overflow. */
- for (i = 0; incr < delta - incr; i++)
- incr += incr;
- for (; i >= 0; incr = incr >> 1, i--) {
- if (delta < incr)
- continue;
- timer->it.cpu.expires.cpu += incr;
- timer->it_overrun += 1 << i;
- delta -= incr;
- }
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; incr < delta - incr; i++)
+ incr = incr << 1;
+
+ for (; i >= 0; incr >>= 1, i--) {
+ if (delta < incr)
+ continue;
+
+ timer->it.cpu.expires += incr;
+ timer->it_overrun += 1 << i;
+ delta -= incr;
}
}
-static inline cputime_t prof_ticks(struct task_struct *p)
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
{
- return p->utime + p->stime;
+ if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
+ return 1;
+ return 0;
}
-static inline cputime_t virt_ticks(struct task_struct *p)
+
+static inline unsigned long long prof_ticks(struct task_struct *p)
{
- return p->utime;
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+
+ return cputime_to_expires(utime + stime);
+}
+static inline unsigned long long virt_ticks(struct task_struct *p)
+{
+ cputime_t utime;
+
+ task_cputime(p, &utime, NULL);
+
+ return cputime_to_expires(utime);
}
static int
@@ -199,48 +178,24 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
* Sample a per-thread clock for the given task.
*/
static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
- union cpu_time_count *cpu)
+ unsigned long long *sample)
{
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
- cpu->cpu = prof_ticks(p);
+ *sample = prof_ticks(p);
break;
case CPUCLOCK_VIRT:
- cpu->cpu = virt_ticks(p);
+ *sample = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = task_sched_runtime(p);
+ *sample = task_sched_runtime(p);
break;
}
return 0;
}
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct signal_struct *sig = tsk->signal;
- struct task_struct *t;
-
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
-
- rcu_read_lock();
- /* make sure we can trust tsk->thread_group list */
- if (!likely(pid_alive(tsk)))
- goto out;
-
- t = tsk;
- do {
- times->utime += t->utime;
- times->stime += t->stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- } while_each_thread(tsk, t);
-out:
- rcu_read_unlock();
-}
-
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
{
if (b->utime > a->utime)
@@ -278,11 +233,12 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
/*
* Sample a process (thread group) clock for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
+ * Must be called with task sighand lock held for safe while_each_thread()
+ * traversal.
*/
static int cpu_clock_sample_group(const clockid_t which_clock,
struct task_struct *p,
- union cpu_time_count *cpu)
+ unsigned long long *sample)
{
struct task_cputime cputime;
@@ -291,44 +247,67 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
return -EINVAL;
case CPUCLOCK_PROF:
thread_group_cputime(p, &cputime);
- cpu->cpu = cputime.utime + cputime.stime;
+ *sample = cputime_to_expires(cputime.utime + cputime.stime);
break;
case CPUCLOCK_VIRT:
thread_group_cputime(p, &cputime);
- cpu->cpu = cputime.utime;
+ *sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
thread_group_cputime(p, &cputime);
- cpu->sched = cputime.sum_exec_runtime;
+ *sample = cputime.sum_exec_runtime;
break;
}
return 0;
}
+static int posix_cpu_clock_get_task(struct task_struct *tsk,
+ const clockid_t which_clock,
+ struct timespec *tp)
+{
+ int err = -EINVAL;
+ unsigned long long rtn;
+
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ if (same_thread_group(tsk, current))
+ err = cpu_clock_sample(which_clock, tsk, &rtn);
+ } else {
+ unsigned long flags;
+ struct sighand_struct *sighand;
+
+ /*
+ * while_each_thread() is not yet entirely RCU safe,
+ * keep locking the group while sampling process
+ * clock for now.
+ */
+ sighand = lock_task_sighand(tsk, &flags);
+ if (!sighand)
+ return err;
+
+ if (tsk == current || thread_group_leader(tsk))
+ err = cpu_clock_sample_group(which_clock, tsk, &rtn);
+
+ unlock_task_sighand(tsk, &flags);
+ }
+
+ if (!err)
+ sample_to_timespec(which_clock, rtn, tp);
+
+ return err;
+}
+
static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
{
const pid_t pid = CPUCLOCK_PID(which_clock);
- int error = -EINVAL;
- union cpu_time_count rtn;
+ int err = -EINVAL;
if (pid == 0) {
/*
* Special case constant value for our own clocks.
* We don't have to do any lookup to find ourselves.
*/
- if (CPUCLOCK_PERTHREAD(which_clock)) {
- /*
- * Sampling just ourselves we can do with no locking.
- */
- error = cpu_clock_sample(which_clock,
- current, &rtn);
- } else {
- read_lock(&tasklist_lock);
- error = cpu_clock_sample_group(which_clock,
- current, &rtn);
- read_unlock(&tasklist_lock);
- }
+ err = posix_cpu_clock_get_task(current, which_clock, tp);
} else {
/*
* Find the given PID, and validate that the caller
@@ -337,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
struct task_struct *p;
rcu_read_lock();
p = find_task_by_vpid(pid);
- if (p) {
- if (CPUCLOCK_PERTHREAD(which_clock)) {
- if (same_thread_group(p, current)) {
- error = cpu_clock_sample(which_clock,
- p, &rtn);
- }
- } else {
- read_lock(&tasklist_lock);
- if (thread_group_leader(p) && p->sighand) {
- error =
- cpu_clock_sample_group(which_clock,
- p, &rtn);
- }
- read_unlock(&tasklist_lock);
- }
- }
+ if (p)
+ err = posix_cpu_clock_get_task(p, which_clock, tp);
rcu_read_unlock();
}
- if (error)
- return error;
- sample_to_timespec(which_clock, rtn, tp);
- return 0;
+ return err;
}
@@ -416,75 +378,58 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
*/
static int posix_cpu_timer_del(struct k_itimer *timer)
{
- struct task_struct *p = timer->it.cpu.task;
int ret = 0;
+ unsigned long flags;
+ struct sighand_struct *sighand;
+ struct task_struct *p = timer->it.cpu.task;
- if (likely(p != NULL)) {
- read_lock(&tasklist_lock);
- if (unlikely(p->sighand == NULL)) {
- /*
- * We raced with the reaping of the task.
- * The deletion should have cleared us off the list.
- */
- BUG_ON(!list_empty(&timer->it.cpu.entry));
- } else {
- spin_lock(&p->sighand->siglock);
- if (timer->it.cpu.firing)
- ret = TIMER_RETRY;
- else
- list_del(&timer->it.cpu.entry);
- spin_unlock(&p->sighand->siglock);
- }
- read_unlock(&tasklist_lock);
+ WARN_ON_ONCE(p == NULL);
- if (!ret)
- put_task_struct(p);
+ /*
+ * Protect against sighand release/switch in exit/exec and process/
+ * thread timer list entry concurrent read/writes.
+ */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL)) {
+ /*
+ * We raced with the reaping of the task.
+ * The deletion should have cleared us off the list.
+ */
+ WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
+ } else {
+ if (timer->it.cpu.firing)
+ ret = TIMER_RETRY;
+ else
+ list_del(&timer->it.cpu.entry);
+
+ unlock_task_sighand(p, &flags);
}
+ if (!ret)
+ put_task_struct(p);
+
return ret;
}
+static void cleanup_timers_list(struct list_head *head)
+{
+ struct cpu_timer_list *timer, *next;
+
+ list_for_each_entry_safe(timer, next, head, entry)
+ list_del_init(&timer->entry);
+}
+
/*
* Clean out CPU timers still ticking when a thread exited. The task
* pointer is cleared, and the expiry time is replaced with the residual
* time for later timer_gettime calls to return.
* This must be called with the siglock held.
*/
-static void cleanup_timers(struct list_head *head,
- cputime_t utime, cputime_t stime,
- unsigned long long sum_exec_runtime)
+static void cleanup_timers(struct list_head *head)
{
- struct cpu_timer_list *timer, *next;
- cputime_t ptime = utime + stime;
-
- list_for_each_entry_safe(timer, next, head, entry) {
- list_del_init(&timer->entry);
- if (timer->expires.cpu < ptime) {
- timer->expires.cpu = 0;
- } else {
- timer->expires.cpu -= ptime;
- }
- }
-
- ++head;
- list_for_each_entry_safe(timer, next, head, entry) {
- list_del_init(&timer->entry);
- if (timer->expires.cpu < utime) {
- timer->expires.cpu = 0;
- } else {
- timer->expires.cpu -= utime;
- }
- }
-
- ++head;
- list_for_each_entry_safe(timer, next, head, entry) {
- list_del_init(&timer->entry);
- if (timer->expires.sched < sum_exec_runtime) {
- timer->expires.sched = 0;
- } else {
- timer->expires.sched -= sum_exec_runtime;
- }
- }
+ cleanup_timers_list(head);
+ cleanup_timers_list(++head);
+ cleanup_timers_list(++head);
}
/*
@@ -494,30 +439,14 @@ static void cleanup_timers(struct list_head *head,
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
- cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+ add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ sizeof(unsigned long long));
+ cleanup_timers(tsk->cpu_timers);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
- struct signal_struct *const sig = tsk->signal;
-
- cleanup_timers(tsk->signal->cpu_timers,
- tsk->utime + sig->utime, tsk->stime + sig->stime,
- tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
-}
-
-static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
-{
- /*
- * That's all for this thread or process.
- * We leave our residual in expires to be reported.
- */
- put_task_struct(timer->it.cpu.task);
- timer->it.cpu.task = NULL;
- timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
- timer->it.cpu.expires,
- now);
+ cleanup_timers(tsk->signal->cpu_timers);
}
static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -527,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
/*
* Insert the timer on the appropriate list before any timers that
- * expire later. This must be called with the tasklist_lock held
- * for reading, interrupts disabled and p->sighand->siglock taken.
+ * expire later. This must be called with the sighand lock held.
*/
static void arm_timer(struct k_itimer *timer)
{
@@ -549,14 +477,14 @@ static void arm_timer(struct k_itimer *timer)
listpos = head;
list_for_each_entry(next, head, entry) {
- if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
+ if (nt->expires < next->expires)
break;
listpos = &next->entry;
}
list_add(&nt->entry, listpos);
if (listpos == head) {
- union cpu_time_count *exp = &nt->expires;
+ unsigned long long exp = nt->expires;
/*
* We are the new earliest-expiring POSIX 1.b timer, hence
@@ -567,17 +495,17 @@ static void arm_timer(struct k_itimer *timer)
switch (CPUCLOCK_WHICH(timer->it_clock)) {
case CPUCLOCK_PROF:
- if (expires_gt(cputime_expires->prof_exp, exp->cpu))
- cputime_expires->prof_exp = exp->cpu;
+ if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
+ cputime_expires->prof_exp = expires_to_cputime(exp);
break;
case CPUCLOCK_VIRT:
- if (expires_gt(cputime_expires->virt_exp, exp->cpu))
- cputime_expires->virt_exp = exp->cpu;
+ if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
+ cputime_expires->virt_exp = expires_to_cputime(exp);
break;
case CPUCLOCK_SCHED:
if (cputime_expires->sched_exp == 0 ||
- cputime_expires->sched_exp > exp->sched)
- cputime_expires->sched_exp = exp->sched;
+ cputime_expires->sched_exp > exp)
+ cputime_expires->sched_exp = exp;
break;
}
}
@@ -592,20 +520,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
/*
* User don't want any signal.
*/
- timer->it.cpu.expires.sched = 0;
+ timer->it.cpu.expires = 0;
} else if (unlikely(timer->sigq == NULL)) {
/*
* This a special case for clock_nanosleep,
* not a normal timer from sys_timer_create.
*/
wake_up_process(timer->it_process);
- timer->it.cpu.expires.sched = 0;
- } else if (timer->it.cpu.incr.sched == 0) {
+ timer->it.cpu.expires = 0;
+ } else if (timer->it.cpu.incr == 0) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
posix_timer_event(timer, 0);
- timer->it.cpu.expires.sched = 0;
+ timer->it.cpu.expires = 0;
} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
/*
* The signal did not get queued because the signal
@@ -619,11 +547,12 @@ static void cpu_timer_fire(struct k_itimer *timer)
/*
* Sample a process (thread group) timer for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
+ * Must be called with task sighand lock held for safe while_each_thread()
+ * traversal.
*/
static int cpu_timer_sample_group(const clockid_t which_clock,
struct task_struct *p,
- union cpu_time_count *cpu)
+ unsigned long long *sample)
{
struct task_cputime cputime;
@@ -632,61 +561,89 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
default:
return -EINVAL;
case CPUCLOCK_PROF:
- cpu->cpu = cputime.utime + cputime.stime;
+ *sample = cputime_to_expires(cputime.utime + cputime.stime);
break;
case CPUCLOCK_VIRT:
- cpu->cpu = cputime.utime;
+ *sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ *sample = cputime.sum_exec_runtime + task_delta_exec(p);
break;
}
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+static void nohz_kick_work_fn(struct work_struct *work)
+{
+ tick_nohz_full_kick_all();
+}
+
+static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
+
+/*
+ * We need the IPIs to be sent from sane process context.
+ * The posix cpu timers are always set with irqs disabled.
+ */
+static void posix_cpu_timer_kick_nohz(void)
+{
+ if (context_tracking_is_enabled())
+ schedule_work(&nohz_kick_work);
+}
+
+bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
+{
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return false;
+
+ if (tsk->signal->cputimer.running)
+ return false;
+
+ return true;
+}
+#else
+static inline void posix_cpu_timer_kick_nohz(void) { }
+#endif
+
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
* If we return TIMER_RETRY, it's necessary to release the timer's lock
* and try again. (This happens when the timer is in the middle of firing.)
*/
-static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
struct itimerspec *new, struct itimerspec *old)
{
+ unsigned long flags;
+ struct sighand_struct *sighand;
struct task_struct *p = timer->it.cpu.task;
- union cpu_time_count old_expires, new_expires, old_incr, val;
+ unsigned long long old_expires, new_expires, old_incr, val;
int ret;
- if (unlikely(p == NULL)) {
- /*
- * Timer refers to a dead task's clock.
- */
- return -ESRCH;
- }
+ WARN_ON_ONCE(p == NULL);
new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
- read_lock(&tasklist_lock);
/*
- * We need the tasklist_lock to protect against reaping that
- * clears p->sighand. If p has just been reaped, we can no
+ * Protect against sighand release/switch in exit/exec and p->cpu_timers
+ * and p->signal->cpu_timers read/write in arm_timer()
+ */
+ sighand = lock_task_sighand(p, &flags);
+ /*
+ * If p has just been reaped, we can no
* longer get any information about it at all.
*/
- if (unlikely(p->sighand == NULL)) {
- read_unlock(&tasklist_lock);
- put_task_struct(p);
- timer->it.cpu.task = NULL;
+ if (unlikely(sighand == NULL)) {
return -ESRCH;
}
/*
* Disarm any old timer after extracting its expiry time.
*/
- BUG_ON(!irqs_disabled());
+ WARN_ON_ONCE(!irqs_disabled());
ret = 0;
old_incr = timer->it.cpu.incr;
- spin_lock(&p->sighand->siglock);
old_expires = timer->it.cpu.expires;
if (unlikely(timer->it.cpu.firing)) {
timer->it.cpu.firing = -1;
@@ -709,7 +666,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
}
if (old) {
- if (old_expires.sched == 0) {
+ if (old_expires == 0) {
old->it_value.tv_sec = 0;
old->it_value.tv_nsec = 0;
} else {
@@ -724,11 +681,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
* new setting.
*/
bump_cpu_timer(timer, val);
- if (cpu_time_before(timer->it_clock, val,
- timer->it.cpu.expires)) {
- old_expires = cpu_time_sub(
- timer->it_clock,
- timer->it.cpu.expires, val);
+ if (val < timer->it.cpu.expires) {
+ old_expires = timer->it.cpu.expires - val;
sample_to_timespec(timer->it_clock,
old_expires,
&old->it_value);
@@ -746,13 +700,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
* disable this firing since we are already reporting
* it as an overrun (thanks to bump_cpu_timer above).
*/
- spin_unlock(&p->sighand->siglock);
- read_unlock(&tasklist_lock);
+ unlock_task_sighand(p, &flags);
goto out;
}
- if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
- cpu_time_add(timer->it_clock, &new_expires, val);
+ if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
+ new_expires += val;
}
/*
@@ -761,14 +714,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
* arm the timer (we'll just fake it for timer_gettime).
*/
timer->it.cpu.expires = new_expires;
- if (new_expires.sched != 0 &&
- cpu_time_before(timer->it_clock, val, new_expires)) {
+ if (new_expires != 0 && val < new_expires) {
arm_timer(timer);
}
- spin_unlock(&p->sighand->siglock);
- read_unlock(&tasklist_lock);
-
+ unlock_task_sighand(p, &flags);
/*
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
@@ -786,8 +736,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
timer->it_overrun_last = 0;
timer->it_overrun = -1;
- if (new_expires.sched != 0 &&
- !cpu_time_before(timer->it_clock, val, new_expires)) {
+ if (new_expires != 0 && !(val < new_expires)) {
/*
* The designated time already passed, so we notify
* immediately, even if the thread never runs to
@@ -802,14 +751,17 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
sample_to_timespec(timer->it_clock,
old_incr, &old->it_interval);
}
+ if (!ret)
+ posix_cpu_timer_kick_nohz();
return ret;
}
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
- union cpu_time_count now;
+ unsigned long long now;
struct task_struct *p = timer->it.cpu.task;
- int clear_dead;
+
+ WARN_ON_ONCE(p == NULL);
/*
* Easy part: convert the reload time.
@@ -817,63 +769,44 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
sample_to_timespec(timer->it_clock,
timer->it.cpu.incr, &itp->it_interval);
- if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */
+ if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
return;
}
- if (unlikely(p == NULL)) {
- /*
- * This task already died and the timer will never fire.
- * In this case, expires is actually the dead value.
- */
- dead:
- sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
- &itp->it_value);
- return;
- }
-
/*
* Sample the clock to take the difference with the expiry time.
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
cpu_clock_sample(timer->it_clock, p, &now);
- clear_dead = p->exit_state;
} else {
- read_lock(&tasklist_lock);
- if (unlikely(p->sighand == NULL)) {
+ struct sighand_struct *sighand;
+ unsigned long flags;
+
+ /*
+ * Protect against sighand release/switch in exit/exec and
+ * also make timer sampling safe if it ends up calling
+ * thread_group_cputime().
+ */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL)) {
/*
* The process has been reaped.
* We can't even collect a sample any more.
* Call the timer disarmed, nothing else to do.
*/
- put_task_struct(p);
- timer->it.cpu.task = NULL;
- timer->it.cpu.expires.sched = 0;
- read_unlock(&tasklist_lock);
- goto dead;
+ timer->it.cpu.expires = 0;
+ sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
+ &itp->it_value);
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
- clear_dead = (unlikely(p->exit_state) &&
- thread_group_empty(p));
+ unlock_task_sighand(p, &flags);
}
- read_unlock(&tasklist_lock);
}
- if (unlikely(clear_dead)) {
- /*
- * We've noticed that the thread is dead, but
- * not yet reaped. Take this opportunity to
- * drop our task ref.
- */
- clear_dead_task(timer, now);
- goto dead;
- }
-
- if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+ if (now < timer->it.cpu.expires) {
sample_to_timespec(timer->it_clock,
- cpu_time_sub(timer->it_clock,
- timer->it.cpu.expires, now),
+ timer->it.cpu.expires - now,
&itp->it_value);
} else {
/*
@@ -885,6 +818,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
}
}
+static unsigned long long
+check_timers_list(struct list_head *timers,
+ struct list_head *firing,
+ unsigned long long curr)
+{
+ int maxfire = 20;
+
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t;
+
+ t = list_first_entry(timers, struct cpu_timer_list, entry);
+
+ if (!--maxfire || curr < t->expires)
+ return t->expires;
+
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ return 0;
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them off
* the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -893,54 +848,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
- int maxfire;
struct list_head *timers = tsk->cpu_timers;
struct signal_struct *const sig = tsk->signal;
+ struct task_cputime *tsk_expires = &tsk->cputime_expires;
+ unsigned long long expires;
unsigned long soft;
- maxfire = 20;
- tsk->cputime_expires.prof_exp = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
- tsk->cputime_expires.prof_exp = t->expires.cpu;
- break;
- }
- t->firing = 1;
- list_move_tail(&t->entry, firing);
- }
+ expires = check_timers_list(timers, firing, prof_ticks(tsk));
+ tsk_expires->prof_exp = expires_to_cputime(expires);
- ++timers;
- maxfire = 20;
- tsk->cputime_expires.virt_exp = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
- tsk->cputime_expires.virt_exp = t->expires.cpu;
- break;
- }
- t->firing = 1;
- list_move_tail(&t->entry, firing);
- }
+ expires = check_timers_list(++timers, firing, virt_ticks(tsk));
+ tsk_expires->virt_exp = expires_to_cputime(expires);
- ++timers;
- maxfire = 20;
- tsk->cputime_expires.sched_exp = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
- tsk->cputime_expires.sched_exp = t->expires.sched;
- break;
- }
- t->firing = 1;
- list_move_tail(&t->entry, firing);
- }
+ tsk_expires->sched_exp = check_timers_list(++timers, firing,
+ tsk->se.sum_exec_runtime);
/*
* Check for the special case thread timers.
@@ -988,7 +909,8 @@ static void stop_process_timers(struct signal_struct *sig)
static u32 onecputick;
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
- cputime_t *expires, cputime_t cur_time, int signo)
+ unsigned long long *expires,
+ unsigned long long cur_time, int signo)
{
if (!it->expires)
return;
@@ -1016,21 +938,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
}
}
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime: The struct to compare.
- *
- * Checks @cputime to see if all fields are zero. Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
-{
- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
- return 1;
- return 0;
-}
-
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1039,9 +946,8 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
static void check_process_timers(struct task_struct *tsk,
struct list_head *firing)
{
- int maxfire;
struct signal_struct *const sig = tsk->signal;
- cputime_t utime, ptime, virt_expires, prof_expires;
+ unsigned long long utime, ptime, virt_expires, prof_expires;
unsigned long long sum_sched_runtime, sched_expires;
struct list_head *timers = sig->cpu_timers;
struct task_cputime cputime;
@@ -1051,52 +957,13 @@ static void check_process_timers(struct task_struct *tsk,
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
- utime = cputime.utime;
- ptime = utime + cputime.stime;
+ utime = cputime_to_expires(cputime.utime);
+ ptime = utime + cputime_to_expires(cputime.stime);
sum_sched_runtime = cputime.sum_exec_runtime;
- maxfire = 20;
- prof_expires = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *tl = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || ptime < tl->expires.cpu) {
- prof_expires = tl->expires.cpu;
- break;
- }
- tl->firing = 1;
- list_move_tail(&tl->entry, firing);
- }
- ++timers;
- maxfire = 20;
- virt_expires = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *tl = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || utime < tl->expires.cpu) {
- virt_expires = tl->expires.cpu;
- break;
- }
- tl->firing = 1;
- list_move_tail(&tl->entry, firing);
- }
-
- ++timers;
- maxfire = 20;
- sched_expires = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *tl = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
- sched_expires = tl->expires.sched;
- break;
- }
- tl->firing = 1;
- list_move_tail(&tl->entry, firing);
- }
+ prof_expires = check_timers_list(timers, firing, ptime);
+ virt_expires = check_timers_list(++timers, firing, utime);
+ sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
/*
* Check for the special case process timers.
@@ -1135,8 +1002,8 @@ static void check_process_timers(struct task_struct *tsk,
}
}
- sig->cputime_expires.prof_exp = prof_expires;
- sig->cputime_expires.virt_exp = virt_expires;
+ sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
+ sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
@@ -1148,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk,
*/
void posix_cpu_timer_schedule(struct k_itimer *timer)
{
+ struct sighand_struct *sighand;
+ unsigned long flags;
struct task_struct *p = timer->it.cpu.task;
- union cpu_time_count now;
+ unsigned long long now;
- if (unlikely(p == NULL))
- /*
- * The task was cleaned up already, no future firings.
- */
- goto out;
+ WARN_ON_ONCE(p == NULL);
/*
* Fetch the current sample and update the timer's expiry time.
@@ -1163,48 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
cpu_clock_sample(timer->it_clock, p, &now);
bump_cpu_timer(timer, now);
- if (unlikely(p->exit_state)) {
- clear_dead_task(timer, now);
+ if (unlikely(p->exit_state))
+ goto out;
+
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (!sighand)
goto out;
- }
- read_lock(&tasklist_lock); /* arm_timer needs it. */
- spin_lock(&p->sighand->siglock);
} else {
- read_lock(&tasklist_lock);
- if (unlikely(p->sighand == NULL)) {
+ /*
+ * Protect arm_timer() and timer sampling in case of call to
+ * thread_group_cputime().
+ */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL)) {
/*
* The process has been reaped.
* We can't even collect a sample any more.
*/
- put_task_struct(p);
- timer->it.cpu.task = p = NULL;
- timer->it.cpu.expires.sched = 0;
- goto out_unlock;
+ timer->it.cpu.expires = 0;
+ goto out;
} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
- /*
- * We've noticed that the thread is dead, but
- * not yet reaped. Take this opportunity to
- * drop our task ref.
- */
- clear_dead_task(timer, now);
- goto out_unlock;
+ unlock_task_sighand(p, &flags);
+ /* Optimizations: if the process is dying, no need to rearm */
+ goto out;
}
- spin_lock(&p->sighand->siglock);
cpu_timer_sample_group(timer->it_clock, p, &now);
bump_cpu_timer(timer, now);
- /* Leave the tasklist_lock locked for the call below. */
+ /* Leave the sighand locked for the call below. */
}
/*
* Now re-arm for the new expiry time.
*/
- BUG_ON(!irqs_disabled());
+ WARN_ON_ONCE(!irqs_disabled());
arm_timer(timer);
- spin_unlock(&p->sighand->siglock);
-
-out_unlock:
- read_unlock(&tasklist_lock);
+ unlock_task_sighand(p, &flags);
+ /* Kick full dynticks CPUs in case they need to tick on the new timer */
+ posix_cpu_timer_kick_nohz();
out:
timer->it_overrun_last = timer->it_overrun;
timer->it_overrun = -1;
@@ -1247,11 +1109,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
if (!task_cputime_zero(&tsk->cputime_expires)) {
struct task_cputime task_sample = {
- .utime = tsk->utime,
- .stime = tsk->stime,
+ .utime = utime,
+ .stime = stime,
.sum_exec_runtime = tsk->se.sum_exec_runtime
};
@@ -1285,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
struct k_itimer *timer, *next;
unsigned long flags;
- BUG_ON(!irqs_disabled());
+ WARN_ON_ONCE(!irqs_disabled());
/*
* The fast path checks that there are no expired thread or thread
@@ -1350,9 +1215,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
cputime_t *newval, cputime_t *oldval)
{
- union cpu_time_count now;
+ unsigned long long now;
- BUG_ON(clock_idx == CPUCLOCK_SCHED);
+ WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
cpu_timer_sample_group(clock_idx, tsk, &now);
if (oldval) {
@@ -1362,17 +1227,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
* it to be absolute.
*/
if (*oldval) {
- if (*oldval <= now.cpu) {
+ if (*oldval <= now) {
/* Just about to fire. */
*oldval = cputime_one_jiffy;
} else {
- *oldval -= now.cpu;
+ *oldval -= now;
}
}
if (!*newval)
- return;
- *newval += now.cpu;
+ goto out;
+ *newval += now;
}
/*
@@ -1389,6 +1254,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
tsk->signal->cputime_expires.virt_exp = *newval;
break;
}
+out:
+ posix_cpu_timer_kick_nohz();
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
@@ -1420,10 +1287,12 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
}
while (!signal_pending(current)) {
- if (timer.it.cpu.expires.sched == 0) {
+ if (timer.it.cpu.expires == 0) {
/*
- * Our timer fired and was reset.
+ * Our timer fired and was reset, below
+ * deletion can not fail.
*/
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return 0;
}
@@ -1441,9 +1310,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ if (!error) {
+ /*
+ * Timer is now unarmed, deletion can not fail.
+ */
+ posix_cpu_timer_del(&timer);
+ }
spin_unlock_irq(&timer.it_lock);
+ while (error == TIMER_RETRY) {
+ /*
+ * We need to handle case when timer was or is in the
+ * middle of firing. In other cases we already freed
+ * resources.
+ */
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ }
+
if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b70..424c2d4265c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
-#include <linux/idr.h>
+#include <linux/hash.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/export.h>
+#include <linux/hashtable.h>
/*
- * Management arrays for POSIX timers. Timers are kept in slab memory
- * Timer ids are allocated by an external routine that keeps track of the
- * id and the timer. The external interface is:
- *
- * void *idr_find(struct idr *idp, int id); to find timer_id <id>
- * int idr_get_new(struct idr *idp, void *ptr); to get a new id and
- * related it to <ptr>
- * void idr_remove(struct idr *idp, int id); to release <id>
- * void idr_init(struct idr *idp); to initialize <idp>
- * which we supply.
- * The idr_get_new *may* call slab for more memory so it must not be
- * called under a spin lock. Likewise idr_remore may release memory
- * (but it may be ok to do this under a lock...).
- * idr_find is just a memory look up and is quite fast. A -1 return
- * indicates that the requested id does not exist.
+ * Management arrays for POSIX timers. Timers are now kept in static hash table
+ * with 512 entries.
+ * Timer ids are allocated by local routine, which selects proper hash head by
+ * key, constructed from current->signal address and per signal struct counter.
+ * This keeps timer ids unique per process, but now they can intersect between
+ * processes.
*/
/*
* Lets keep our timers in a slab cache :-)
*/
static struct kmem_cache *posix_timers_cache;
-static struct idr posix_timers_id;
-static DEFINE_SPINLOCK(idr_lock);
+
+static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
+static DEFINE_SPINLOCK(hash_lock);
/*
* we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
__timr; \
})
+static int hash(struct signal_struct *sig, unsigned int nr)
+{
+ return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+}
+
+static struct k_itimer *__posix_timers_find(struct hlist_head *head,
+ struct signal_struct *sig,
+ timer_t id)
+{
+ struct k_itimer *timer;
+
+ hlist_for_each_entry_rcu(timer, head, t_hash) {
+ if ((timer->it_signal == sig) && (timer->it_id == id))
+ return timer;
+ }
+ return NULL;
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+
+ return __posix_timers_find(head, sig, id);
+}
+
+static int posix_timer_add(struct k_itimer *timer)
+{
+ struct signal_struct *sig = current->signal;
+ int first_free_id = sig->posix_timer_id;
+ struct hlist_head *head;
+ int ret = -ENOENT;
+
+ do {
+ spin_lock(&hash_lock);
+ head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
+ if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ hlist_add_head_rcu(&timer->t_hash, head);
+ ret = sig->posix_timer_id;
+ }
+ if (++sig->posix_timer_id < 0)
+ sig->posix_timer_id = 0;
+ if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
+ /* Loop over all possible ids completed */
+ ret = -EAGAIN;
+ spin_unlock(&hash_lock);
+ } while (ret == -ENOENT);
+ return ret;
+}
+
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
return 0;
}
+static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+{
+ timekeeping_clocktai(tp);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_monotonic_coarse,
};
+ struct k_clock clock_tai = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_tai,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
struct k_clock clock_boottime = {
.clock_getres = hrtimer_get_res,
.clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
+ posix_timers_register_clock(CLOCK_TAI, &clock_tai);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
NULL);
- idr_init(&posix_timers_id);
return 0;
}
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
{
if (it_id_set) {
unsigned long flags;
- spin_lock_irqsave(&idr_lock, flags);
- idr_remove(&posix_timers_id, tmr->it_id);
- spin_unlock_irqrestore(&idr_lock, flags);
+ spin_lock_irqsave(&hash_lock, flags);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock_irqrestore(&hash_lock, flags);
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
@@ -552,22 +610,9 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
- retry:
- if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
- error = -EAGAIN;
- goto out;
- }
- spin_lock_irq(&idr_lock);
- error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
- spin_unlock_irq(&idr_lock);
- if (error) {
- if (error == -EAGAIN)
- goto retry;
- /*
- * Weird looking, but we return EAGAIN if the IDR is
- * full (proper POSIX return value for this)
- */
- error = -EAGAIN;
+ new_timer_id = posix_timer_add(new_timer);
+ if (new_timer_id < 0) {
+ error = new_timer_id;
goto out;
}
@@ -639,8 +684,15 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
+ /*
+ * timer_t could be any type >= int and we want to make sure any
+ * @timer_id outside positive int range fails lookup.
+ */
+ if ((unsigned long long)timer_id > INT_MAX)
+ return NULL;
+
rcu_read_lock();
- timr = idr_find(&posix_timers_id, (int)timer_id);
+ timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
@@ -997,7 +1049,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
err = kc->clock_adj(which_clock, &ktx);
- if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+ if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
return err;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index deb5461e321..9a83d780fac 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,9 +100,35 @@ config PM_SLEEP_SMP
depends on SMP
depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
depends on PM_SLEEP
- select HOTPLUG
select HOTPLUG_CPU
+config PM_AUTOSLEEP
+ bool "Opportunistic sleep"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow the kernel to trigger a system transition into a global sleep
+ state automatically whenever there are no active wakeup sources.
+
+config PM_WAKELOCKS
+ bool "User space wakeup sources interface"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow user space to create, activate and deactivate wakeup source
+ objects with the help of a sysfs-based interface.
+
+config PM_WAKELOCKS_LIMIT
+ int "Maximum number of user space wakeup sources (0 = no limit)"
+ range 0 100000
+ default 100
+ depends on PM_WAKELOCKS
+
+config PM_WAKELOCKS_GC
+ bool "Garbage collector for user space wakeup sources"
+ depends on PM_WAKELOCKS
+ default y
+
config PM_RUNTIME
bool "Run-time PM core functionality"
depends on !IA64_HP_SIM
@@ -148,10 +174,26 @@ config PM_TEST_SUSPEND
You probably want to have your system's RTC driver statically
linked, ensuring that it's available when this test runs.
-config CAN_PM_TRACE
+config PM_SLEEP_DEBUG
def_bool y
depends on PM_DEBUG && PM_SLEEP
+config DPM_WATCHDOG
+ bool "Device suspend/resume watchdog"
+ depends on PM_DEBUG && PSTORE
+ ---help---
+ Sets up a watchdog timer to capture drivers that are
+ locked up attempting to suspend/resume a device.
+ A detected lockup causes system panic with message
+ captured in pstore device for inspection in subsequent
+ boot session.
+
+config DPM_WATCHDOG_TIMEOUT
+ int "Watchdog timeout in seconds"
+ range 1 120
+ default 12
+ depends on DPM_WATCHDOG
+
config PM_TRACE
bool
help
@@ -169,7 +211,7 @@ config PM_TRACE
config PM_TRACE_RTC
bool "Suspend/resume event tracing"
- depends on CAN_PM_TRACE
+ depends on PM_SLEEP_DEBUG
depends on X86
select PM_TRACE
---help---
@@ -215,8 +257,7 @@ config ARCH_HAS_OPP
bool
config PM_OPP
- bool "Operating Performance Point (OPP) Layer library"
- depends on ARCH_HAS_OPP
+ bool
---help---
SOCs have a standard set of tuples consisting of frequency and
voltage pairs that the device will support per voltage domain. This
@@ -236,6 +277,30 @@ config PM_GENERIC_DOMAINS
bool
depends on PM
+config WQ_POWER_EFFICIENT_DEFAULT
+ bool "Enable workqueue power-efficient mode by default"
+ depends on PM
+ default n
+ help
+ Per-cpu workqueues are generally preferred because they show
+ better performance thanks to cache locality; unfortunately,
+ per-cpu workqueues tend to be more power hungry than unbound
+ workqueues.
+
+ Enabling workqueue.power_efficient kernel parameter makes the
+ per-cpu workqueues which were observed to contribute
+ significantly to power consumption unbound, leading to measurably
+ lower power usage at the cost of small performance overhead.
+
+ This config option determines whether workqueue.power_efficient
+ is enabled by default.
+
+ If in doubt, say N.
+
+config PM_GENERIC_DOMAINS_SLEEP
+ def_bool y
+ depends on PM_SLEEP && PM_GENERIC_DOMAINS
+
config PM_GENERIC_DOMAINS_RUNTIME
def_bool y
depends on PM_RUNTIME && PM_GENERIC_DOMAINS
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba..29472bff11e 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,12 +1,15 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
-obj-$(CONFIG_PM) += main.o qos.o
+obj-y += qos.o
+obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
+obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
+obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 00000000000..9012ecf7b81
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,128 @@
+/*
+ * kernel/power/autosleep.c
+ *
+ * Opportunistic sleep support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ */
+
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/pm_wakeup.h>
+
+#include "power.h"
+
+static suspend_state_t autosleep_state;
+static struct workqueue_struct *autosleep_wq;
+/*
+ * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
+ * is active, otherwise a deadlock with try_to_suspend() is possible.
+ * Alternatively mutex_lock_interruptible() can be used. This will then fail
+ * if an auto_sleep cycle tries to freeze processes.
+ */
+static DEFINE_MUTEX(autosleep_lock);
+static struct wakeup_source *autosleep_ws;
+
+static void try_to_suspend(struct work_struct *work)
+{
+ unsigned int initial_count, final_count;
+
+ if (!pm_get_wakeup_count(&initial_count, true))
+ goto out;
+
+ mutex_lock(&autosleep_lock);
+
+ if (!pm_save_wakeup_count(initial_count) ||
+ system_state != SYSTEM_RUNNING) {
+ mutex_unlock(&autosleep_lock);
+ goto out;
+ }
+
+ if (autosleep_state == PM_SUSPEND_ON) {
+ mutex_unlock(&autosleep_lock);
+ return;
+ }
+ if (autosleep_state >= PM_SUSPEND_MAX)
+ hibernate();
+ else
+ pm_suspend(autosleep_state);
+
+ mutex_unlock(&autosleep_lock);
+
+ if (!pm_get_wakeup_count(&final_count, false))
+ goto out;
+
+ /*
+ * If the wakeup occured for an unknown reason, wait to prevent the
+ * system from trying to suspend and waking up in a tight loop.
+ */
+ if (final_count == initial_count)
+ schedule_timeout_uninterruptible(HZ / 2);
+
+ out:
+ queue_up_suspend_work();
+}
+
+static DECLARE_WORK(suspend_work, try_to_suspend);
+
+void queue_up_suspend_work(void)
+{
+ if (autosleep_state > PM_SUSPEND_ON)
+ queue_work(autosleep_wq, &suspend_work);
+}
+
+suspend_state_t pm_autosleep_state(void)
+{
+ return autosleep_state;
+}
+
+int pm_autosleep_lock(void)
+{
+ return mutex_lock_interruptible(&autosleep_lock);
+}
+
+void pm_autosleep_unlock(void)
+{
+ mutex_unlock(&autosleep_lock);
+}
+
+int pm_autosleep_set_state(suspend_state_t state)
+{
+
+#ifndef CONFIG_HIBERNATION
+ if (state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+#endif
+
+ __pm_stay_awake(autosleep_ws);
+
+ mutex_lock(&autosleep_lock);
+
+ autosleep_state = state;
+
+ __pm_relax(autosleep_ws);
+
+ if (state > PM_SUSPEND_ON) {
+ pm_wakep_autosleep_enabled(true);
+ queue_up_suspend_work();
+ } else {
+ pm_wakep_autosleep_enabled(false);
+ }
+
+ mutex_unlock(&autosleep_lock);
+ return 0;
+}
+
+int __init pm_autosleep_init(void)
+{
+ autosleep_ws = wakeup_source_register("autosleep");
+ if (!autosleep_ws)
+ return -ENOMEM;
+
+ autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
+ if (autosleep_wq)
+ return 0;
+
+ wakeup_source_unregister(autosleep_ws);
+ return -ENOMEM;
+}
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index d09dd10c5a5..9a58bc25881 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector,
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio->bi_end_io = end_swap_bio_read;
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b..aba9c545a0e 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,18 +4,133 @@
* Originally from swsusp.
*/
+#include <linux/console.h>
#include <linux/vt_kern.h>
#include <linux/kbd_kern.h>
#include <linux/vt.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include "power.h"
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
+static DEFINE_MUTEX(vt_switch_mutex);
+
+struct pm_vt_switch {
+ struct list_head head;
+ struct device *dev;
+ bool required;
+};
+
+static LIST_HEAD(pm_vt_switch_list);
+
+
+/**
+ * pm_vt_switch_required - indicate VT switch at suspend requirements
+ * @dev: device
+ * @required: if true, caller needs VT switch at suspend/resume time
+ *
+ * The different console drivers may or may not require VT switches across
+ * suspend/resume, depending on how they handle restoring video state and
+ * what may be running.
+ *
+ * Drivers can indicate support for switchless suspend/resume, which can
+ * save time and flicker, by using this routine and passing 'false' as
+ * the argument. If any loaded driver needs VT switching, or the
+ * no_console_suspend argument has been passed on the command line, VT
+ * switches will occur.
+ */
+void pm_vt_switch_required(struct device *dev, bool required)
+{
+ struct pm_vt_switch *entry, *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ /* already registered, update requirement */
+ tmp->required = required;
+ goto out;
+ }
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out;
+
+ entry->required = required;
+ entry->dev = dev;
+
+ list_add(&entry->head, &pm_vt_switch_list);
+out:
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_required);
+
+/**
+ * pm_vt_switch_unregister - stop tracking a device's VT switching needs
+ * @dev: device
+ *
+ * Remove @dev from the vt switch list.
+ */
+void pm_vt_switch_unregister(struct device *dev)
+{
+ struct pm_vt_switch *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ list_del(&tmp->head);
+ kfree(tmp);
+ break;
+ }
+ }
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_unregister);
+
+/*
+ * There are three cases when a VT switch on suspend/resume are required:
+ * 1) no driver has indicated a requirement one way or another, so preserve
+ * the old behavior
+ * 2) console suspend is disabled, we want to see debug messages across
+ * suspend/resume
+ * 3) any registered driver indicates it needs a VT switch
+ *
+ * If none of these conditions is present, meaning we have at least one driver
+ * that doesn't need the switch, and none that do, we can avoid it to make
+ * resume look a little prettier (and suspend too, but that's usually hidden,
+ * e.g. when closing the lid on a laptop).
+ */
+static bool pm_vt_switch(void)
+{
+ struct pm_vt_switch *entry;
+ bool ret = true;
+
+ mutex_lock(&vt_switch_mutex);
+ if (list_empty(&pm_vt_switch_list))
+ goto out;
+
+ if (!console_suspend_enabled)
+ goto out;
+
+ list_for_each_entry(entry, &pm_vt_switch_list, head) {
+ if (entry->required)
+ goto out;
+ }
+
+ ret = false;
+out:
+ mutex_unlock(&vt_switch_mutex);
+ return ret;
+}
+
int pm_prepare_console(void)
{
+ if (!pm_vt_switch())
+ return 0;
+
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
@@ -26,6 +141,9 @@ int pm_prepare_console(void)
void pm_restore_console(void)
{
+ if (!pm_vt_switch())
+ return;
+
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d2887033..fcc2611d3f1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
+ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
*
* This file is released under the GPLv2.
*/
@@ -16,7 +17,6 @@
#include <linux/string.h>
#include <linux/device.h>
#include <linux/async.h>
-#include <linux/kmod.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -26,25 +26,31 @@
#include <linux/freezer.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
-#include <scsi/scsi_scan.h>
+#include <linux/ctype.h>
+#include <linux/genhd.h>
+#include <trace/events/power.h>
#include "power.h"
static int nocompress;
static int noresume;
+static int nohibernate;
static int resume_wait;
-static int resume_delay;
+static unsigned int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
-int in_suspend __nosavedata;
+__visible int in_suspend __nosavedata;
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
HIBERNATION_SHUTDOWN,
HIBERNATION_REBOOT,
+#ifdef CONFIG_SUSPEND
+ HIBERNATION_SUSPEND,
+#endif
/* keep last */
__HIBERNATION_AFTER_LAST
};
@@ -57,6 +63,11 @@ bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
+bool hibernation_available(void)
+{
+ return (nohibernate == 0);
+}
+
/**
* hibernation_set_ops - Set the global hibernate operations.
* @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -78,6 +89,7 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
unlock_system_sleep();
}
+EXPORT_SYMBOL_GPL(hibernation_set_ops);
static bool entering_platform_hibernation;
@@ -223,19 +235,23 @@ static void platform_recover(int platform_mode)
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
- s64 elapsed_centisecs64;
- int centisecs;
- int k;
- int kps;
+ u64 elapsed_centisecs64;
+ unsigned int centisecs;
+ unsigned int k;
+ unsigned int kps;
elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ /*
+ * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
+ * it is obvious enough for what went wrong.
+ */
do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
+ printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
msg, k,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
@@ -245,8 +261,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
* create_image - Create a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
*
- * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
- * and execute the drivers' .thaw_noirq() callbacks.
+ * Execute device drivers' "late" and "noirq" freeze callbacks, create a
+ * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
*
* Control reappears in this routine after the subsequent restore.
*/
@@ -254,7 +270,7 @@ static int create_image(int platform_mode)
{
int error;
- error = dpm_suspend_noirq(PMSG_FREEZE);
+ error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
@@ -283,16 +299,18 @@ static int create_image(int platform_mode)
in_suspend = 1;
save_processor_state();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
/* Restore control flow magically appears here */
restore_processor_state();
- if (!in_suspend) {
+ if (!in_suspend)
events_check_enabled = false;
- platform_leave(platform_mode);
- }
+
+ platform_leave(platform_mode);
Power_up:
syscore_resume();
@@ -306,7 +324,7 @@ static int create_image(int platform_mode)
Platform_finish:
platform_finish(platform_mode);
- dpm_resume_noirq(in_suspend ?
+ dpm_resume_start(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
return error;
@@ -343,16 +361,17 @@ int hibernation_snapshot(int platform_mode)
* successful freezer test.
*/
freezer_test_done = true;
- goto Cleanup;
+ goto Thaw;
}
error = dpm_prepare(PMSG_FREEZE);
if (error) {
dpm_complete(PMSG_RECOVER);
- goto Cleanup;
+ goto Thaw;
}
suspend_console();
+ ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -378,6 +397,7 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
+ ftrace_start();
resume_console();
dpm_complete(msg);
@@ -385,6 +405,8 @@ int hibernation_snapshot(int platform_mode)
platform_end(platform_mode);
return error;
+ Thaw:
+ thaw_kernel_threads();
Cleanup:
swsusp_free();
goto Close;
@@ -394,16 +416,16 @@ int hibernation_snapshot(int platform_mode)
* resume_target_kernel - Restore system state from a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
*
- * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
- * highmem that have not been restored yet from the image and run the low-level
- * code that will restore the remaining contents of memory and switch to the
- * just restored target kernel.
+ * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
+ * contents of highmem that have not been restored yet from the image and run
+ * the low-level code that will restore the remaining contents of memory and
+ * switch to the just restored target kernel.
*/
static int resume_target_kernel(bool platform_mode)
{
int error;
- error = dpm_suspend_noirq(PMSG_QUIESCE);
+ error = dpm_suspend_end(PMSG_QUIESCE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting resume\n");
@@ -460,7 +482,7 @@ static int resume_target_kernel(bool platform_mode)
Cleanup:
platform_restore_cleanup(platform_mode);
- dpm_resume_noirq(PMSG_RECOVER);
+ dpm_resume_start(PMSG_RECOVER);
return error;
}
@@ -478,6 +500,7 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
+ ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -485,6 +508,7 @@ int hibernation_restore(int platform_mode)
dpm_resume_end(PMSG_RECOVER);
}
pm_restore_gfp_mask();
+ ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -511,6 +535,7 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
+ ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -518,7 +543,7 @@ int hibernation_platform_enter(void)
goto Resume_devices;
}
- error = dpm_suspend_noirq(PMSG_HIBERNATE);
+ error = dpm_suspend_end(PMSG_HIBERNATE);
if (error)
goto Resume_devices;
@@ -549,11 +574,12 @@ int hibernation_platform_enter(void)
Platform_finish:
hibernation_ops->finish();
- dpm_resume_noirq(PMSG_RESTORE);
+ dpm_resume_start(PMSG_RESTORE);
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
+ ftrace_start();
resume_console();
Close:
@@ -571,6 +597,10 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
+#ifdef CONFIG_SUSPEND
+ int error;
+#endif
+
switch (hibernation_mode) {
case HIBERNATION_REBOOT:
kernel_restart(NULL);
@@ -578,8 +608,28 @@ static void power_down(void)
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
case HIBERNATION_SHUTDOWN:
- kernel_power_off();
+ if (pm_power_off)
+ kernel_power_off();
break;
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ if (error) {
+ if (hibernation_ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+ power_down();
+ }
+ /*
+ * Restore swap signature.
+ */
+ error = swsusp_unmark();
+ if (error)
+ printk(KERN_ERR "PM: Swap will be unusable! "
+ "Try swapon -a.\n");
+ return;
+#endif
}
kernel_halt();
/*
@@ -587,7 +637,8 @@ static void power_down(void)
* corruption after resume.
*/
printk(KERN_CRIT "PM: Please power down manually\n");
- while(1);
+ while (1)
+ cpu_relax();
}
/**
@@ -597,6 +648,11 @@ int hibernate(void)
{
int error;
+ if (!hibernation_available()) {
+ pr_debug("PM: Hibernation not available.\n");
+ return -EPERM;
+ }
+
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -609,30 +665,23 @@ int hibernate(void)
if (error)
goto Exit;
- error = usermodehelper_disable();
- if (error)
- goto Exit;
-
- /* Allocate memory management structures */
- error = create_basic_memory_bitmaps();
- if (error)
- goto Exit;
-
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
error = freeze_processes();
if (error)
- goto Finish;
+ goto Exit;
- error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
+ lock_device_hotplug();
+ /* Allocate memory management structures */
+ error = create_basic_memory_bitmaps();
if (error)
goto Thaw;
- if (freezer_test_done) {
- freezer_test_done = false;
- goto Thaw;
- }
+
+ error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
+ if (error || freezer_test_done)
+ goto Free_bitmaps;
if (in_suspend) {
unsigned int flags = 0;
@@ -655,11 +704,14 @@ int hibernate(void)
pr_debug("PM: Image restored successfully.\n");
}
+ Free_bitmaps:
+ free_basic_memory_bitmaps();
Thaw:
+ unlock_device_hotplug();
thaw_processes();
- Finish:
- free_basic_memory_bitmaps();
- usermodehelper_enable();
+
+ /* Don't bother checking whether freezer_test_done is true */
+ freezer_test_done = false;
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
@@ -693,7 +745,7 @@ static int software_resume(void)
/*
* If the user said "noresume".. bail out early.
*/
- if (noresume)
+ if (noresume || !hibernation_available())
return 0;
/*
@@ -726,6 +778,17 @@ static int software_resume(void)
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
+
+ /*
+ * name_to_dev_t is ineffective to verify parition if resume_file is in
+ * integer format. (e.g. major:minor)
+ */
+ if (isdigit(resume_file[0]) && resume_wait) {
+ int partno;
+ while (!get_gendisk(swsusp_resume_device, &partno))
+ msleep(10);
+ }
+
if (!swsusp_resume_device) {
/*
* Some device discovery might still be in progress; we need
@@ -739,13 +802,6 @@ static int software_resume(void)
async_synchronize_full();
}
- /*
- * We can't depend on SCSI devices being available after loading
- * one of their modules until scsi_complete_async_scans() is
- * called and the resume device usually is a SCSI one.
- */
- scsi_complete_async_scans();
-
swsusp_resume_device = name_to_dev_t(resume_file);
if (!swsusp_resume_device) {
error = -ENODEV;
@@ -772,27 +828,20 @@ static int software_resume(void)
pm_prepare_console();
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
if (error)
- goto close_finish;
-
- error = usermodehelper_disable();
- if (error)
- goto close_finish;
-
- error = create_basic_memory_bitmaps();
- if (error) {
- usermodehelper_enable();
- goto close_finish;
- }
+ goto Close_Finish;
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
- if (error) {
- swsusp_close(FMODE_READ);
- goto Done;
- }
+ if (error)
+ goto Close_Finish;
pr_debug("PM: Loading hibernation image.\n");
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Thaw;
+
error = swsusp_read(&flags);
swsusp_close(FMODE_READ);
if (!error)
@@ -800,10 +849,10 @@ static int software_resume(void)
printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
swsusp_free();
- thaw_processes();
- Done:
free_basic_memory_bitmaps();
- usermodehelper_enable();
+ Thaw:
+ unlock_device_hotplug();
+ thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
pm_restore_console();
@@ -813,18 +862,21 @@ static int software_resume(void)
mutex_unlock(&pm_mutex);
pr_debug("PM: Hibernation image not present or could not be loaded.\n");
return error;
-close_finish:
+ Close_Finish:
swsusp_close(FMODE_READ);
goto Finish;
}
-late_initcall(software_resume);
+late_initcall_sync(software_resume);
static const char * const hibernation_modes[] = {
[HIBERNATION_PLATFORM] = "platform",
[HIBERNATION_SHUTDOWN] = "shutdown",
[HIBERNATION_REBOOT] = "reboot",
+#ifdef CONFIG_SUSPEND
+ [HIBERNATION_SUSPEND] = "suspend",
+#endif
};
/*
@@ -859,12 +911,18 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
int i;
char *start = buf;
+ if (!hibernation_available())
+ return sprintf(buf, "[disabled]\n");
+
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
continue;
switch (i) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+#endif
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -890,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
char *p;
int mode = HIBERNATION_INVALID;
+ if (!hibernation_available())
+ return -EPERM;
+
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
@@ -905,6 +966,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
switch (mode) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+#endif
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
@@ -935,16 +999,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- unsigned int maj, min;
dev_t res;
- int ret = -EINVAL;
+ int len = n;
+ char *name;
- if (sscanf(buf, "%u:%u", &maj, &min) != 2)
- goto out;
+ if (len && buf[len-1] == '\n')
+ len--;
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
- res = MKDEV(maj,min);
- if (maj != MAJOR(res) || min != MINOR(res))
- goto out;
+ res = name_to_dev_t(name);
+ kfree(name);
+ if (!res)
+ return -EINVAL;
lock_system_sleep();
swsusp_resume_device = res;
@@ -952,9 +1020,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
printk(KERN_INFO "PM: Starting manual resume from disk\n");
noresume = 0;
software_resume();
- ret = n;
- out:
- return ret;
+ return n;
}
power_attr(resume);
@@ -1052,6 +1118,10 @@ static int __init hibernate_setup(char *str)
noresume = 1;
else if (!strncmp(str, "nocompress", 10))
nocompress = 1;
+ else if (!strncmp(str, "no", 2)) {
+ noresume = 1;
+ nohibernate = 1;
+ }
return 1;
}
@@ -1069,13 +1139,30 @@ static int __init resumewait_setup(char *str)
static int __init resumedelay_setup(char *str)
{
- resume_delay = simple_strtoul(str, NULL, 0);
+ int rc = kstrtouint(str, 0, &resume_delay);
+
+ if (rc)
+ return rc;
+ return 1;
+}
+
+static int __init nohibernate_setup(char *str)
+{
+ noresume = 1;
+ nohibernate = 1;
return 1;
}
+static int __init kaslr_nohibernate_setup(char *str)
+{
+ return nohibernate_setup(str);
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
__setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
+__setup("nohibernate", nohibernate_setup);
+__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a1..8e90f330f13 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
{
unsigned long val;
- if (strict_strtoul(buf, 10, &val))
+ if (kstrtoul(buf, 10, &val))
return -EINVAL;
if (val > 1)
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
last_errno %= REC_FAILED_NUM;
last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
last_step %= REC_FAILED_NUM;
- seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
- "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+ seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+ "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
"success", suspend_stats.success,
"fail", suspend_stats.fail,
"failed_freeze", suspend_stats.failed_freeze,
"failed_prepare", suspend_stats.failed_prepare,
"failed_suspend", suspend_stats.failed_suspend,
+ "failed_suspend_late",
+ suspend_stats.failed_suspend_late,
"failed_suspend_noirq",
suspend_stats.failed_suspend_noirq,
"failed_resume", suspend_stats.failed_resume,
+ "failed_resume_early",
+ suspend_stats.failed_resume_early,
"failed_resume_noirq",
suspend_stats.failed_resume_noirq);
seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -231,76 +235,130 @@ late_initcall(pm_debugfs_init);
#endif /* CONFIG_PM_SLEEP */
+#ifdef CONFIG_PM_SLEEP_DEBUG
+/*
+ * pm_print_times: print time taken by devices to suspend and resume.
+ *
+ * show() returns whether printing of suspend and resume times is enabled.
+ * store() accepts 0 or 1. 0 disables printing and 1 enables it.
+ */
+bool pm_print_times_enabled;
+
+static ssize_t pm_print_times_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_print_times_enabled);
+}
+
+static ssize_t pm_print_times_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_print_times_enabled = !!val;
+ return n;
+}
+
+power_attr(pm_print_times);
+
+static inline void pm_print_times_init(void)
+{
+ pm_print_times_enabled = !!initcall_debug;
+}
+#else /* !CONFIG_PP_SLEEP_DEBUG */
+static inline void pm_print_times_init(void) {}
+#endif /* CONFIG_PM_SLEEP_DEBUG */
+
struct kobject *power_kobj;
/**
- * state - control system power state.
+ * state - control system sleep states.
*
- * show() returns what states are supported, which is hard-coded to
- * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
- * 'disk' (Suspend-to-Disk).
+ * show() returns available sleep state labels, which may be "mem", "standby",
+ * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
+ * description of what they mean.
*
- * store() accepts one of those strings, translates it into the
- * proper enumerated value, and initiates a suspend transition.
+ * store() accepts one of those strings, translates it into the proper
+ * enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
- int i;
+ suspend_state_t i;
+
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (pm_states[i].state)
+ s += sprintf(s,"%s ", pm_states[i].label);
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (pm_states[i] && valid_state(i))
- s += sprintf(s,"%s ", pm_states[i]);
- }
#endif
-#ifdef CONFIG_HIBERNATION
- s += sprintf(s, "%s\n", "disk");
-#else
+ if (hibernation_available())
+ s += sprintf(s, "disk ");
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
-#endif
return (s - buf);
}
-static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
+static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_STANDBY;
- const char * const *s;
+ suspend_state_t state = PM_SUSPEND_MIN;
+ struct pm_sleep_state *s;
#endif
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- /* First, check if we are requested to hibernate */
- if (len == 4 && !strncmp(buf, "disk", len)) {
- error = hibernate();
- goto Exit;
- }
+ /* Check hibernation first. */
+ if (len == 4 && !strncmp(buf, "disk", len))
+ return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
- if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
- break;
- }
- if (state < PM_SUSPEND_MAX && *s) {
- error = enter_state(state);
- if (error) {
- suspend_stats.fail++;
- dpm_save_failed_errno(error);
- } else
- suspend_stats.success++;
- }
+ for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
+ if (s->state && len == strlen(s->label)
+ && !strncmp(buf, s->label, len))
+ return s->state;
#endif
- Exit:
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_state(buf, n);
+ if (state < PM_SUSPEND_MAX)
+ error = pm_suspend(state);
+ else if (state == PM_SUSPEND_MAX)
+ error = hibernate();
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
return error ? error : n;
}
@@ -341,7 +399,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
{
unsigned int val;
- return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
+ return pm_get_wakeup_count(&val, true) ?
+ sprintf(buf, "%u\n", val) : -EINTR;
}
static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -349,15 +408,108 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
const char *buf, size_t n)
{
unsigned int val;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+ error = -EINVAL;
if (sscanf(buf, "%u", &val) == 1) {
if (pm_save_wakeup_count(val))
- return n;
+ error = n;
+ else
+ pm_print_active_wakeup_sources();
}
- return -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error;
}
power_attr(wakeup_count);
+
+#ifdef CONFIG_PM_AUTOSLEEP
+static ssize_t autosleep_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ suspend_state_t state = pm_autosleep_state();
+
+ if (state == PM_SUSPEND_ON)
+ return sprintf(buf, "off\n");
+
+#ifdef CONFIG_SUSPEND
+ if (state < PM_SUSPEND_MAX)
+ return sprintf(buf, "%s\n", pm_states[state].state ?
+ pm_states[state].label : "error");
+#endif
+#ifdef CONFIG_HIBERNATION
+ return sprintf(buf, "disk\n");
+#else
+ return sprintf(buf, "error");
+#endif
+}
+
+static ssize_t autosleep_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state = decode_state(buf, n);
+ int error;
+
+ if (state == PM_SUSPEND_ON
+ && strcmp(buf, "off") && strcmp(buf, "off\n"))
+ return -EINVAL;
+
+ error = pm_autosleep_set_state(state);
+ return error ? error : n;
+}
+
+power_attr(autosleep);
+#endif /* CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+static ssize_t wake_lock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, true);
+}
+
+static ssize_t wake_lock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_lock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_lock);
+
+static ssize_t wake_unlock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, false);
+}
+
+static ssize_t wake_unlock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_unlock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_unlock);
+
+#endif /* CONFIG_PM_WAKELOCKS */
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_TRACE
@@ -377,6 +529,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
if (sscanf(buf, "%d", &val) == 1) {
pm_trace_enabled = !!val;
+ if (pm_trace_enabled) {
+ pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n"
+ "PM: Correct system time has to be restored manually after resume.\n");
+ }
return n;
}
return -EINVAL;
@@ -402,6 +558,30 @@ power_attr(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
+#ifdef CONFIG_FREEZER
+static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", freeze_timeout_msecs);
+}
+
+static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ freeze_timeout_msecs = val;
+ return n;
+}
+
+power_attr(pm_freeze_timeout);
+
+#endif /* CONFIG_FREEZER*/
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -411,9 +591,22 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
&wakeup_count_attr.attr,
+#ifdef CONFIG_PM_AUTOSLEEP
+ &autosleep_attr.attr,
+#endif
+#ifdef CONFIG_PM_WAKELOCKS
+ &wake_lock_attr.attr,
+ &wake_unlock_attr.attr,
+#endif
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
+#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_print_times_attr.attr,
+#endif
+#endif
+#ifdef CONFIG_FREEZER
+ &pm_freeze_timeout_attr.attr,
#endif
NULL,
};
@@ -446,7 +639,11 @@ static int __init pm_init(void)
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
- return sysfs_create_group(power_kobj, &attr_group);
+ error = sysfs_create_group(power_kobj, &attr_group);
+ if (error)
+ return error;
+ pm_print_times_init();
+ return pm_autosleep_init();
}
core_initcall(pm_init);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 21724eee520..c60f13b5270 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -2,6 +2,7 @@
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
+#include <linux/compiler.h>
struct swsusp_info {
struct new_utsname uts;
@@ -11,7 +12,7 @@ struct swsusp_info {
unsigned long image_pages;
unsigned long pages;
unsigned long size;
-} __attribute__((aligned(PAGE_SIZE)));
+} __aligned(PAGE_SIZE);
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
@@ -49,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
+asmlinkage int swsusp_save(void);
+
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
@@ -156,6 +159,9 @@ extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
extern void swsusp_close(fmode_t);
+#ifdef CONFIG_SUSPEND
+extern int swsusp_unmark(void);
+#endif
/* kernel/power/block_io.c */
extern struct block_device *hib_resume_bdev;
@@ -172,19 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
+struct pm_sleep_state {
+ const char *label;
+ suspend_state_t state;
+};
+
/* kernel/power/suspend.c */
-extern const char *const pm_states[];
+extern struct pm_sleep_state pm_states[];
-extern bool valid_state(suspend_state_t state);
extern int suspend_devices_and_enter(suspend_state_t state);
-extern int enter_state(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
-static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
-static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
#ifdef CONFIG_PM_TEST_SUSPEND
@@ -234,16 +241,14 @@ static inline int suspend_freeze_processes(void)
int error;
error = freeze_processes();
-
/*
* freeze_processes() automatically thaws every task if freezing
* fails. So we need not do anything extra upon error.
*/
if (error)
- goto Finish;
+ return error;
error = freeze_kernel_threads();
-
/*
* freeze_kernel_threads() thaws only kernel threads upon freezing
* failure. So we have to thaw the userspace tasks ourselves.
@@ -251,7 +256,6 @@ static inline int suspend_freeze_processes(void)
if (error)
thaw_processes();
- Finish:
return error;
}
@@ -269,3 +273,30 @@ static inline void suspend_thaw_processes(void)
{
}
#endif
+
+#ifdef CONFIG_PM_AUTOSLEEP
+
+/* kernel/power/autosleep.c */
+extern int pm_autosleep_init(void);
+extern int pm_autosleep_lock(void);
+extern void pm_autosleep_unlock(void);
+extern suspend_state_t pm_autosleep_state(void);
+extern int pm_autosleep_set_state(suspend_state_t state);
+
+#else /* !CONFIG_PM_AUTOSLEEP */
+
+static inline int pm_autosleep_init(void) { return 0; }
+static inline int pm_autosleep_lock(void) { return 0; }
+static inline void pm_autosleep_unlock(void) {}
+static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
+
+#endif /* !CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+
+/* kernel/power/wakelock.c */
+extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
+extern int pm_wake_lock(const char *buf);
+extern int pm_wake_unlock(const char *buf);
+
+#endif /* !CONFIG_PM_WAKELOCKS */
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index d52359374e8..7ef6866b521 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,12 +32,12 @@ static void handle_poweroff(int key)
static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
- .help_msg = "powerOff",
+ .help_msg = "poweroff(o)",
.action_msg = "Power Off",
.enable_mask = SYSRQ_ENABLE_BOOT,
};
-static int pm_sysrq_init(void)
+static int __init pm_sysrq_init(void)
{
register_sysrq_key('o', &sysrq_poweroff_op);
return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7e426459e60..4ee194eb524 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,11 +16,13 @@
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/workqueue.h>
+#include <linux/kmod.h>
+#include <trace/events/power.h>
/*
* Timeout for stopping processes
*/
-#define TIMEOUT (20 * HZ)
+unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
@@ -29,13 +31,14 @@ static int try_to_freeze_tasks(bool user_only)
unsigned int todo;
bool wq_busy = false;
struct timeval start, end;
- u64 elapsed_csecs64;
- unsigned int elapsed_csecs;
+ u64 elapsed_msecs64;
+ unsigned int elapsed_msecs;
bool wakeup = false;
+ int sleep_usecs = USEC_PER_MSEC;
do_gettimeofday(&start);
- end_time = jiffies + TIMEOUT;
+ end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
if (!user_only)
freeze_workqueues_begin();
@@ -47,20 +50,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- /*
- * Now that we've done set_freeze_flag, don't
- * perturb a task in TASK_STOPPED or TASK_TRACED.
- * It is "frozen enough". If the task does wake
- * up, it will immediately call try_to_freeze.
- *
- * Because freeze_task() goes through p's
- * scheduler lock after setting TIF_FREEZE, it's
- * guaranteed that either we see TASK_RUNNING or
- * try_to_stop() after schedule() in ptrace/signal
- * stop sees TIF_FREEZE.
- */
- if (!task_is_stopped_or_traced(p) &&
- !freezer_should_skip(p))
+ if (!freezer_should_skip(p))
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
@@ -80,34 +70,39 @@ static int try_to_freeze_tasks(bool user_only)
/*
* We need to retry, but first give the freezing tasks some
- * time to enter the regrigerator.
+ * time to enter the refrigerator. Start with an initial
+ * 1 ms sleep followed by exponential backoff until 8 ms.
*/
- msleep(10);
+ usleep_range(sleep_usecs / 2, sleep_usecs);
+ if (sleep_usecs < 8 * USEC_PER_MSEC)
+ sleep_usecs *= 2;
}
do_gettimeofday(&end);
- elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
- do_div(elapsed_csecs64, NSEC_PER_SEC / 100);
- elapsed_csecs = elapsed_csecs64;
+ elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
+ do_div(elapsed_msecs64, NSEC_PER_MSEC);
+ elapsed_msecs = elapsed_msecs64;
if (todo) {
printk("\n");
- printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
+ printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
"(%d tasks refusing to freeze, wq_busy=%d):\n",
wakeup ? "aborted" : "failed",
- elapsed_csecs / 100, elapsed_csecs % 100,
+ elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- if (!wakeup && !freezer_should_skip(p) &&
- p != current && freezing(p) && !frozen(p))
- sched_show_task(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
+ if (!wakeup) {
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (p != current && !freezer_should_skip(p)
+ && freezing(p) && !frozen(p))
+ sched_show_task(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ }
} else {
- printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
- elapsed_csecs % 100);
+ printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
+ elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -115,6 +110,8 @@ static int try_to_freeze_tasks(bool user_only)
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
+ * The current thread will not be frozen. The same process that calls
+ * freeze_processes must later call thaw_processes.
*
* On success, returns 0. On failure, -errno and system is fully thawed.
*/
@@ -122,6 +119,13 @@ int freeze_processes(void)
{
int error;
+ error = __usermodehelper_disable(UMH_FREEZING);
+ if (error)
+ return error;
+
+ /* Make sure this task doesn't get frozen */
+ current->flags |= PF_SUSPEND_TASK;
+
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
@@ -130,6 +134,7 @@ int freeze_processes(void)
error = try_to_freeze_tasks(true);
if (!error) {
printk("done.");
+ __usermodehelper_set_disable_depth(UMH_DISABLED);
oom_killer_disable();
}
printk("\n");
@@ -169,7 +174,9 @@ int freeze_kernel_threads(void)
void thaw_processes(void)
{
struct task_struct *g, *p;
+ struct task_struct *curr = current;
+ trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
atomic_dec(&system_freezing_cnt);
pm_freezing = false;
@@ -179,16 +186,25 @@ void thaw_processes(void)
printk("Restarting tasks ... ");
+ __usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
read_lock(&tasklist_lock);
do_each_thread(g, p) {
+ /* No other threads should have PF_SUSPEND_TASK set */
+ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
__thaw_task(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+ WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
+ curr->flags &= ~PF_SUSPEND_TASK;
+
+ usermodehelper_enable();
+
schedule();
printk("done.\n");
+ trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
void thaw_kernel_threads(void)
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417..884b7705886 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -44,6 +44,7 @@
#include <linux/uaccess.h>
#include <linux/export.h>
+#include <trace/events/power.h>
/*
* locking rule: all changes to constraints or notifiers lists
@@ -65,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = {
.list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
.notifiers = &cpu_dma_lat_notifier,
};
@@ -78,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = {
.list = PLIST_HEAD_INIT(network_lat_constraints.list),
.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
.notifiers = &network_lat_notifier,
};
@@ -92,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = {
.list = PLIST_HEAD_INIT(network_tput_constraints.list),
.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
.notifiers = &network_throughput_notifier,
};
@@ -127,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = {
static inline int pm_qos_get_value(struct pm_qos_constraints *c)
{
if (plist_head_empty(&c->list))
- return c->default_value;
+ return c->no_constraint_value;
switch (c->type) {
case PM_QOS_MIN:
@@ -139,6 +143,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
default:
/* runtime check for not using enum */
BUG();
+ return PM_QOS_DEFAULT_VALUE;
}
}
@@ -168,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
{
unsigned long flags;
int prev_value, curr_value, new_value;
+ int ret;
spin_lock_irqsave(&pm_qos_lock, flags);
prev_value = pm_qos_get_value(c);
@@ -201,14 +207,81 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
spin_unlock_irqrestore(&pm_qos_lock, flags);
+ trace_pm_qos_update_target(action, prev_value, curr_value);
if (prev_value != curr_value) {
- blocking_notifier_call_chain(c->notifiers,
- (unsigned long)curr_value,
- NULL);
- return 1;
+ ret = 1;
+ if (c->notifiers)
+ blocking_notifier_call_chain(c->notifiers,
+ (unsigned long)curr_value,
+ NULL);
} else {
- return 0;
+ ret = 0;
}
+ return ret;
+}
+
+/**
+ * pm_qos_flags_remove_req - Remove device PM QoS flags request.
+ * @pqf: Device PM QoS flags set to remove the request from.
+ * @req: Request to remove from the set.
+ */
+static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
+ struct pm_qos_flags_request *req)
+{
+ s32 val = 0;
+
+ list_del(&req->node);
+ list_for_each_entry(req, &pqf->list, node)
+ val |= req->flags;
+
+ pqf->effective_flags = val;
+}
+
+/**
+ * pm_qos_update_flags - Update a set of PM QoS flags.
+ * @pqf: Set of flags to update.
+ * @req: Request to add to the set, to modify, or to remove from the set.
+ * @action: Action to take on the set.
+ * @val: Value of the request to add or modify.
+ *
+ * Update the given set of PM QoS flags and call notifiers if the aggregate
+ * value has changed. Returns 1 if the aggregate constraint value has changed,
+ * 0 otherwise.
+ */
+bool pm_qos_update_flags(struct pm_qos_flags *pqf,
+ struct pm_qos_flags_request *req,
+ enum pm_qos_req_action action, s32 val)
+{
+ unsigned long irqflags;
+ s32 prev_value, curr_value;
+
+ spin_lock_irqsave(&pm_qos_lock, irqflags);
+
+ prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
+
+ switch (action) {
+ case PM_QOS_REMOVE_REQ:
+ pm_qos_flags_remove_req(pqf, req);
+ break;
+ case PM_QOS_UPDATE_REQ:
+ pm_qos_flags_remove_req(pqf, req);
+ case PM_QOS_ADD_REQ:
+ req->flags = val;
+ INIT_LIST_HEAD(&req->node);
+ list_add_tail(&req->node, &pqf->list);
+ pqf->effective_flags |= val;
+ break;
+ default:
+ /* no action */
+ ;
+ }
+
+ curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
+
+ spin_unlock_irqrestore(&pm_qos_lock, irqflags);
+
+ trace_pm_qos_update_flags(action, prev_value, curr_value);
+ return prev_value != curr_value;
}
/**
@@ -229,6 +302,32 @@ int pm_qos_request_active(struct pm_qos_request *req)
}
EXPORT_SYMBOL_GPL(pm_qos_request_active);
+static void __pm_qos_update_request(struct pm_qos_request *req,
+ s32 new_value)
+{
+ trace_pm_qos_update_request(req->pm_qos_class, new_value);
+
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
+}
+
+/**
+ * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
+ * @work: work struct for the delayed work (timeout)
+ *
+ * This cancels the timeout request by falling back to the default at timeout.
+ */
+static void pm_qos_work_fn(struct work_struct *work)
+{
+ struct pm_qos_request *req = container_of(to_delayed_work(work),
+ struct pm_qos_request,
+ work);
+
+ __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+}
+
/**
* pm_qos_add_request - inserts new qos request into the list
* @req: pointer to a preallocated handle
@@ -253,6 +352,8 @@ void pm_qos_add_request(struct pm_qos_request *req,
return;
}
req->pm_qos_class = pm_qos_class;
+ INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
+ trace_pm_qos_add_request(pm_qos_class, value);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
&req->node, PM_QOS_ADD_REQ, value);
}
@@ -279,12 +380,39 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
+ cancel_delayed_work_sync(&req->work);
+ __pm_qos_update_request(req, new_value);
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_request);
+
+/**
+ * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
+ * @req : handle to list element holding a pm_qos request to use
+ * @new_value: defines the temporal qos request
+ * @timeout_us: the effective duration of this qos request in usecs.
+ *
+ * After timeout_us, this qos request is cancelled automatically.
+ */
+void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
+ unsigned long timeout_us)
+{
+ if (!req)
+ return;
+ if (WARN(!pm_qos_request_active(req),
+ "%s called for unknown object.", __func__))
+ return;
+
+ cancel_delayed_work_sync(&req->work);
+
+ trace_pm_qos_update_request_timeout(req->pm_qos_class,
+ new_value, timeout_us);
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_UPDATE_REQ, new_value);
+
+ schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
}
-EXPORT_SYMBOL_GPL(pm_qos_update_request);
/**
* pm_qos_remove_request - modifies an existing qos request
@@ -305,6 +433,9 @@ void pm_qos_remove_request(struct pm_qos_request *req)
return;
}
+ cancel_delayed_work_sync(&req->work);
+
+ trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_REMOVE_REQ,
PM_QOS_DEFAULT_VALUE);
@@ -366,7 +497,7 @@ static int find_pm_qos_object_by_minor(int minor)
{
int pm_qos_class;
- for (pm_qos_class = 0;
+ for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
if (minor ==
pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
@@ -380,7 +511,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
long pm_qos_class;
pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
- if (pm_qos_class >= 0) {
+ if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
return -ENOMEM;
@@ -433,30 +564,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
- } else if (count <= 11) { /* ASCII perhaps? */
- char ascii_value[11];
- unsigned long int ulval;
+ } else {
int ret;
- if (copy_from_user(ascii_value, buf, count))
- return -EFAULT;
-
- if (count > 10) {
- if (ascii_value[10] == '\n')
- ascii_value[10] = '\0';
- else
- return -EINVAL;
- } else {
- ascii_value[count] = '\0';
- }
- ret = strict_strtoul(ascii_value, 16, &ulval);
- if (ret) {
- pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
- return -EINVAL;
- }
- value = (s32)lower_32_bits(ulval);
- } else {
- return -EINVAL;
+ ret = kstrtos32_from_user(buf, count, 16, &value);
+ if (ret)
+ return ret;
}
req = filp->private_data;
@@ -469,21 +582,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
static int __init pm_qos_power_init(void)
{
int ret = 0;
+ int i;
- ret = register_pm_qos_misc(&cpu_dma_pm_qos);
- if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
- return ret;
- }
- ret = register_pm_qos_misc(&network_lat_pm_qos);
- if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
- return ret;
+ BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+
+ for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
+ ret = register_pm_qos_misc(pm_qos_array[i]);
+ if (ret < 0) {
+ printk(KERN_ERR "pm_qos_param: %s setup failed\n",
+ pm_qos_array[i]->name);
+ return ret;
+ }
}
- ret = register_pm_qos_misc(&network_throughput_pm_qos);
- if (ret < 0)
- printk(KERN_ERR
- "pm_qos_param: network_throughput setup failed\n");
return ret;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6a768e53700..1ea328aafdc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -27,6 +27,7 @@
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
struct linked_page {
struct linked_page *next;
char data[LINKED_PAGE_DATA_SIZE];
-} __attribute__((packed));
+} __packed;
static inline void
free_list_of_pages(struct linked_page *list, int clear_page_nosave)
@@ -352,7 +353,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
struct mem_extent *ext, *cur, *aux;
zone_start = zone->zone_start_pfn;
- zone_end = zone->zone_start_pfn + zone->spanned_pages;
+ zone_end = zone_end_pfn(zone);
list_for_each_entry(ext, list, hook)
if (zone_start <= ext->end)
@@ -637,13 +638,14 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
BUG_ON(!region);
} else
/* This allocation cannot fail */
- region = alloc_bootmem(sizeof(struct nosave_region));
+ region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
Report:
- printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n",
- start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+ printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
}
/*
@@ -711,9 +713,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
list_for_each_entry(region, &nosave_regions, list) {
unsigned long pfn;
- pr_debug("PM: Marking nosave pages: %016lx - %016lx\n",
- region->start_pfn << PAGE_SHIFT,
- region->end_pfn << PAGE_SHIFT);
+ pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+ (unsigned long long) region->start_pfn << PAGE_SHIFT,
+ ((unsigned long long) region->end_pfn << PAGE_SHIFT)
+ - 1);
for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
if (pfn_valid(pfn)) {
@@ -741,7 +744,10 @@ int create_basic_memory_bitmaps(void)
struct memory_bitmap *bm1, *bm2;
int error = 0;
- BUG_ON(forbidden_pages_map || free_pages_map);
+ if (forbidden_pages_map && free_pages_map)
+ return 0;
+ else
+ BUG_ON(forbidden_pages_map || free_pages_map);
bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
if (!bm1)
@@ -787,7 +793,8 @@ void free_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
- BUG_ON(!(forbidden_pages_map && free_pages_map));
+ if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
+ return;
bm1 = forbidden_pages_map;
bm2 = free_pages_map;
@@ -882,7 +889,7 @@ static unsigned int count_highmem_pages(void)
continue;
mark_free_pages(zone);
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (saveable_highmem_page(zone, pfn))
n++;
@@ -946,7 +953,7 @@ static unsigned int count_data_pages(void)
continue;
mark_free_pages(zone);
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (saveable_page(zone, pfn))
n++;
@@ -1000,20 +1007,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
s_page = pfn_to_page(src_pfn);
d_page = pfn_to_page(dst_pfn);
if (PageHighMem(s_page)) {
- src = kmap_atomic(s_page, KM_USER0);
- dst = kmap_atomic(d_page, KM_USER1);
+ src = kmap_atomic(s_page);
+ dst = kmap_atomic(d_page);
do_copy_page(dst, src);
- kunmap_atomic(dst, KM_USER1);
- kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
} else {
if (PageHighMem(d_page)) {
/* Page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
safe_copy_page(buffer, s_page);
- dst = kmap_atomic(d_page, KM_USER0);
+ dst = kmap_atomic(d_page);
copy_page(dst, buffer);
- kunmap_atomic(dst, KM_USER0);
+ kunmap_atomic(dst);
} else {
safe_copy_page(page_address(d_page), s_page);
}
@@ -1039,7 +1046,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
unsigned long max_zone_pfn;
mark_free_pages(zone);
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (page_is_saveable(zone, pfn))
memory_bm_set_bit(orig_bm, pfn);
@@ -1091,7 +1098,7 @@ void swsusp_free(void)
unsigned long pfn, max_zone_pfn;
for_each_populated_zone(zone) {
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
@@ -1262,7 +1269,7 @@ static void free_unnecessary_pages(void)
* [number of saveable pages] - [number of pages that can be freed in theory]
*
* where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
* minus mapped file pages.
*/
static unsigned long minimum_image_size(unsigned long saveable)
@@ -1397,7 +1404,11 @@ int hibernate_preallocate_memory(void)
* highmem and non-highmem zones separately.
*/
pages_highmem = preallocate_image_highmem(highmem / 2);
- alloc = (count - max_size) - pages_highmem;
+ alloc = count - max_size;
+ if (alloc > pages_highmem)
+ alloc -= pages_highmem;
+ else
+ alloc = 0;
pages = preallocate_image_memory(alloc, avail_normal);
if (pages < alloc) {
/* We have exhausted non-highmem pages, try highmem. */
@@ -1575,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
return -ENOMEM;
}
-asmlinkage int swsusp_save(void)
+asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
@@ -1650,7 +1661,7 @@ unsigned long snapshot_get_image_size(void)
static int init_header(struct swsusp_info *info)
{
memset(info, 0, sizeof(struct swsusp_info));
- info->num_physpages = num_physpages;
+ info->num_physpages = get_num_physpages();
info->image_pages = nr_copy_pages;
info->pages = snapshot_get_image_size();
info->size = info->pages;
@@ -1728,9 +1739,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
*/
void *kaddr;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
copy_page(buffer, kaddr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
handle->buffer = buffer;
} else {
handle->buffer = page_address(page);
@@ -1753,7 +1764,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
/* Clear page flags */
for_each_populated_zone(zone) {
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn))
swsusp_unset_page_free(pfn_to_page(pfn));
@@ -1794,7 +1805,7 @@ static int check_header(struct swsusp_info *info)
char *reason;
reason = check_image_kernel(info);
- if (!reason && info->num_physpages != num_physpages)
+ if (!reason && info->num_physpages != get_num_physpages())
reason = "memory size";
if (reason) {
printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
@@ -2014,9 +2025,9 @@ static void copy_last_highmem_page(void)
if (last_highmem_page) {
void *dst;
- dst = kmap_atomic(last_highmem_page, KM_USER0);
+ dst = kmap_atomic(last_highmem_page);
copy_page(dst, buffer);
- kunmap_atomic(dst, KM_USER0);
+ kunmap_atomic(dst);
last_highmem_page = NULL;
}
}
@@ -2309,13 +2320,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
{
void *kaddr1, *kaddr2;
- kaddr1 = kmap_atomic(p1, KM_USER0);
- kaddr2 = kmap_atomic(p2, KM_USER1);
+ kaddr1 = kmap_atomic(p1);
+ kaddr2 = kmap_atomic(p2);
copy_page(buf, kaddr1);
copy_page(kaddr1, kaddr2);
copy_page(kaddr2, buf);
- kunmap_atomic(kaddr2, KM_USER1);
- kunmap_atomic(kaddr1, KM_USER0);
+ kunmap_atomic(kaddr2);
+ kunmap_atomic(kaddr1);
}
/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed87..ed35a4790af 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,9 +12,9 @@
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/init.h>
-#include <linux/kmod.h>
#include <linux/console.h>
#include <linux/cpu.h>
+#include <linux/cpuidle.h>
#include <linux/syscalls.h>
#include <linux/gfp.h>
#include <linux/io.h>
@@ -25,44 +25,119 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
#include <trace/events/power.h>
+#include <linux/compiler.h>
#include "power.h"
-const char *const pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_STANDBY] = "standby",
- [PM_SUSPEND_MEM] = "mem",
+struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
+ [PM_SUSPEND_STANDBY] = { .label = "standby", },
+ [PM_SUSPEND_MEM] = { .label = "mem", },
};
static const struct platform_suspend_ops *suspend_ops;
+static const struct platform_freeze_ops *freeze_ops;
-/**
- * suspend_set_ops - Set the global suspend method table.
- * @ops: Pointer to ops structure.
- */
-void suspend_set_ops(const struct platform_suspend_ops *ops)
+static bool need_suspend_ops(suspend_state_t state)
+{
+ return state > PM_SUSPEND_FREEZE;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static bool suspend_freeze_wake;
+
+void freeze_set_ops(const struct platform_freeze_ops *ops)
{
lock_system_sleep();
- suspend_ops = ops;
+ freeze_ops = ops;
unlock_system_sleep();
}
-EXPORT_SYMBOL_GPL(suspend_set_ops);
-bool valid_state(suspend_state_t state)
+static void freeze_begin(void)
+{
+ suspend_freeze_wake = false;
+}
+
+static void freeze_enter(void)
+{
+ cpuidle_use_deepest_state(true);
+ cpuidle_resume();
+ wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+ cpuidle_pause();
+ cpuidle_use_deepest_state(false);
+}
+
+void freeze_wake(void)
+{
+ suspend_freeze_wake = true;
+ wake_up(&suspend_freeze_wait_head);
+}
+EXPORT_SYMBOL_GPL(freeze_wake);
+
+static bool valid_state(suspend_state_t state)
{
/*
- * All states need lowlevel support and need to be valid to the lowlevel
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
+ * support and need to be valid to the low level
* implementation, no valid callback implies that none are valid.
*/
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
}
+/*
+ * If this is set, the "mem" label always corresponds to the deepest sleep state
+ * available, the "standby" label corresponds to the second deepest sleep state
+ * available (if any), and the "freeze" label corresponds to the remaining
+ * available sleep state (if there is one).
+ */
+static bool relative_states;
+
+static int __init sleep_states_setup(char *str)
+{
+ relative_states = !strncmp(str, "1", 1);
+ if (relative_states) {
+ pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
+ pm_states[PM_SUSPEND_FREEZE].state = 0;
+ }
+ return 1;
+}
+
+__setup("relative_sleep_states=", sleep_states_setup);
+
/**
- * suspend_valid_only_mem - generic memory-only valid callback
+ * suspend_set_ops - Set the global suspend method table.
+ * @ops: Suspend operations to use.
+ */
+void suspend_set_ops(const struct platform_suspend_ops *ops)
+{
+ suspend_state_t i;
+ int j = PM_SUSPEND_MAX - 1;
+
+ lock_system_sleep();
+
+ suspend_ops = ops;
+ for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
+ if (valid_state(i))
+ pm_states[j--].state = i;
+ else if (!relative_states)
+ pm_states[j--].state = 0;
+
+ pm_states[j--].state = PM_SUSPEND_FREEZE;
+ while (j >= PM_SUSPEND_MIN)
+ pm_states[j--].state = 0;
+
+ unlock_system_sleep();
+}
+EXPORT_SYMBOL_GPL(suspend_set_ops);
+
+/**
+ * suspend_valid_only_mem - Generic memory-only valid callback.
*
- * Platform drivers that implement mem suspend only and only need
- * to check for that in their .valid callback can use this instead
- * of rolling their own .valid callback.
+ * Platform drivers that implement mem suspend only and only need to check for
+ * that in their .valid() callback can use this instead of rolling their own
+ * .valid() callback.
*/
int suspend_valid_only_mem(suspend_state_t state)
{
@@ -83,16 +158,17 @@ static int suspend_test(int level)
}
/**
- * suspend_prepare - Do prep work before entering low-power state.
+ * suspend_prepare - Prepare for entering system sleep state.
*
- * This is common code that is called for each state that we're entering.
- * Run suspend notifiers, allocate a console and stop all processes.
+ * Common code run for every system sleep state that can be entered (except for
+ * hibernation). Run suspend notifiers, allocate the "suspend" console and
+ * freeze processes.
*/
-static int suspend_prepare(void)
+static int suspend_prepare(suspend_state_t state)
{
int error;
- if (!suspend_ops || !suspend_ops->enter)
+ if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
return -EPERM;
pm_prepare_console();
@@ -101,17 +177,14 @@ static int suspend_prepare(void)
if (error)
goto Finish;
- error = usermodehelper_disable();
- if (error)
- goto Finish;
-
+ trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
+ trace_suspend_resume(TPS("freeze_processes"), 0, false);
if (!error)
return 0;
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
- usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
@@ -119,21 +192,21 @@ static int suspend_prepare(void)
}
/* default implementation */
-void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+void __weak arch_suspend_disable_irqs(void)
{
local_irq_disable();
}
/* default implementation */
-void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+void __weak arch_suspend_enable_irqs(void)
{
local_irq_enable();
}
/**
- * suspend_enter - enter the desired system sleep state.
- * @state: State to enter
- * @wakeup: Returns information that suspend should not be entered again.
+ * suspend_enter - Make the system enter the given sleep state.
+ * @state: System sleep state to enter.
+ * @wakeup: Returns information that the sleep state should not be re-entered.
*
* This function should be called after devices have been suspended.
*/
@@ -141,19 +214,19 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (suspend_ops->prepare) {
+ if (need_suspend_ops(state) && suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
goto Platform_finish;
}
- error = dpm_suspend_noirq(PMSG_SUSPEND);
+ error = dpm_suspend_end(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platform_finish;
}
- if (suspend_ops->prepare_late) {
+ if (need_suspend_ops(state) && suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
goto Platform_wake;
@@ -162,6 +235,20 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
+ /*
+ * PM_SUSPEND_FREEZE equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus we should invoke freeze_enter() soon after
+ * all the devices are suspended.
+ */
+ if (state == PM_SUSPEND_FREEZE) {
+ trace_suspend_resume(TPS("machine_suspend"), state, true);
+ freeze_enter();
+ trace_suspend_resume(TPS("machine_suspend"), state, false);
+ goto Platform_wake;
+ }
+
+ ftrace_stop();
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -173,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (!error) {
*wakeup = pm_wakeup_pending();
if (!(suspend_test(TEST_CORE) || *wakeup)) {
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, true);
error = suspend_ops->enter(state);
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, false);
events_check_enabled = false;
}
syscore_resume();
@@ -184,44 +275,47 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
Enable_cpus:
enable_nonboot_cpus();
+ ftrace_start();
Platform_wake:
- if (suspend_ops->wake)
+ if (need_suspend_ops(state) && suspend_ops->wake)
suspend_ops->wake();
- dpm_resume_noirq(PMSG_RESUME);
+ dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (suspend_ops->finish)
+ if (need_suspend_ops(state) && suspend_ops->finish)
suspend_ops->finish();
return error;
}
/**
- * suspend_devices_and_enter - suspend devices and enter the desired system
- * sleep state.
- * @state: state to enter
+ * suspend_devices_and_enter - Suspend devices and enter system sleep state.
+ * @state: System sleep state to enter.
*/
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
bool wakeup = false;
- if (!suspend_ops)
+ if (need_suspend_ops(state) && !suspend_ops)
return -ENOSYS;
- trace_machine_suspend(state);
- if (suspend_ops->begin) {
+ if (need_suspend_ops(state) && suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
goto Close;
+ } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
+ error = freeze_ops->begin();
+ if (error)
+ goto Close;
}
suspend_console();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to suspend\n");
+ pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -230,7 +324,7 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup
+ } while (!error && !wakeup && need_suspend_ops(state)
&& suspend_ops->suspend_again && suspend_ops->suspend_again());
Resume_devices:
@@ -239,64 +333,78 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_finish("resume devices");
resume_console();
Close:
- if (suspend_ops->end)
+ if (need_suspend_ops(state) && suspend_ops->end)
suspend_ops->end();
- trace_machine_suspend(PWR_EVENT_EXIT);
+ else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+ freeze_ops->end();
+
return error;
Recover_platform:
- if (suspend_ops->recover)
+ if (need_suspend_ops(state) && suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}
/**
- * suspend_finish - Do final work before exiting suspend sequence.
+ * suspend_finish - Clean up before finishing the suspend sequence.
*
- * Call platform code to clean up, restart processes, and free the
- * console that we've allocated. This is not called for suspend-to-disk.
+ * Call platform code to clean up, restart processes, and free the console that
+ * we've allocated. This routine is not called for hibernation.
*/
static void suspend_finish(void)
{
suspend_thaw_processes();
- usermodehelper_enable();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
/**
- * enter_state - Do common work of entering low-power state.
- * @state: pm_state structure for state we're entering.
+ * enter_state - Do common work needed to enter system sleep state.
+ * @state: System sleep state to enter.
*
- * Make sure we're the only ones trying to enter a sleep state. Fail
- * if someone has beat us to it, since we don't want anything weird to
- * happen when we wake up.
- * Then, do the setup for suspend, enter the state, and cleaup (after
- * we've woken up).
+ * Make sure that no one else is trying to put the system into a sleep state.
+ * Fail if that's not the case. Otherwise, prepare for system suspend, make the
+ * system enter the given sleep state and clean up after wakeup.
*/
-int enter_state(suspend_state_t state)
+static int enter_state(suspend_state_t state)
{
int error;
- if (!valid_state(state))
- return -ENODEV;
-
+ trace_suspend_resume(TPS("suspend_enter"), state, true);
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
+ pr_warning("PM: Unsupported test mode for freeze state,"
+ "please choose none/freezer/devices/platform.\n");
+ return -EAGAIN;
+ }
+#endif
+ } else if (!valid_state(state)) {
+ return -EINVAL;
+ }
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
+ if (state == PM_SUSPEND_FREEZE)
+ freeze_begin();
+
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
- error = suspend_prepare();
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
+ error = suspend_prepare(state);
if (error)
goto Unlock;
if (suspend_test(TEST_FREEZER))
goto Finish;
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ trace_suspend_resume(TPS("suspend_enter"), state, false);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
@@ -310,24 +418,26 @@ int enter_state(suspend_state_t state)
}
/**
- * pm_suspend - Externally visible function for suspending system.
- * @state: Enumerated value of state to enter.
+ * pm_suspend - Externally visible function for suspending the system.
+ * @state: System sleep state to enter.
*
- * Determine whether or not value is within range, get state
- * structure, and enter (above).
+ * Check if the value of @state represents one of the supported states,
+ * execute enter_state() and update system suspend statistics.
*/
int pm_suspend(suspend_state_t state)
{
- int ret;
- if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
- ret = enter_state(state);
- if (ret) {
- suspend_stats.fail++;
- dpm_save_failed_errno(ret);
- } else
- suspend_stats.success++;
- return ret;
+ int error;
+
+ if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+
+ error = enter_state(state);
+ if (error) {
+ suspend_stats.fail++;
+ dpm_save_failed_errno(error);
+ } else {
+ suspend_stats.success++;
}
- return -EINVAL;
+ return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac..269b097e78e 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
}
if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state]);
+ printk(info_test, pm_states[state].label);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state]);
+ printk(info_test, pm_states[state].label);
status = pm_suspend(state);
}
if (status < 0)
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
rtc_set_alarm(rtc, &alm);
}
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
+static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
if (!device_may_wakeup(candidate->dev.parent))
return 0;
- *(const char **)name_ptr = dev_name(dev);
return 1;
}
@@ -137,18 +136,16 @@ static char warn_bad_state[] __initdata =
static int __init setup_test_suspend(char *value)
{
- unsigned i;
+ suspend_state_t i;
/* "=mem" ==> "mem" */
value++;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (!pm_states[i])
- continue;
- if (strcmp(pm_states[i], value) != 0)
- continue;
- test_state = (__force suspend_state_t) i;
- return 0;
- }
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (!strcmp(pm_states[i].label, value)) {
+ test_state = pm_states[i].state;
+ return 0;
+ }
+
printk(warn_bad_state, value);
return 0;
}
@@ -159,21 +156,21 @@ static int __init test_suspend(void)
static char warn_no_rtc[] __initdata =
KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
- char *pony = NULL;
struct rtc_device *rtc = NULL;
+ struct device *dev;
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
- if (!valid_state(test_state)) {
- printk(warn_bad_state, pm_states[test_state]);
+ if (!pm_states[test_state].state) {
+ printk(warn_bad_state, pm_states[test_state].label);
goto done;
}
/* RTCs have initialized by now too ... can we use one? */
- class_find_device(rtc_class, NULL, &pony, has_wakealarm);
- if (pony)
- rtc = rtc_class_open(pony);
+ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ if (dev)
+ rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
goto done;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8742fd013a9..aaa3261dea5 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
*
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
+ * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
*
* This file is released under the GPLv2.
*
@@ -51,6 +51,23 @@
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
+/*
+ * Number of free pages that are not high.
+ */
+static inline unsigned long low_free_pages(void)
+{
+ return nr_free_pages() - nr_free_highpages();
+}
+
+/*
+ * Number of pages required to be kept free while writing the image. Always
+ * half of all available low pages before the writing starts.
+ */
+static inline unsigned long reqd_free_pages(void)
+{
+ return low_free_pages() / 2;
+}
+
struct swap_map_page {
sector_t entries[MAP_PAGE_ENTRIES];
sector_t next_swap;
@@ -72,7 +89,7 @@ struct swap_map_handle {
sector_t cur_swap;
sector_t first_sector;
unsigned int k;
- unsigned long nr_free_pages, written;
+ unsigned long reqd_free_pages;
u32 crc32;
};
@@ -84,7 +101,7 @@ struct swsusp_header {
unsigned int flags; /* Flags to pass to the "boot" kernel */
char orig_sig[10];
char sig[10];
-} __attribute__((packed));
+} __packed;
static struct swsusp_header *swsusp_header;
@@ -109,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
/* Figure out where to put the new node */
while (*new) {
- ext = container_of(*new, struct swsusp_extent, node);
+ ext = rb_entry(*new, struct swsusp_extent, node);
parent = *new;
if (swap_offset < ext->start) {
/* Try to merge */
@@ -265,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
return -ENOSPC;
if (bio_chain) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
@@ -316,8 +336,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
goto err_rel;
}
handle->k = 0;
- handle->nr_free_pages = nr_free_pages() >> 1;
- handle->written = 0;
+ handle->reqd_free_pages = reqd_free_pages();
handle->first_sector = handle->cur_swap;
return 0;
err_rel:
@@ -351,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- }
- if (bio_chain && ++handle->written > handle->nr_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
- if (error)
- goto out;
- handle->written = 0;
+
+ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ /*
+ * Recalculate the number of required free pages, to
+ * make sure we never take more than half.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+ }
}
out:
return error;
@@ -403,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
/* Maximum number of threads for compression/decompression. */
#define LZO_THREADS 3
-/* Maximum number of pages for read buffering. */
-#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
+/* Minimum/maximum number of pages for read buffering. */
+#define LZO_MIN_RD_PAGES 1024
+#define LZO_MAX_RD_PAGES 8192
/**
@@ -423,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
struct timeval start;
struct timeval stop;
- printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
+ printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
- m = nr_to_write / 100;
+ m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -439,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
+ printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
@@ -447,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_CONT "\b\b\b\bdone\n");
- else
- printk(KERN_CONT "\n");
+ printk(KERN_INFO "PM: Image saving done.\n");
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
return ret;
}
@@ -543,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
/**
* save_image_lzo - Save the suspend image data compressed with LZO.
- * @handle: Swap mam handle to use for saving the image.
+ * @handle: Swap map handle to use for saving the image.
* @snapshot: Image to read data from.
* @nr_to_write: Number of pages to save.
*/
@@ -615,12 +639,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of free pages after all allocations have been done.
- * We don't want to run out of pages when writing.
- */
- handle->nr_free_pages = nr_free_pages() >> 1;
-
- /*
* Start the CRC32 thread.
*/
init_waitqueue_head(&crc->go);
@@ -641,11 +659,17 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
+ /*
+ * Adjust the number of required free pages after all allocations have
+ * been done. We don't want to run out of pages when writing.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+
printk(KERN_INFO
"PM: Using %u thread(s) for compression.\n"
- "PM: Compressing and saving image data (%u pages) ... ",
+ "PM: Compressing and saving image data (%u pages)...\n",
nr_threads, nr_to_write);
- m = nr_to_write / 100;
+ m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -665,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
data_of(*snapshot), PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_CONT "\b\b\b\b%3d%%",
- nr_pages / m);
+ printk(KERN_INFO
+ "PM: Image saving progress: "
+ "%3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
if (!off)
@@ -736,11 +762,8 @@ out_finish:
do_gettimeofday(&stop);
if (!ret)
ret = err2;
- if (!ret) {
- printk(KERN_CONT "\b\b\b\bdone\n");
- } else {
- printk(KERN_CONT "\n");
- }
+ if (!ret)
+ printk(KERN_INFO "PM: Image saving done.\n");
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
@@ -948,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
int err2;
unsigned nr_pages;
- printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
+ printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
nr_to_read);
- m = nr_to_read / 100;
+ m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -968,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
+ printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
@@ -976,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret) {
- printk("\b\b\b\bdone\n");
+ printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
- } else
- printk("\n");
+ }
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
return ret;
}
@@ -1051,7 +1074,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned i, thr, run_threads, nr_threads;
unsigned ring = 0, pg = 0, ring_size = 0,
have = 0, want, need, asked = 0;
- unsigned long read_pages;
+ unsigned long read_pages = 0;
unsigned char **page = NULL;
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
@@ -1063,7 +1086,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
+ page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -1128,15 +1151,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of pages for read buffering, in case we are short.
+ * Set the number of pages for read buffering.
+ * This is complete guesswork, because we'll only know the real
+ * picture once prepare_image() is called, which is much later on
+ * during the image load phase. We'll assume the worst case and
+ * say that none of the image pages are from high memory.
*/
- read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
- read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
+ if (low_free_pages() > snapshot_get_image_size())
+ read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
+ read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
__GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT);
+ __GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
+
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
@@ -1153,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
printk(KERN_INFO
"PM: Using %u thread(s) for decompression.\n"
- "PM: Loading and decompressing image data (%u pages) ... ",
+ "PM: Loading and decompressing image data (%u pages)...\n",
nr_threads, nr_to_read);
- m = nr_to_read / 100;
+ m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -1287,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].unc + off, PAGE_SIZE);
if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
+ printk(KERN_INFO
+ "PM: Image loading progress: "
+ "%3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
ret = snapshot_write_next(snapshot);
@@ -1312,7 +1345,7 @@ out_finish:
}
do_gettimeofday(&stop);
if (!ret) {
- printk("\b\b\b\bdone\n");
+ printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
@@ -1325,8 +1358,7 @@ out_finish:
}
}
}
- } else
- printk("\n");
+ }
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
out_clean:
for (i = 0; i < ring_size; i++)
@@ -1440,6 +1472,34 @@ void swsusp_close(fmode_t mode)
blkdev_put(hib_resume_bdev, mode);
}
+/**
+ * swsusp_unmark - Unmark swsusp signature in the resume device
+ */
+
+#ifdef CONFIG_SUSPEND
+int swsusp_unmark(void)
+{
+ int error;
+
+ hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
+ error = hib_bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ error = -ENODEV;
+ }
+
+ /*
+ * We just returned from suspend, we don't need the image any more.
+ */
+ free_all_swap_pages(root_swap);
+
+ return error;
+}
+#endif
+
static int swsusp_header_init(void)
{
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3e100075b13..526e8911460 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,7 +12,6 @@
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
-#include <linux/kmod.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
@@ -25,7 +24,6 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <scsi/scsi_scan.h>
#include <asm/uaccess.h>
@@ -38,9 +36,10 @@ static struct snapshot_data {
struct snapshot_handle handle;
int swap;
int mode;
- char frozen;
- char ready;
- char platform_support;
+ bool frozen;
+ bool ready;
+ bool platform_support;
+ bool free_bitmaps;
} snapshot_state;
atomic_t snapshot_device_available = ATOMIC_INIT(1);
@@ -50,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
struct snapshot_data *data;
int error;
+ if (!hibernation_available())
+ return -EPERM;
+
lock_system_sleep();
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -62,11 +64,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
error = -ENOSYS;
goto Unlock;
}
- if(create_basic_memory_bitmaps()) {
- atomic_inc(&snapshot_device_available);
- error = -ENOMEM;
- goto Unlock;
- }
nonseekable_open(inode, filp);
data = &snapshot_state;
filp->private_data = data;
@@ -76,6 +73,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = swsusp_resume_device ?
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
+ data->free_bitmaps = false;
error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
pm_notifier_call_chain(PM_POST_HIBERNATION);
@@ -85,21 +83,23 @@ static int snapshot_open(struct inode *inode, struct file *filp)
* appear.
*/
wait_for_device_probe();
- scsi_complete_async_scans();
data->swap = -1;
data->mode = O_WRONLY;
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ if (!error) {
+ error = create_basic_memory_bitmaps();
+ data->free_bitmaps = !error;
+ }
if (error)
pm_notifier_call_chain(PM_POST_RESTORE);
}
- if (error) {
- free_basic_memory_bitmaps();
+ if (error)
atomic_inc(&snapshot_device_available);
- }
- data->frozen = 0;
- data->ready = 0;
- data->platform_support = 0;
+
+ data->frozen = false;
+ data->ready = false;
+ data->platform_support = false;
Unlock:
unlock_system_sleep();
@@ -114,12 +114,14 @@ static int snapshot_release(struct inode *inode, struct file *filp)
lock_system_sleep();
swsusp_free();
- free_basic_memory_bitmaps();
data = filp->private_data;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
+ free_basic_memory_bitmaps();
thaw_processes();
+ } else if (data->free_bitmaps) {
+ free_basic_memory_bitmaps();
}
pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
@@ -210,6 +212,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
+ lock_device_hotplug();
data = filp->private_data;
switch (cmd) {
@@ -222,24 +225,26 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
sys_sync();
printk("done.\n");
- error = usermodehelper_disable();
+ error = freeze_processes();
if (error)
break;
- error = freeze_processes();
+ error = create_basic_memory_bitmaps();
if (error)
- usermodehelper_enable();
+ thaw_processes();
else
- data->frozen = 1;
+ data->frozen = true;
+
break;
case SNAPSHOT_UNFREEZE:
if (!data->frozen || data->ready)
break;
pm_restore_gfp_mask();
+ free_basic_memory_bitmaps();
+ data->free_bitmaps = false;
thaw_processes();
- usermodehelper_enable();
- data->frozen = 0;
+ data->frozen = false;
break;
case SNAPSHOT_CREATE_IMAGE:
@@ -249,16 +254,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
pm_restore_gfp_mask();
error = hibernation_snapshot(data->platform_support);
- if (error) {
- thaw_kernel_threads();
- } else {
+ if (!error) {
error = put_user(in_suspend, (int __user *)arg);
- if (!error && !freezer_test_done)
- data->ready = 1;
- if (freezer_test_done) {
- freezer_test_done = false;
- thaw_kernel_threads();
- }
+ data->ready = !freezer_test_done && !error;
+ freezer_test_done = false;
}
break;
@@ -275,7 +274,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
case SNAPSHOT_FREE:
swsusp_free();
memset(&data->handle, 0, sizeof(struct snapshot_handle));
- data->ready = 0;
+ data->ready = false;
/*
* It is necessary to thaw kernel threads here, because
* SNAPSHOT_CREATE_IMAGE may be invoked directly after
@@ -339,7 +338,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
* PM_HIBERNATION_PREPARE
*/
error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- data->ready = 0;
+ data->ready = false;
break;
case SNAPSHOT_PLATFORM_SUPPORT:
@@ -387,6 +386,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
+ unlock_device_hotplug();
mutex_unlock(&pm_mutex);
return error;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 00000000000..019069c84ff
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,268 @@
+/*
+ * kernel/power/wakelock.c
+ *
+ * User space wakeup sources support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This code is based on the analogous interface allowing user space to
+ * manipulate wakelocks on Android.
+ */
+
+#include <linux/capability.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+#include "power.h"
+
+static DEFINE_MUTEX(wakelocks_lock);
+
+struct wakelock {
+ char *name;
+ struct rb_node node;
+ struct wakeup_source ws;
+#ifdef CONFIG_PM_WAKELOCKS_GC
+ struct list_head lru;
+#endif
+};
+
+static struct rb_root wakelocks_tree = RB_ROOT;
+
+ssize_t pm_show_wakelocks(char *buf, bool show_active)
+{
+ struct rb_node *node;
+ struct wakelock *wl;
+ char *str = buf;
+ char *end = buf + PAGE_SIZE;
+
+ mutex_lock(&wakelocks_lock);
+
+ for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
+ wl = rb_entry(node, struct wakelock, node);
+ if (wl->ws.active == show_active)
+ str += scnprintf(str, end - str, "%s ", wl->name);
+ }
+ if (str > buf)
+ str--;
+
+ str += scnprintf(str, end - str, "\n");
+
+ mutex_unlock(&wakelocks_lock);
+ return (str - buf);
+}
+
+#if CONFIG_PM_WAKELOCKS_LIMIT > 0
+static unsigned int number_of_wakelocks;
+
+static inline bool wakelocks_limit_exceeded(void)
+{
+ return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
+}
+
+static inline void increment_wakelocks_number(void)
+{
+ number_of_wakelocks++;
+}
+
+static inline void decrement_wakelocks_number(void)
+{
+ number_of_wakelocks--;
+}
+#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
+static inline bool wakelocks_limit_exceeded(void) { return false; }
+static inline void increment_wakelocks_number(void) {}
+static inline void decrement_wakelocks_number(void) {}
+#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
+
+#ifdef CONFIG_PM_WAKELOCKS_GC
+#define WL_GC_COUNT_MAX 100
+#define WL_GC_TIME_SEC 300
+
+static LIST_HEAD(wakelocks_lru_list);
+static unsigned int wakelocks_gc_count;
+
+static inline void wakelocks_lru_add(struct wakelock *wl)
+{
+ list_add(&wl->lru, &wakelocks_lru_list);
+}
+
+static inline void wakelocks_lru_most_recent(struct wakelock *wl)
+{
+ list_move(&wl->lru, &wakelocks_lru_list);
+}
+
+static void wakelocks_gc(void)
+{
+ struct wakelock *wl, *aux;
+ ktime_t now;
+
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ now = ktime_get();
+ list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
+ u64 idle_time_ns;
+ bool active;
+
+ spin_lock_irq(&wl->ws.lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
+ active = wl->ws.active;
+ spin_unlock_irq(&wl->ws.lock);
+
+ if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
+ break;
+
+ if (!active) {
+ wakeup_source_remove(&wl->ws);
+ rb_erase(&wl->node, &wakelocks_tree);
+ list_del(&wl->lru);
+ kfree(wl->name);
+ kfree(wl);
+ decrement_wakelocks_number();
+ }
+ }
+ wakelocks_gc_count = 0;
+}
+#else /* !CONFIG_PM_WAKELOCKS_GC */
+static inline void wakelocks_lru_add(struct wakelock *wl) {}
+static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
+static inline void wakelocks_gc(void) {}
+#endif /* !CONFIG_PM_WAKELOCKS_GC */
+
+static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
+ bool add_if_not_found)
+{
+ struct rb_node **node = &wakelocks_tree.rb_node;
+ struct rb_node *parent = *node;
+ struct wakelock *wl;
+
+ while (*node) {
+ int diff;
+
+ parent = *node;
+ wl = rb_entry(*node, struct wakelock, node);
+ diff = strncmp(name, wl->name, len);
+ if (diff == 0) {
+ if (wl->name[len])
+ diff = -1;
+ else
+ return wl;
+ }
+ if (diff < 0)
+ node = &(*node)->rb_left;
+ else
+ node = &(*node)->rb_right;
+ }
+ if (!add_if_not_found)
+ return ERR_PTR(-EINVAL);
+
+ if (wakelocks_limit_exceeded())
+ return ERR_PTR(-ENOSPC);
+
+ /* Not found, we have to add a new one. */
+ wl = kzalloc(sizeof(*wl), GFP_KERNEL);
+ if (!wl)
+ return ERR_PTR(-ENOMEM);
+
+ wl->name = kstrndup(name, len, GFP_KERNEL);
+ if (!wl->name) {
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws.name = wl->name;
+ wakeup_source_add(&wl->ws);
+ rb_link_node(&wl->node, parent, node);
+ rb_insert_color(&wl->node, &wakelocks_tree);
+ wakelocks_lru_add(wl);
+ increment_wakelocks_number();
+ return wl;
+}
+
+int pm_wake_lock(const char *buf)
+{
+ const char *str = buf;
+ struct wakelock *wl;
+ u64 timeout_ns = 0;
+ size_t len;
+ int ret = 0;
+
+ if (!capable(CAP_BLOCK_SUSPEND))
+ return -EPERM;
+
+ while (*str && !isspace(*str))
+ str++;
+
+ len = str - buf;
+ if (!len)
+ return -EINVAL;
+
+ if (*str && *str != '\n') {
+ /* Find out if there's a valid timeout string appended. */
+ ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
+ if (ret)
+ return -EINVAL;
+ }
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, true);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ if (timeout_ns) {
+ u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
+
+ do_div(timeout_ms, NSEC_PER_MSEC);
+ __pm_wakeup_event(&wl->ws, timeout_ms);
+ } else {
+ __pm_stay_awake(&wl->ws);
+ }
+
+ wakelocks_lru_most_recent(wl);
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
+
+int pm_wake_unlock(const char *buf)
+{
+ struct wakelock *wl;
+ size_t len;
+ int ret = 0;
+
+ if (!capable(CAP_BLOCK_SUSPEND))
+ return -EPERM;
+
+ len = strlen(buf);
+ if (!len)
+ return -EINVAL;
+
+ if (buf[len-1] == '\n')
+ len--;
+
+ if (!len)
+ return -EINVAL;
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, false);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ __pm_relax(&wl->ws);
+
+ wakelocks_lru_most_recent(wl);
+ wakelocks_gc();
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
diff --git a/kernel/printk.c b/kernel/printk.c
deleted file mode 100644
index 13c0a1143f4..00000000000
--- a/kernel/printk.c
+++ /dev/null
@@ -1,1762 +0,0 @@
-/*
- * linux/kernel/printk.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Modified to make sys_syslog() more flexible: added commands to
- * return the last 4k of kernel messages, regardless of whether
- * they've been read or not. Added option to suppress kernel printk's
- * to the console. Added hook for sending the console messages
- * elsewhere, in preparation for a serial line console (someday).
- * Ted Ts'o, 2/11/93.
- * Modified for sysctl support, 1/8/97, Chris Horn.
- * Fixed SMP synchronization, 08/08/99, Manfred Spraul
- * manfred@colorfullife.com
- * Rewrote bits to get rid of console_lock
- * 01Mar01 Andrew Morton
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/tty.h>
-#include <linux/tty_driver.h>
-#include <linux/console.h>
-#include <linux/init.h>
-#include <linux/jiffies.h>
-#include <linux/nmi.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/interrupt.h> /* For in_interrupt() */
-#include <linux/delay.h>
-#include <linux/smp.h>
-#include <linux/security.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/syscalls.h>
-#include <linux/kexec.h>
-#include <linux/kdb.h>
-#include <linux/ratelimit.h>
-#include <linux/kmsg_dump.h>
-#include <linux/syslog.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-#include <linux/rculist.h>
-
-#include <asm/uaccess.h>
-
-/*
- * Architectures can override it:
- */
-void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
-{
-}
-
-#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-
-/* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
-
-/* We show everything that is MORE important than this.. */
-#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
-#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
-
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-
-int console_printk[4] = {
- DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
- DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
- MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
- DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
-};
-
-/*
- * Low level drivers may need that to know if they can schedule in
- * their unblank() callback or not. So let's export it.
- */
-int oops_in_progress;
-EXPORT_SYMBOL(oops_in_progress);
-
-/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
- */
-static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
-
-/*
- * This is used for debugging the mess that is the VT code by
- * keeping track if we have the console semaphore held. It's
- * definitely not the perfect debug tool (we don't know if _WE_
- * hold it are racing, but it helps tracking those weird code
- * path in the console code where we end up in places I want
- * locked without the console sempahore held
- */
-static int console_locked, console_suspended;
-
-/*
- * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
- * It is also used in interesting ways to provide interlocking in
- * console_unlock();.
- */
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-#define LOG_BUF_MASK (log_buf_len-1)
-#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
-
-/*
- * The indices into log_buf are not constrained to log_buf_len - they
- * must be masked before subscripting
- */
-static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
-static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
-static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
-
-/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
-/*
- * Array of consoles built from command line options (console=)
- */
-struct console_cmdline
-{
- char name[8]; /* Name of the driver */
- int index; /* Minor dev. to use */
- char *options; /* Options for the driver */
-#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
- char *brl_options; /* Options for braille driver */
-#endif
-};
-
-#define MAX_CMDLINECONSOLES 8
-
-static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
-static int selected_console = -1;
-static int preferred_console = -1;
-int console_set_on_cmdline;
-EXPORT_SYMBOL(console_set_on_cmdline);
-
-/* Flag: console code may call schedule() */
-static int console_may_schedule;
-
-#ifdef CONFIG_PRINTK
-
-static char __log_buf[__LOG_BUF_LEN];
-static char *log_buf = __log_buf;
-static int log_buf_len = __LOG_BUF_LEN;
-static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
-static int saved_console_loglevel = -1;
-
-#ifdef CONFIG_KEXEC
-/*
- * This appends the listed symbols to /proc/vmcoreinfo
- *
- * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
- * obtain access to symbols that are otherwise very difficult to locate. These
- * symbols are specifically used so that utilities can access and extract the
- * dmesg log from a vmcore file after a crash.
- */
-void log_buf_kexec_setup(void)
-{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_end);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(logged_chars);
-}
-#endif
-
-/* requested log_buf_len from kernel cmdline */
-static unsigned long __initdata new_log_buf_len;
-
-/* save requested log_buf_len since it's too early to process it */
-static int __init log_buf_len_setup(char *str)
-{
- unsigned size = memparse(str, &str);
-
- if (size)
- size = roundup_pow_of_two(size);
- if (size > log_buf_len)
- new_log_buf_len = size;
-
- return 0;
-}
-early_param("log_buf_len", log_buf_len_setup);
-
-void __init setup_log_buf(int early)
-{
- unsigned long flags;
- unsigned start, dest_idx, offset;
- char *new_log_buf;
- int free;
-
- if (!new_log_buf_len)
- return;
-
- if (early) {
- unsigned long mem;
-
- mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
- if (!mem)
- return;
- new_log_buf = __va(mem);
- } else {
- new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
- }
-
- if (unlikely(!new_log_buf)) {
- pr_err("log_buf_len: %ld bytes not available\n",
- new_log_buf_len);
- return;
- }
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- log_buf_len = new_log_buf_len;
- log_buf = new_log_buf;
- new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_end;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
-
- log_buf[dest_idx] = __log_buf[log_idx_mask];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
- pr_info("log_buf_len: %d\n", log_buf_len);
- pr_info("early log buf free: %d(%d%%)\n",
- free, (free * 100) / __LOG_BUF_LEN);
-}
-
-#ifdef CONFIG_BOOT_PRINTK_DELAY
-
-static int boot_delay; /* msecs delay after each printk during bootup */
-static unsigned long long loops_per_msec; /* based on boot_delay */
-
-static int __init boot_delay_setup(char *str)
-{
- unsigned long lpj;
-
- lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
- loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
-
- get_option(&str, &boot_delay);
- if (boot_delay > 10 * 1000)
- boot_delay = 0;
-
- pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
- "HZ: %d, loops_per_msec: %llu\n",
- boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
- return 1;
-}
-__setup("boot_delay=", boot_delay_setup);
-
-static void boot_delay_msec(void)
-{
- unsigned long long k;
- unsigned long timeout;
-
- if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
- return;
-
- k = (unsigned long long)loops_per_msec * boot_delay;
-
- timeout = jiffies + msecs_to_jiffies(boot_delay);
- while (k) {
- k--;
- cpu_relax();
- /*
- * use (volatile) jiffies to prevent
- * compiler reduction; loop termination via jiffies
- * is secondary and may or may not happen.
- */
- if (time_after(jiffies, timeout))
- break;
- touch_nmi_watchdog();
- }
-}
-#else
-static inline void boot_delay_msec(void)
-{
-}
-#endif
-
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
-
-static int syslog_action_restricted(int type)
-{
- if (dmesg_restrict)
- return 1;
- /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
- return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
-}
-
-static int check_syslog_permissions(int type, bool from_file)
-{
- /*
- * If this is from /proc/kmsg and we've already opened it, then we've
- * already done the capabilities checks at open time.
- */
- if (from_file && type != SYSLOG_ACTION_OPEN)
- return 0;
-
- if (syslog_action_restricted(type)) {
- if (capable(CAP_SYSLOG))
- return 0;
- /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
- if (capable(CAP_SYS_ADMIN)) {
- printk_once(KERN_WARNING "%s (%d): "
- "Attempt to access syslog with CAP_SYS_ADMIN "
- "but no CAP_SYSLOG (deprecated).\n",
- current->comm, task_pid_nr(current));
- return 0;
- }
- return -EPERM;
- }
- return 0;
-}
-
-int do_syslog(int type, char __user *buf, int len, bool from_file)
-{
- unsigned i, j, limit, count;
- int do_clear = 0;
- char c;
- int error;
-
- error = check_syslog_permissions(type, from_file);
- if (error)
- goto out;
-
- error = security_syslog(type);
- if (error)
- return error;
-
- switch (type) {
- case SYSLOG_ACTION_CLOSE: /* Close log */
- break;
- case SYSLOG_ACTION_OPEN: /* Open log */
- break;
- case SYSLOG_ACTION_READ: /* Read from log */
- error = -EINVAL;
- if (!buf || len < 0)
- goto out;
- error = 0;
- if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
- error = wait_event_interruptible(log_wait,
- (log_start - log_end));
- if (error)
- goto out;
- i = 0;
- raw_spin_lock_irq(&logbuf_lock);
- while (!error && (log_start != log_end) && i < len) {
- c = LOG_BUF(log_start);
- log_start++;
- raw_spin_unlock_irq(&logbuf_lock);
- error = __put_user(c,buf);
- buf++;
- i++;
- cond_resched();
- raw_spin_lock_irq(&logbuf_lock);
- }
- raw_spin_unlock_irq(&logbuf_lock);
- if (!error)
- error = i;
- break;
- /* Read/clear last kernel messages */
- case SYSLOG_ACTION_READ_CLEAR:
- do_clear = 1;
- /* FALL THRU */
- /* Read last kernel messages */
- case SYSLOG_ACTION_READ_ALL:
- error = -EINVAL;
- if (!buf || len < 0)
- goto out;
- error = 0;
- if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
- count = len;
- if (count > log_buf_len)
- count = log_buf_len;
- raw_spin_lock_irq(&logbuf_lock);
- if (count > logged_chars)
- count = logged_chars;
- if (do_clear)
- logged_chars = 0;
- limit = log_end;
- /*
- * __put_user() could sleep, and while we sleep
- * printk() could overwrite the messages
- * we try to copy to user space. Therefore
- * the messages are copied in reverse. <manfreds>
- */
- for (i = 0; i < count && !error; i++) {
- j = limit-1-i;
- if (j + log_buf_len < log_end)
- break;
- c = LOG_BUF(j);
- raw_spin_unlock_irq(&logbuf_lock);
- error = __put_user(c,&buf[count-1-i]);
- cond_resched();
- raw_spin_lock_irq(&logbuf_lock);
- }
- raw_spin_unlock_irq(&logbuf_lock);
- if (error)
- break;
- error = i;
- if (i != count) {
- int offset = count-error;
- /* buffer overflow during copy, correct user buffer. */
- for (i = 0; i < error; i++) {
- if (__get_user(c,&buf[i+offset]) ||
- __put_user(c,&buf[i])) {
- error = -EFAULT;
- break;
- }
- cond_resched();
- }
- }
- break;
- /* Clear ring buffer */
- case SYSLOG_ACTION_CLEAR:
- logged_chars = 0;
- break;
- /* Disable logging to console */
- case SYSLOG_ACTION_CONSOLE_OFF:
- if (saved_console_loglevel == -1)
- saved_console_loglevel = console_loglevel;
- console_loglevel = minimum_console_loglevel;
- break;
- /* Enable logging to console */
- case SYSLOG_ACTION_CONSOLE_ON:
- if (saved_console_loglevel != -1) {
- console_loglevel = saved_console_loglevel;
- saved_console_loglevel = -1;
- }
- break;
- /* Set level of messages printed to console */
- case SYSLOG_ACTION_CONSOLE_LEVEL:
- error = -EINVAL;
- if (len < 1 || len > 8)
- goto out;
- if (len < minimum_console_loglevel)
- len = minimum_console_loglevel;
- console_loglevel = len;
- /* Implicitly re-enable logging to console */
- saved_console_loglevel = -1;
- error = 0;
- break;
- /* Number of chars in the log buffer */
- case SYSLOG_ACTION_SIZE_UNREAD:
- error = log_end - log_start;
- break;
- /* Size of the log buffer */
- case SYSLOG_ACTION_SIZE_BUFFER:
- error = log_buf_len;
- break;
- default:
- error = -EINVAL;
- break;
- }
-out:
- return error;
-}
-
-SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
-{
- return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
-}
-
-#ifdef CONFIG_KGDB_KDB
-/* kdb dmesg command needs access to the syslog buffer. do_syslog()
- * uses locks so it cannot be used during debugging. Just tell kdb
- * where the start and end of the physical and logical logs are. This
- * is equivalent to do_syslog(3).
- */
-void kdb_syslog_data(char *syslog_data[4])
-{
- syslog_data[0] = log_buf;
- syslog_data[1] = log_buf + log_buf_len;
- syslog_data[2] = log_buf + log_end -
- (logged_chars < log_buf_len ? logged_chars : log_buf_len);
- syslog_data[3] = log_buf + log_end;
-}
-#endif /* CONFIG_KGDB_KDB */
-
-/*
- * Call the console drivers on a range of log_buf
- */
-static void __call_console_drivers(unsigned start, unsigned end)
-{
- struct console *con;
-
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if ((con->flags & CON_ENABLED) && con->write &&
- (cpu_online(smp_processor_id()) ||
- (con->flags & CON_ANYTIME)))
- con->write(con, &LOG_BUF(start), end - start);
- }
-}
-
-static bool __read_mostly ignore_loglevel;
-
-static int __init ignore_loglevel_setup(char *str)
-{
- ignore_loglevel = 1;
- printk(KERN_INFO "debug: ignoring loglevel setting.\n");
-
- return 0;
-}
-
-early_param("ignore_loglevel", ignore_loglevel_setup);
-module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
- "print all kernel messages to the console.");
-
-/*
- * Write out chars from start to end - 1 inclusive
- */
-static void _call_console_drivers(unsigned start,
- unsigned end, int msg_log_level)
-{
- if ((msg_log_level < console_loglevel || ignore_loglevel) &&
- console_drivers && start != end) {
- if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
- /* wrapped write */
- __call_console_drivers(start & LOG_BUF_MASK,
- log_buf_len);
- __call_console_drivers(0, end & LOG_BUF_MASK);
- } else {
- __call_console_drivers(start, end);
- }
- }
-}
-
-/*
- * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
- * lower 3 bit are the log level, the rest are the log facility. In case
- * userspace passes usual userspace syslog messages to /dev/kmsg or
- * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
- * to extract the correct log level for in-kernel processing, and not mangle
- * the original value.
- *
- * If a prefix is found, the length of the prefix is returned. If 'level' is
- * passed, it will be filled in with the log level without a possible facility
- * value. If 'special' is passed, the special printk prefix chars are accepted
- * and returned. If no valid header is found, 0 is returned and the passed
- * variables are not touched.
- */
-static size_t log_prefix(const char *p, unsigned int *level, char *special)
-{
- unsigned int lev = 0;
- char sp = '\0';
- size_t len;
-
- if (p[0] != '<' || !p[1])
- return 0;
- if (p[2] == '>') {
- /* usual single digit level number or special char */
- switch (p[1]) {
- case '0' ... '7':
- lev = p[1] - '0';
- break;
- case 'c': /* KERN_CONT */
- case 'd': /* KERN_DEFAULT */
- sp = p[1];
- break;
- default:
- return 0;
- }
- len = 3;
- } else {
- /* multi digit including the level and facility number */
- char *endp = NULL;
-
- lev = (simple_strtoul(&p[1], &endp, 10) & 7);
- if (endp == NULL || endp[0] != '>')
- return 0;
- len = (endp + 1) - p;
- }
-
- /* do not accept special char if not asked for */
- if (sp && !special)
- return 0;
-
- if (special) {
- *special = sp;
- /* return special char, do not touch level */
- if (sp)
- return len;
- }
-
- if (level)
- *level = lev;
- return len;
-}
-
-/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
- */
-static void call_console_drivers(unsigned start, unsigned end)
-{
- unsigned cur_index, start_print;
- static int msg_level = -1;
-
- BUG_ON(((int)(start - end)) > 0);
-
- cur_index = start;
- start_print = start;
- while (cur_index != end) {
- if (msg_level < 0 && ((end - cur_index) > 2)) {
- /* strip log prefix */
- cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
- start_print = cur_index;
- }
- while (cur_index != end) {
- char c = LOG_BUF(cur_index);
-
- cur_index++;
- if (c == '\n') {
- if (msg_level < 0) {
- /*
- * printk() has already given us loglevel tags in
- * the buffer. This code is here in case the
- * log buffer has wrapped right round and scribbled
- * on those tags
- */
- msg_level = default_message_loglevel;
- }
- _call_console_drivers(start_print, cur_index, msg_level);
- msg_level = -1;
- start_print = cur_index;
- break;
- }
- }
- }
- _call_console_drivers(start_print, end, msg_level);
-}
-
-static void emit_log_char(char c)
-{
- LOG_BUF(log_end) = c;
- log_end++;
- if (log_end - log_start > log_buf_len)
- log_start = log_end - log_buf_len;
- if (log_end - con_start > log_buf_len)
- con_start = log_end - log_buf_len;
- if (logged_chars < log_buf_len)
- logged_chars++;
-}
-
-/*
- * Zap console related locks when oopsing. Only zap at most once
- * every 10 seconds, to leave time for slow consoles to print a
- * full oops.
- */
-static void zap_locks(void)
-{
- static unsigned long oops_timestamp;
-
- if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
- return;
-
- oops_timestamp = jiffies;
-
- debug_locks_off();
- /* If a crash is occurring, make sure we can't deadlock */
- raw_spin_lock_init(&logbuf_lock);
- /* And make sure that we print immediately */
- sema_init(&console_sem, 1);
-}
-
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time = 0;
-#endif
-module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
-
-/* Check if we have any console registered that can be called early in boot. */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if (con->flags & CON_ANYTIME)
- return 1;
-
- return 0;
-}
-
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the output and
- * call the console drivers. If we fail to get the semaphore we place the output
- * into the log buffer and return. The current holder of the console_sem will
- * notice the new output in console_unlock(); and will send it to the
- * consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-
-asmlinkage int printk(const char *fmt, ...)
-{
- va_list args;
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
- va_start(args, fmt);
- r = vprintk(fmt, args);
- va_end(args);
-
- return r;
-}
-
-/* cpu currently holding logbuf_lock */
-static volatile unsigned int printk_cpu = UINT_MAX;
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
- */
-static inline int can_use_console(unsigned int cpu)
-{
- return cpu_online(cpu) || have_callable_console();
-}
-
-/*
- * Try to get console ownership to actually show the kernel
- * messages from a 'printk'. Return true (and with the
- * console_lock held, and 'console_locked' set) if it
- * is successful, false otherwise.
- *
- * This gets called with the 'logbuf_lock' spinlock held and
- * interrupts disabled. It should return with 'lockbuf_lock'
- * released but interrupts still disabled.
- */
-static int console_trylock_for_printk(unsigned int cpu)
- __releases(&logbuf_lock)
-{
- int retval = 0, wake = 0;
-
- if (console_trylock()) {
- retval = 1;
-
- /*
- * If we can't use the console, we need to release
- * the console semaphore by hand to avoid flushing
- * the buffer. We need to hold the console semaphore
- * in order to do this test safely.
- */
- if (!can_use_console(cpu)) {
- console_locked = 0;
- wake = 1;
- retval = 0;
- }
- }
- printk_cpu = UINT_MAX;
- if (wake)
- up(&console_sem);
- raw_spin_unlock(&logbuf_lock);
- return retval;
-}
-static const char recursion_bug_msg [] =
- KERN_CRIT "BUG: recent printk recursion!\n";
-static int recursion_bug;
-static int new_text_line = 1;
-static char printk_buf[1024];
-
-int printk_delay_msec __read_mostly;
-
-static inline void printk_delay(void)
-{
- if (unlikely(printk_delay_msec)) {
- int m = printk_delay_msec;
-
- while (m--) {
- mdelay(1);
- touch_nmi_watchdog();
- }
- }
-}
-
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- int printed_len = 0;
- int current_log_level = default_message_loglevel;
- unsigned long flags;
- int this_cpu;
- char *p;
- size_t plen;
- char special;
-
- boot_delay_msec();
- printk_delay();
-
- /* This stops the holder of console_sem just where we want him */
- local_irq_save(flags);
- this_cpu = smp_processor_id();
-
- /*
- * Ouch, printk recursed into itself!
- */
- if (unlikely(printk_cpu == this_cpu)) {
- /*
- * If a crash is occurring during printk() on this CPU,
- * then try to get the crash message out but make sure
- * we can't deadlock. Otherwise just return to avoid the
- * recursion and return - but flag the recursion so that
- * it can be printed at the next appropriate moment:
- */
- if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = 1;
- goto out_restore_irqs;
- }
- zap_locks();
- }
-
- lockdep_off();
- raw_spin_lock(&logbuf_lock);
- printk_cpu = this_cpu;
-
- if (recursion_bug) {
- recursion_bug = 0;
- strcpy(printk_buf, recursion_bug_msg);
- printed_len = strlen(recursion_bug_msg);
- }
- /* Emit the output into the temporary buffer */
- printed_len += vscnprintf(printk_buf + printed_len,
- sizeof(printk_buf) - printed_len, fmt, args);
-
- p = printk_buf;
-
- /* Read log level and handle special printk prefix */
- plen = log_prefix(p, &current_log_level, &special);
- if (plen) {
- p += plen;
-
- switch (special) {
- case 'c': /* Strip <c> KERN_CONT, continue line */
- plen = 0;
- break;
- case 'd': /* Strip <d> KERN_DEFAULT, start new line */
- plen = 0;
- default:
- if (!new_text_line) {
- emit_log_char('\n');
- new_text_line = 1;
- }
- }
- }
-
- /*
- * Copy the output into log_buf. If the caller didn't provide
- * the appropriate log prefix, we insert them here
- */
- for (; *p; p++) {
- if (new_text_line) {
- new_text_line = 0;
-
- if (plen) {
- /* Copy original log prefix */
- int i;
-
- for (i = 0; i < plen; i++)
- emit_log_char(printk_buf[i]);
- printed_len += plen;
- } else {
- /* Add log prefix */
- emit_log_char('<');
- emit_log_char(current_log_level + '0');
- emit_log_char('>');
- printed_len += 3;
- }
-
- if (printk_time) {
- /* Add the current time stamp */
- char tbuf[50], *tp;
- unsigned tlen;
- unsigned long long t;
- unsigned long nanosec_rem;
-
- t = cpu_clock(printk_cpu);
- nanosec_rem = do_div(t, 1000000000);
- tlen = sprintf(tbuf, "[%5lu.%06lu] ",
- (unsigned long) t,
- nanosec_rem / 1000);
-
- for (tp = tbuf; tp < tbuf + tlen; tp++)
- emit_log_char(*tp);
- printed_len += tlen;
- }
-
- if (!*p)
- break;
- }
-
- emit_log_char(*p);
- if (*p == '\n')
- new_text_line = 1;
- }
-
- /*
- * Try to acquire and then immediately release the
- * console semaphore. The release will do all the
- * actual magic (print out buffers, wake up klogd,
- * etc).
- *
- * The console_trylock_for_printk() function
- * will release 'logbuf_lock' regardless of whether it
- * actually gets the semaphore or not.
- */
- if (console_trylock_for_printk(this_cpu))
- console_unlock();
-
- lockdep_on();
-out_restore_irqs:
- local_irq_restore(flags);
-
- return printed_len;
-}
-EXPORT_SYMBOL(printk);
-EXPORT_SYMBOL(vprintk);
-
-#else
-
-static void call_console_drivers(unsigned start, unsigned end)
-{
-}
-
-#endif
-
-static int __add_preferred_console(char *name, int idx, char *options,
- char *brl_options)
-{
- struct console_cmdline *c;
- int i;
-
- /*
- * See if this tty is not yet registered, and
- * if we have a slot free.
- */
- for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
- if (strcmp(console_cmdline[i].name, name) == 0 &&
- console_cmdline[i].index == idx) {
- if (!brl_options)
- selected_console = i;
- return 0;
- }
- if (i == MAX_CMDLINECONSOLES)
- return -E2BIG;
- if (!brl_options)
- selected_console = i;
- c = &console_cmdline[i];
- strlcpy(c->name, name, sizeof(c->name));
- c->options = options;
-#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
- c->brl_options = brl_options;
-#endif
- c->index = idx;
- return 0;
-}
-/*
- * Set up a list of consoles. Called from init/main.c
- */
-static int __init console_setup(char *str)
-{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
- char *s, *options, *brl_options = NULL;
- int idx;
-
-#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
- if (!memcmp(str, "brl,", 4)) {
- brl_options = "";
- str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
- brl_options = str + 4;
- str = strchr(brl_options, ',');
- if (!str) {
- printk(KERN_ERR "need port name after brl=\n");
- return 1;
- }
- *(str++) = 0;
- }
-#endif
-
- /*
- * Decode str into name, index, options.
- */
- if (str[0] >= '0' && str[0] <= '9') {
- strcpy(buf, "ttyS");
- strncpy(buf + 4, str, sizeof(buf) - 5);
- } else {
- strncpy(buf, str, sizeof(buf) - 1);
- }
- buf[sizeof(buf) - 1] = 0;
- if ((options = strchr(str, ',')) != NULL)
- *(options++) = 0;
-#ifdef __sparc__
- if (!strcmp(str, "ttya"))
- strcpy(buf, "ttyS0");
- if (!strcmp(str, "ttyb"))
- strcpy(buf, "ttyS1");
-#endif
- for (s = buf; *s; s++)
- if ((*s >= '0' && *s <= '9') || *s == ',')
- break;
- idx = simple_strtoul(s, NULL, 10);
- *s = 0;
-
- __add_preferred_console(buf, idx, options, brl_options);
- console_set_on_cmdline = 1;
- return 1;
-}
-__setup("console=", console_setup);
-
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- * @name: device name
- * @idx: device index
- * @options: options for this console
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init. Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int add_preferred_console(char *name, int idx, char *options)
-{
- return __add_preferred_console(name, idx, options, NULL);
-}
-
-int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
-{
- struct console_cmdline *c;
- int i;
-
- for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
- if (strcmp(console_cmdline[i].name, name) == 0 &&
- console_cmdline[i].index == idx) {
- c = &console_cmdline[i];
- strlcpy(c->name, name_new, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
- c->options = options;
- c->index = idx_new;
- return i;
- }
- /* not found */
- return -1;
-}
-
-bool console_suspend_enabled = 1;
-EXPORT_SYMBOL(console_suspend_enabled);
-
-static int __init console_suspend_disable(char *str)
-{
- console_suspend_enabled = 0;
- return 1;
-}
-__setup("no_console_suspend", console_suspend_disable);
-module_param_named(console_suspend, console_suspend_enabled,
- bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
- " and hibernate operations");
-
-/**
- * suspend_console - suspend the console subsystem
- *
- * This disables printk() while we go into suspend states
- */
-void suspend_console(void)
-{
- if (!console_suspend_enabled)
- return;
- printk("Suspending console(s) (use no_console_suspend to debug)\n");
- console_lock();
- console_suspended = 1;
- up(&console_sem);
-}
-
-void resume_console(void)
-{
- if (!console_suspend_enabled)
- return;
- down(&console_sem);
- console_suspended = 0;
- console_unlock();
-}
-
-/**
- * console_cpu_notify - print deferred console messages after CPU hotplug
- * @self: notifier struct
- * @action: CPU hotplug event
- * @hcpu: unused
- *
- * If printk() is called from a CPU that is not online yet, the messages
- * will be spooled but will not show up on the console. This function is
- * called when a new CPU comes online (or fails to come up), and ensures
- * that any such output gets printed.
- */
-static int __cpuinit console_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- switch (action) {
- case CPU_ONLINE:
- case CPU_DEAD:
- case CPU_DYING:
- case CPU_DOWN_FAILED:
- case CPU_UP_CANCELED:
- console_lock();
- console_unlock();
- }
- return NOTIFY_OK;
-}
-
-/**
- * console_lock - lock the console system for exclusive use.
- *
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
- *
- * Can sleep, returns nothing.
- */
-void console_lock(void)
-{
- BUG_ON(in_interrupt());
- down(&console_sem);
- if (console_suspended)
- return;
- console_locked = 1;
- console_may_schedule = 1;
-}
-EXPORT_SYMBOL(console_lock);
-
-/**
- * console_trylock - try to lock the console system for exclusive use.
- *
- * Tried to acquire a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
- *
- * returns 1 on success, and 0 on failure to acquire the lock.
- */
-int console_trylock(void)
-{
- if (down_trylock(&console_sem))
- return 0;
- if (console_suspended) {
- up(&console_sem);
- return 0;
- }
- console_locked = 1;
- console_may_schedule = 0;
- return 1;
-}
-EXPORT_SYMBOL(console_trylock);
-
-int is_console_locked(void)
-{
- return console_locked;
-}
-
-static DEFINE_PER_CPU(int, printk_pending);
-
-void printk_tick(void)
-{
- if (__this_cpu_read(printk_pending)) {
- __this_cpu_write(printk_pending, 0);
- wake_up_interruptible(&log_wait);
- }
-}
-
-int printk_needs_cpu(int cpu)
-{
- if (cpu_is_offline(cpu))
- printk_tick();
- return __this_cpu_read(printk_pending);
-}
-
-void wake_up_klogd(void)
-{
- if (waitqueue_active(&log_wait))
- this_cpu_write(printk_pending, 1);
-}
-
-/**
- * console_unlock - unlock the console system
- *
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
- *
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * If there is output waiting for klogd, we wake it up.
- *
- * console_unlock(); may be called from any context.
- */
-void console_unlock(void)
-{
- unsigned long flags;
- unsigned _con_start, _log_end;
- unsigned wake_klogd = 0, retry = 0;
-
- if (console_suspended) {
- up(&console_sem);
- return;
- }
-
- console_may_schedule = 0;
-
-again:
- for ( ; ; ) {
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- wake_klogd |= log_start - log_end;
- if (con_start == log_end)
- break; /* Nothing to print */
- _con_start = con_start;
- _log_end = log_end;
- con_start = log_end; /* Flush */
- raw_spin_unlock(&logbuf_lock);
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(_con_start, _log_end);
- start_critical_timings();
- local_irq_restore(flags);
- }
- console_locked = 0;
-
- /* Release the exclusive_console once it is used */
- if (unlikely(exclusive_console))
- exclusive_console = NULL;
-
- raw_spin_unlock(&logbuf_lock);
-
- up(&console_sem);
-
- /*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
- */
- raw_spin_lock(&logbuf_lock);
- if (con_start != log_end)
- retry = 1;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
- if (retry && console_trylock())
- goto again;
-
- if (wake_klogd)
- wake_up_klogd();
-}
-EXPORT_SYMBOL(console_unlock);
-
-/**
- * console_conditional_schedule - yield the CPU if required
- *
- * If the console code is currently allowed to sleep, and
- * if this CPU should yield the CPU to another task, do
- * so here.
- *
- * Must be called within console_lock();.
- */
-void __sched console_conditional_schedule(void)
-{
- if (console_may_schedule)
- cond_resched();
-}
-EXPORT_SYMBOL(console_conditional_schedule);
-
-void console_unblank(void)
-{
- struct console *c;
-
- /*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
- */
- if (oops_in_progress) {
- if (down_trylock(&console_sem) != 0)
- return;
- } else
- console_lock();
-
- console_locked = 1;
- console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
- c->unblank();
- console_unlock();
-}
-
-/*
- * Return the console tty driver structure and its associated index
- */
-struct tty_driver *console_device(int *index)
-{
- struct console *c;
- struct tty_driver *driver = NULL;
-
- console_lock();
- for_each_console(c) {
- if (!c->device)
- continue;
- driver = c->device(c, index);
- if (driver)
- break;
- }
- console_unlock();
- return driver;
-}
-
-/*
- * Prevent further output on the passed console device so that (for example)
- * serial drivers can disable console output before suspending a port, and can
- * re-enable output afterwards.
- */
-void console_stop(struct console *console)
-{
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
-}
-EXPORT_SYMBOL(console_stop);
-
-void console_start(struct console *console)
-{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
-}
-EXPORT_SYMBOL(console_start);
-
-static int __read_mostly keep_bootcon;
-
-static int __init keep_bootcon_setup(char *str)
-{
- keep_bootcon = 1;
- printk(KERN_INFO "debug: skip boot console de-registration.\n");
-
- return 0;
-}
-
-early_param("keep_bootcon", keep_bootcon_setup);
-
-/*
- * The console driver calls this routine during kernel initialization
- * to register the console printing procedure with printk() and to
- * print any messages that were printed by the kernel before the
- * console driver was initialized.
- *
- * This can happen pretty early during the boot process (because of
- * early_printk) - sometimes before setup_arch() completes - be careful
- * of what kernel features are used - they may not be initialised yet.
- *
- * There are two types of consoles - bootconsoles (early_printk) and
- * "real" consoles (everything which is not a bootconsole) which are
- * handled differently.
- * - Any number of bootconsoles can be registered at any time.
- * - As soon as a "real" console is registered, all bootconsoles
- * will be unregistered automatically.
- * - Once a "real" console is registered, any attempt to register a
- * bootconsoles will be rejected
- */
-void register_console(struct console *newcon)
-{
- int i;
- unsigned long flags;
- struct console *bcon = NULL;
-
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (console_drivers && newcon->flags & CON_BOOT) {
- /* find the last or real console */
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- printk(KERN_INFO "Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
- }
- }
-
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
-
- if (preferred_console < 0 || bcon || !console_drivers)
- preferred_console = selected_console;
-
- if (newcon->early_setup)
- newcon->early_setup();
-
- /*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
- */
- if (preferred_console < 0) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- preferred_console = 0;
- }
- }
- }
-
- /*
- * See if this console matches one we selected on
- * the command line.
- */
- for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
- i++) {
- if (strcmp(console_cmdline[i].name, newcon->name) != 0)
- continue;
- if (newcon->index >= 0 &&
- newcon->index != console_cmdline[i].index)
- continue;
- if (newcon->index < 0)
- newcon->index = console_cmdline[i].index;
-#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
- if (console_cmdline[i].brl_options) {
- newcon->flags |= CON_BRL;
- braille_register_console(newcon,
- console_cmdline[i].index,
- console_cmdline[i].options,
- console_cmdline[i].brl_options);
- return;
- }
-#endif
- if (newcon->setup &&
- newcon->setup(newcon, console_cmdline[i].options) != 0)
- break;
- newcon->flags |= CON_ENABLED;
- newcon->index = console_cmdline[i].index;
- if (i == selected_console) {
- newcon->flags |= CON_CONSDEV;
- preferred_console = selected_console;
- }
- break;
- }
-
- if (!(newcon->flags & CON_ENABLED))
- return;
-
- /*
- * If we have a bootconsole, and are switching to a real console,
- * don't print everything out again, since when the boot console, and
- * the real console are the same physical device, it's annoying to
- * see the beginning boot messages twice
- */
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
- newcon->flags &= ~CON_PRINTBUFFER;
-
- /*
- * Put this console in the list - keep the
- * preferred driver at the head of the list.
- */
- console_lock();
- if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
- newcon->next = console_drivers;
- console_drivers = newcon;
- if (newcon->next)
- newcon->next->flags &= ~CON_CONSDEV;
- } else {
- newcon->next = console_drivers->next;
- console_drivers->next = newcon;
- }
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- con_start = log_start;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- /*
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- */
- exclusive_console = newcon;
- }
- console_unlock();
- console_sysfs_notify();
-
- /*
- * By unregistering the bootconsoles after we enable the real console
- * we get the "console xxx enabled" message on all the consoles -
- * boot consoles, real consoles, etc - this is to ensure that end
- * users know there might be something in the kernel's log buffer that
- * went to the bootconsole (that they do not see on the real console)
- */
- if (bcon &&
- ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
- !keep_bootcon) {
- /* we need to iterate through twice, to make sure we print
- * everything out, before we unregister the console(s)
- */
- printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
- newcon->name, newcon->index);
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
- } else {
- printk(KERN_INFO "%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
- }
-}
-EXPORT_SYMBOL(register_console);
-
-int unregister_console(struct console *console)
-{
- struct console *a, *b;
- int res = 1;
-
-#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
- if (console->flags & CON_BRL)
- return braille_unregister_console(console);
-#endif
-
- console_lock();
- if (console_drivers == console) {
- console_drivers=console->next;
- res = 0;
- } else if (console_drivers) {
- for (a=console_drivers->next, b=console_drivers ;
- a; b=a, a=b->next) {
- if (a == console) {
- b->next = a->next;
- res = 0;
- break;
- }
- }
- }
-
- /*
- * If this isn't the last console and it has CON_CONSDEV set, we
- * need to set it on the next preferred console.
- */
- if (console_drivers != NULL && console->flags & CON_CONSDEV)
- console_drivers->flags |= CON_CONSDEV;
-
- console_unlock();
- console_sysfs_notify();
- return res;
-}
-EXPORT_SYMBOL(unregister_console);
-
-static int __init printk_late_init(void)
-{
- struct console *con;
-
- for_each_console(con) {
- if (!keep_bootcon && con->flags & CON_BOOT) {
- printk(KERN_INFO "turn off boot console %s%d\n",
- con->name, con->index);
- unregister_console(con);
- }
- }
- hotcpu_notifier(console_cpu_notify, 0);
- return 0;
-}
-late_initcall(printk_late_init);
-
-#if defined CONFIG_PRINTK
-
-/*
- * printk rate limiting, lifted from the networking subsystem.
- *
- * This enforces a rate limit: not more than 10 kernel messages
- * every 5s to make a denial-of-service attack impossible.
- */
-DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
-
-int __printk_ratelimit(const char *func)
-{
- return ___ratelimit(&printk_ratelimit_state, func);
-}
-EXPORT_SYMBOL(__printk_ratelimit);
-
-/**
- * printk_timed_ratelimit - caller-controlled printk ratelimiting
- * @caller_jiffies: pointer to caller's state
- * @interval_msecs: minimum interval between prints
- *
- * printk_timed_ratelimit() returns true if more than @interval_msecs
- * milliseconds have elapsed since the last time printk_timed_ratelimit()
- * returned true.
- */
-bool printk_timed_ratelimit(unsigned long *caller_jiffies,
- unsigned int interval_msecs)
-{
- if (*caller_jiffies == 0
- || !time_in_range(jiffies, *caller_jiffies,
- *caller_jiffies
- + msecs_to_jiffies(interval_msecs))) {
- *caller_jiffies = jiffies;
- return true;
- }
- return false;
-}
-EXPORT_SYMBOL(printk_timed_ratelimit);
-
-static DEFINE_SPINLOCK(dump_list_lock);
-static LIST_HEAD(dump_list);
-
-/**
- * kmsg_dump_register - register a kernel log dumper.
- * @dumper: pointer to the kmsg_dumper structure
- *
- * Adds a kernel log dumper to the system. The dump callback in the
- * structure will be called when the kernel oopses or panics and must be
- * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
- */
-int kmsg_dump_register(struct kmsg_dumper *dumper)
-{
- unsigned long flags;
- int err = -EBUSY;
-
- /* The dump callback needs to be set */
- if (!dumper->dump)
- return -EINVAL;
-
- spin_lock_irqsave(&dump_list_lock, flags);
- /* Don't allow registering multiple times */
- if (!dumper->registered) {
- dumper->registered = 1;
- list_add_tail_rcu(&dumper->list, &dump_list);
- err = 0;
- }
- spin_unlock_irqrestore(&dump_list_lock, flags);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(kmsg_dump_register);
-
-/**
- * kmsg_dump_unregister - unregister a kmsg dumper.
- * @dumper: pointer to the kmsg_dumper structure
- *
- * Removes a dump device from the system. Returns zero on success and
- * %-EINVAL otherwise.
- */
-int kmsg_dump_unregister(struct kmsg_dumper *dumper)
-{
- unsigned long flags;
- int err = -EINVAL;
-
- spin_lock_irqsave(&dump_list_lock, flags);
- if (dumper->registered) {
- dumper->registered = 0;
- list_del_rcu(&dumper->list);
- err = 0;
- }
- spin_unlock_irqrestore(&dump_list_lock, flags);
- synchronize_rcu();
-
- return err;
-}
-EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
-
-/**
- * kmsg_dump - dump kernel log to kernel message dumpers.
- * @reason: the reason (oops, panic etc) for dumping
- *
- * Iterate through each of the dump devices and call the oops/panic
- * callbacks with the log buffer.
- */
-void kmsg_dump(enum kmsg_dump_reason reason)
-{
- unsigned long end;
- unsigned chars;
- struct kmsg_dumper *dumper;
- const char *s1, *s2;
- unsigned long l1, l2;
- unsigned long flags;
-
- /* Theoretically, the log could move on after we do this, but
- there's not a lot we can do about that. The new messages
- will overwrite the start of what we dump. */
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- end = log_end & LOG_BUF_MASK;
- chars = logged_chars;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
- if (chars > end) {
- s1 = log_buf + log_buf_len - chars + end;
- l1 = chars - end;
-
- s2 = log_buf;
- l2 = end;
- } else {
- s1 = "";
- l1 = 0;
-
- s2 = log_buf + end - chars;
- l2 = chars;
- }
-
- rcu_read_lock();
- list_for_each_entry_rcu(dumper, &dump_list, list)
- dumper->dump(dumper, reason, s1, l1, s2, l2);
- rcu_read_unlock();
-}
-#endif
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
new file mode 100644
index 00000000000..85405bdcf2b
--- /dev/null
+++ b/kernel/printk/Makefile
@@ -0,0 +1,2 @@
+obj-y = printk.o
+obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
new file mode 100644
index 00000000000..276762f3a46
--- /dev/null
+++ b/kernel/printk/braille.c
@@ -0,0 +1,49 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/string.h>
+
+#include "console_cmdline.h"
+#include "braille.h"
+
+char *_braille_console_setup(char **str, char **brl_options)
+{
+ if (!memcmp(*str, "brl,", 4)) {
+ *brl_options = "";
+ *str += 4;
+ } else if (!memcmp(str, "brl=", 4)) {
+ *brl_options = *str + 4;
+ *str = strchr(*brl_options, ',');
+ if (!*str)
+ pr_err("need port name after brl=\n");
+ else
+ *((*str)++) = 0;
+ } else
+ return NULL;
+
+ return *str;
+}
+
+int
+_braille_register_console(struct console *console, struct console_cmdline *c)
+{
+ int rtn = 0;
+
+ if (c->brl_options) {
+ console->flags |= CON_BRL;
+ rtn = braille_register_console(console, c->index, c->options,
+ c->brl_options);
+ }
+
+ return rtn;
+}
+
+int
+_braille_unregister_console(struct console *console)
+{
+ if (console->flags & CON_BRL)
+ return braille_unregister_console(console);
+
+ return 0;
+}
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
new file mode 100644
index 00000000000..769d771145c
--- /dev/null
+++ b/kernel/printk/braille.h
@@ -0,0 +1,48 @@
+#ifndef _PRINTK_BRAILLE_H
+#define _PRINTK_BRAILLE_H
+
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+
+static inline void
+braille_set_options(struct console_cmdline *c, char *brl_options)
+{
+ c->brl_options = brl_options;
+}
+
+char *
+_braille_console_setup(char **str, char **brl_options);
+
+int
+_braille_register_console(struct console *console, struct console_cmdline *c);
+
+int
+_braille_unregister_console(struct console *console);
+
+#else
+
+static inline void
+braille_set_options(struct console_cmdline *c, char *brl_options)
+{
+}
+
+static inline char *
+_braille_console_setup(char **str, char **brl_options)
+{
+ return NULL;
+}
+
+static inline int
+_braille_register_console(struct console *console, struct console_cmdline *c)
+{
+ return 0;
+}
+
+static inline int
+_braille_unregister_console(struct console *console)
+{
+ return 0;
+}
+
+#endif
+
+#endif
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
new file mode 100644
index 00000000000..cbd69d84234
--- /dev/null
+++ b/kernel/printk/console_cmdline.h
@@ -0,0 +1,14 @@
+#ifndef _CONSOLE_CMDLINE_H
+#define _CONSOLE_CMDLINE_H
+
+struct console_cmdline
+{
+ char name[8]; /* Name of the driver */
+ int index; /* Minor dev. to use */
+ char *options; /* Options for the driver */
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ char *brl_options; /* Options for braille driver */
+#endif
+};
+
+#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
new file mode 100644
index 00000000000..13e839dbca0
--- /dev/null
+++ b/kernel/printk/printk.c
@@ -0,0 +1,3009 @@
+/*
+ * linux/kernel/printk.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Modified to make sys_syslog() more flexible: added commands to
+ * return the last 4k of kernel messages, regardless of whether
+ * they've been read or not. Added option to suppress kernel printk's
+ * to the console. Added hook for sending the console messages
+ * elsewhere, in preparation for a serial line console (someday).
+ * Ted Ts'o, 2/11/93.
+ * Modified for sysctl support, 1/8/97, Chris Horn.
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
+ * manfred@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ * 01Mar01 Andrew Morton
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h> /* For in_interrupt() */
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/aio.h>
+#include <linux/syscalls.h>
+#include <linux/kexec.h>
+#include <linux/kdb.h>
+#include <linux/ratelimit.h>
+#include <linux/kmsg_dump.h>
+#include <linux/syslog.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/poll.h>
+#include <linux/irq_work.h>
+#include <linux/utsname.h>
+
+#include <asm/uaccess.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/printk.h>
+
+#include "console_cmdline.h"
+#include "braille.h"
+
+int console_printk[4] = {
+ CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
+ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
+ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
+ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
+};
+
+/* Deferred messaged from sched code are marked by this special level */
+#define SCHED_MESSAGE_LOGLEVEL -2
+
+/*
+ * Low level drivers may need that to know if they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+int oops_in_progress;
+EXPORT_SYMBOL(oops_in_progress);
+
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DEFINE_SEMAPHORE(console_sem);
+struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_lock_dep_map = {
+ .name = "console_lock"
+};
+#endif
+
+/*
+ * Helper macros to handle lockdep when locking/unlocking console_sem. We use
+ * macros instead of functions so that _RET_IP_ contains useful information.
+ */
+#define down_console_sem() do { \
+ down(&console_sem);\
+ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
+} while (0)
+
+static int __down_trylock_console_sem(unsigned long ip)
+{
+ if (down_trylock(&console_sem))
+ return 1;
+ mutex_acquire(&console_lock_dep_map, 0, 1, ip);
+ return 0;
+}
+#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
+
+#define up_console_sem() do { \
+ mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
+ up(&console_sem);\
+} while (0)
+
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track if we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it are racing, but it helps tracking those weird code
+ * path in the console code where we end up in places I want
+ * locked without the console sempahore held
+ */
+static int console_locked, console_suspended;
+
+/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
+ * Array of consoles built from command line options (console=)
+ */
+
+#define MAX_CMDLINECONSOLES 8
+
+static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+
+static int selected_console = -1;
+static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
+
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
+/*
+ * The printk log buffer consists of a chain of concatenated variable
+ * length records. Every record starts with a record header, containing
+ * the overall length of the record.
+ *
+ * The heads to the first and last entry in the buffer, as well as the
+ * sequence numbers of these both entries are maintained when messages
+ * are stored..
+ *
+ * If the heads indicate available messages, the length in the header
+ * tells the start next message. A length == 0 for the next message
+ * indicates a wrap-around to the beginning of the buffer.
+ *
+ * Every record carries the monotonic timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual
+ * kernel messages use LOG_KERN; userspace-injected messages always carry
+ * a matching syslog facility, by default LOG_USER. The origin of every
+ * message can be reliably determined that way.
+ *
+ * The human readable log message directly follows the message header. The
+ * length of the message text is stored in the header, the stored message
+ * is not terminated.
+ *
+ * Optionally, a message can carry a dictionary of properties (key/value pairs),
+ * to provide userspace with a machine-readable message context.
+ *
+ * Examples for well-defined, commonly used property names are:
+ * DEVICE=b12:8 device identifier
+ * b12:8 block dev_t
+ * c127:3 char dev_t
+ * n8 netdev ifindex
+ * +sound:card0 subsystem:devname
+ * SUBSYSTEM=pci driver-core subsystem name
+ *
+ * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
+ * follows directly after a '=' character. Every property is terminated by
+ * a '\0' character. The last property is not terminated.
+ *
+ * Example of a message structure:
+ * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
+ * 0008 34 00 record is 52 bytes long
+ * 000a 0b 00 text is 11 bytes long
+ * 000c 1f 00 dictionary is 23 bytes long
+ * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
+ * 0010 69 74 27 73 20 61 20 6c "it's a l"
+ * 69 6e 65 "ine"
+ * 001b 44 45 56 49 43 "DEVIC"
+ * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
+ * 52 49 56 45 52 3d 62 75 "RIVER=bu"
+ * 67 "g"
+ * 0032 00 00 00 padding to next message header
+ *
+ * The 'struct printk_log' buffer header must never be directly exported to
+ * userspace, it is a kernel-private implementation detail that might
+ * need to be changed in the future, when the requirements change.
+ *
+ * /dev/kmsg exports the structured data in the following line format:
+ * "level,sequnum,timestamp;<message text>\n"
+ *
+ * The optional key/value pairs are attached as continuation lines starting
+ * with a space character and terminated by a newline. All possible
+ * non-prinatable characters are escaped in the "\xff" notation.
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
+ */
+
+enum log_flags {
+ LOG_NOCONS = 1, /* already flushed, do not print to console */
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_PREFIX = 4, /* text started with a prefix */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
+
+struct printk_log {
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 len; /* length of entire record */
+ u16 text_len; /* length of text buffer */
+ u16 dict_len; /* length of dictionary buffer */
+ u8 facility; /* syslog facility */
+ u8 flags:5; /* internal record flags */
+ u8 level:3; /* syslog level */
+};
+
+/*
+ * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
+ * within the scheduler's rq lock. It must be released before calling
+ * console_unlock() or anything else that might wake up a process.
+ */
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
+
+#ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* the next printk record to read by syslog(READ) or /proc/kmsg */
+static u64 syslog_seq;
+static u32 syslog_idx;
+static enum log_flags syslog_prev;
+static size_t syslog_partial;
+
+/* index and sequence number of the first record stored in the buffer */
+static u64 log_first_seq;
+static u32 log_first_idx;
+
+/* index and sequence number of the next record to store in the buffer */
+static u64 log_next_seq;
+static u32 log_next_idx;
+
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags console_prev;
+
+/* the next printk record to read after the last 'clear' command */
+static u64 clear_seq;
+static u32 clear_idx;
+
+#define PREFIX_MAX 32
+#define LOG_LINE_MAX 1024 - PREFIX_MAX
+
+/* record buffer */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define LOG_ALIGN 4
+#else
+#define LOG_ALIGN __alignof__(struct printk_log)
+#endif
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
+static char *log_buf = __log_buf;
+static u32 log_buf_len = __LOG_BUF_LEN;
+
+/* human readable text of the record */
+static char *log_text(const struct printk_log *msg)
+{
+ return (char *)msg + sizeof(struct printk_log);
+}
+
+/* optional key/value pair dictionary attached to the record */
+static char *log_dict(const struct printk_log *msg)
+{
+ return (char *)msg + sizeof(struct printk_log) + msg->text_len;
+}
+
+/* get record by index; idx must point to valid msg */
+static struct printk_log *log_from_idx(u32 idx)
+{
+ struct printk_log *msg = (struct printk_log *)(log_buf + idx);
+
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer.
+ */
+ if (!msg->len)
+ return (struct printk_log *)log_buf;
+ return msg;
+}
+
+/* get next record; idx must point to valid msg */
+static u32 log_next(u32 idx)
+{
+ struct printk_log *msg = (struct printk_log *)(log_buf + idx);
+
+ /* length == 0 indicates the end of the buffer; wrap */
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer as *this* one, and
+ * return the one after that.
+ */
+ if (!msg->len) {
+ msg = (struct printk_log *)log_buf;
+ return msg->len;
+ }
+ return idx + msg->len;
+}
+
+/*
+ * Check whether there is enough free space for the given message.
+ *
+ * The same values of first_idx and next_idx mean that the buffer
+ * is either empty or full.
+ *
+ * If the buffer is empty, we must respect the position of the indexes.
+ * They cannot be reset to the beginning of the buffer.
+ */
+static int logbuf_has_space(u32 msg_size, bool empty)
+{
+ u32 free;
+
+ if (log_next_idx > log_first_idx || empty)
+ free = max(log_buf_len - log_next_idx, log_first_idx);
+ else
+ free = log_first_idx - log_next_idx;
+
+ /*
+ * We need space also for an empty header that signalizes wrapping
+ * of the buffer.
+ */
+ return free >= msg_size + sizeof(struct printk_log);
+}
+
+static int log_make_free_space(u32 msg_size)
+{
+ while (log_first_seq < log_next_seq) {
+ if (logbuf_has_space(msg_size, false))
+ return 0;
+ /* drop old messages until we have enough continuous space */
+ log_first_idx = log_next(log_first_idx);
+ log_first_seq++;
+ }
+
+ /* sequence numbers are equal, so the log buffer is empty */
+ if (logbuf_has_space(msg_size, true))
+ return 0;
+
+ return -ENOMEM;
+}
+
+/* compute the message size including the padding bytes */
+static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+{
+ u32 size;
+
+ size = sizeof(struct printk_log) + text_len + dict_len;
+ *pad_len = (-size) & (LOG_ALIGN - 1);
+ size += *pad_len;
+
+ return size;
+}
+
+/*
+ * Define how much of the log buffer we could take at maximum. The value
+ * must be greater than two. Note that only half of the buffer is available
+ * when the index points to the middle.
+ */
+#define MAX_LOG_TAKE_PART 4
+static const char trunc_msg[] = "<truncated>";
+
+static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
+ u16 *dict_len, u32 *pad_len)
+{
+ /*
+ * The message should not take the whole buffer. Otherwise, it might
+ * get removed too soon.
+ */
+ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+ if (*text_len > max_text_len)
+ *text_len = max_text_len;
+ /* enable the warning message */
+ *trunc_msg_len = strlen(trunc_msg);
+ /* disable the "dict" completely */
+ *dict_len = 0;
+ /* compute the size again, count also the warning message */
+ return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
+}
+
+/* insert record into the buffer, discard old ones, update heads */
+static int log_store(int facility, int level,
+ enum log_flags flags, u64 ts_nsec,
+ const char *dict, u16 dict_len,
+ const char *text, u16 text_len)
+{
+ struct printk_log *msg;
+ u32 size, pad_len;
+ u16 trunc_msg_len = 0;
+
+ /* number of '\0' padding bytes to next message */
+ size = msg_used_size(text_len, dict_len, &pad_len);
+
+ if (log_make_free_space(size)) {
+ /* truncate the message if it is too long for empty buffer */
+ size = truncate_msg(&text_len, &trunc_msg_len,
+ &dict_len, &pad_len);
+ /* survive when the log buffer is too small for trunc_msg */
+ if (log_make_free_space(size))
+ return 0;
+ }
+
+ if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
+ /*
+ * This message + an additional empty header does not fit
+ * at the end of the buffer. Add an empty header with len == 0
+ * to signify a wrap around.
+ */
+ memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
+ log_next_idx = 0;
+ }
+
+ /* fill message */
+ msg = (struct printk_log *)(log_buf + log_next_idx);
+ memcpy(log_text(msg), text, text_len);
+ msg->text_len = text_len;
+ if (trunc_msg_len) {
+ memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
+ msg->text_len += trunc_msg_len;
+ }
+ memcpy(log_dict(msg), dict, dict_len);
+ msg->dict_len = dict_len;
+ msg->facility = facility;
+ msg->level = level & 7;
+ msg->flags = flags & 0x1f;
+ if (ts_nsec > 0)
+ msg->ts_nsec = ts_nsec;
+ else
+ msg->ts_nsec = local_clock();
+ memset(log_dict(msg) + dict_len, 0, pad_len);
+ msg->len = size;
+
+ /* insert message */
+ log_next_idx += msg->len;
+ log_next_seq++;
+
+ return msg->text_len;
+}
+
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+static int syslog_action_restricted(int type)
+{
+ if (dmesg_restrict)
+ return 1;
+ /*
+ * Unless restricted, we allow "read all" and "get buffer size"
+ * for everybody.
+ */
+ return type != SYSLOG_ACTION_READ_ALL &&
+ type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+ /*
+ * If this is from /proc/kmsg and we've already opened it, then we've
+ * already done the capabilities checks at open time.
+ */
+ if (from_file && type != SYSLOG_ACTION_OPEN)
+ return 0;
+
+ if (syslog_action_restricted(type)) {
+ if (capable(CAP_SYSLOG))
+ return 0;
+ /*
+ * For historical reasons, accept CAP_SYS_ADMIN too, with
+ * a warning.
+ */
+ if (capable(CAP_SYS_ADMIN)) {
+ pr_warn_once("%s (%d): Attempt to access syslog with "
+ "CAP_SYS_ADMIN but no CAP_SYSLOG "
+ "(deprecated).\n",
+ current->comm, task_pid_nr(current));
+ return 0;
+ }
+ return -EPERM;
+ }
+ return security_syslog(type);
+}
+
+
+/* /dev/kmsg - userspace message inject/listen interface */
+struct devkmsg_user {
+ u64 seq;
+ u32 idx;
+ enum log_flags prev;
+ struct mutex lock;
+ char buf[8192];
+};
+
+static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ char *buf, *line;
+ int i;
+ int level = default_message_loglevel;
+ int facility = 1; /* LOG_USER */
+ size_t len = iov_length(iv, count);
+ ssize_t ret = len;
+
+ if (len > LOG_LINE_MAX)
+ return -EINVAL;
+ buf = kmalloc(len+1, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ line = buf;
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ line += iv[i].iov_len;
+ }
+
+ /*
+ * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
+ * the decimal value represents 32bit, the lower 3 bit are the log
+ * level, the rest are the log facility.
+ *
+ * If no prefix or no userspace facility is specified, we
+ * enforce LOG_USER, to be able to reliably distinguish
+ * kernel-generated messages from userspace-injected ones.
+ */
+ line = buf;
+ if (line[0] == '<') {
+ char *endp = NULL;
+
+ i = simple_strtoul(line+1, &endp, 10);
+ if (endp && endp[0] == '>') {
+ level = i & 7;
+ if (i >> 3)
+ facility = i >> 3;
+ endp++;
+ len -= endp - line;
+ line = endp;
+ }
+ }
+ line[len] = '\0';
+
+ printk_emit(facility, level, NULL, 0, "%s", line);
+out:
+ kfree(buf);
+ return ret;
+}
+
+static ssize_t devkmsg_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct devkmsg_user *user = file->private_data;
+ struct printk_log *msg;
+ u64 ts_usec;
+ size_t i;
+ char cont = '-';
+ size_t len;
+ ssize_t ret;
+
+ if (!user)
+ return -EBADF;
+
+ ret = mutex_lock_interruptible(&user->lock);
+ if (ret)
+ return ret;
+ raw_spin_lock_irq(&logbuf_lock);
+ while (user->seq == log_next_seq) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ raw_spin_unlock_irq(&logbuf_lock);
+ goto out;
+ }
+
+ raw_spin_unlock_irq(&logbuf_lock);
+ ret = wait_event_interruptible(log_wait,
+ user->seq != log_next_seq);
+ if (ret)
+ goto out;
+ raw_spin_lock_irq(&logbuf_lock);
+ }
+
+ if (user->seq < log_first_seq) {
+ /* our last seen message is gone, return error and reset */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ ret = -EPIPE;
+ raw_spin_unlock_irq(&logbuf_lock);
+ goto out;
+ }
+
+ msg = log_from_idx(user->idx);
+ ts_usec = msg->ts_nsec;
+ do_div(ts_usec, 1000);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always neccessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags mark the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ len = sprintf(user->buf, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level,
+ user->seq, ts_usec, cont);
+ user->prev = msg->flags;
+
+ /* escape non-printable characters */
+ for (i = 0; i < msg->text_len; i++) {
+ unsigned char c = log_text(msg)[i];
+
+ if (c < ' ' || c >= 127 || c == '\\')
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ else
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+
+ if (msg->dict_len) {
+ bool line = true;
+
+ for (i = 0; i < msg->dict_len; i++) {
+ unsigned char c = log_dict(msg)[i];
+
+ if (line) {
+ user->buf[len++] = ' ';
+ line = false;
+ }
+
+ if (c == '\0') {
+ user->buf[len++] = '\n';
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 127 || c == '\\') {
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ continue;
+ }
+
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+ }
+
+ user->idx = log_next(user->idx);
+ user->seq++;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ if (len > count) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(buf, user->buf, len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = len;
+out:
+ mutex_unlock(&user->lock);
+ return ret;
+}
+
+static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct devkmsg_user *user = file->private_data;
+ loff_t ret = 0;
+
+ if (!user)
+ return -EBADF;
+ if (offset)
+ return -ESPIPE;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ switch (whence) {
+ case SEEK_SET:
+ /* the first record */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ break;
+ case SEEK_DATA:
+ /*
+ * The first record after the last SYSLOG_ACTION_CLEAR,
+ * like issued by 'dmesg -c'. Reading /dev/kmsg itself
+ * changes no global state, and does not clear anything.
+ */
+ user->idx = clear_idx;
+ user->seq = clear_seq;
+ break;
+ case SEEK_END:
+ /* after the last record */
+ user->idx = log_next_idx;
+ user->seq = log_next_seq;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+ return ret;
+}
+
+static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+{
+ struct devkmsg_user *user = file->private_data;
+ int ret = 0;
+
+ if (!user)
+ return POLLERR|POLLNVAL;
+
+ poll_wait(file, &log_wait, wait);
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (user->seq < log_next_seq) {
+ /* return error when data has vanished underneath us */
+ if (user->seq < log_first_seq)
+ ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ else
+ ret = POLLIN|POLLRDNORM;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ return ret;
+}
+
+static int devkmsg_open(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user;
+ int err;
+
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ return 0;
+
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+
+ user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ if (!user)
+ return -ENOMEM;
+
+ mutex_init(&user->lock);
+
+ raw_spin_lock_irq(&logbuf_lock);
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ file->private_data = user;
+ return 0;
+}
+
+static int devkmsg_release(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user = file->private_data;
+
+ if (!user)
+ return 0;
+
+ mutex_destroy(&user->lock);
+ kfree(user);
+ return 0;
+}
+
+const struct file_operations kmsg_fops = {
+ .open = devkmsg_open,
+ .read = devkmsg_read,
+ .aio_write = devkmsg_writev,
+ .llseek = devkmsg_llseek,
+ .poll = devkmsg_poll,
+ .release = devkmsg_release,
+};
+
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcore
+ *
+ * /proc/vmcore is used by various utilities, like crash and makedumpfile to
+ * obtain access to symbols that are otherwise very difficult to locate. These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+ VMCOREINFO_SYMBOL(log_buf);
+ VMCOREINFO_SYMBOL(log_buf_len);
+ VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(log_next_idx);
+ /*
+ * Export struct printk_log size and field offsets. User space tools can
+ * parse it and detect any changes to structure down the line.
+ */
+ VMCOREINFO_STRUCT_SIZE(printk_log);
+ VMCOREINFO_OFFSET(printk_log, ts_nsec);
+ VMCOREINFO_OFFSET(printk_log, len);
+ VMCOREINFO_OFFSET(printk_log, text_len);
+ VMCOREINFO_OFFSET(printk_log, dict_len);
+}
+#endif
+
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/* save requested log_buf_len since it's too early to process it */
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ if (size)
+ size = roundup_pow_of_two(size);
+ if (size > log_buf_len)
+ new_log_buf_len = size;
+
+ return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
+
+void __init setup_log_buf(int early)
+{
+ unsigned long flags;
+ char *new_log_buf;
+ int free;
+
+ if (!new_log_buf_len)
+ return;
+
+ if (early) {
+ new_log_buf =
+ memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
+ } else {
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
+ }
+
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %ld bytes not available\n",
+ new_log_buf_len);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = new_log_buf_len;
+ log_buf = new_log_buf;
+ new_log_buf_len = 0;
+ free = __LOG_BUF_LEN - log_next_idx;
+ memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("early log buf free: %d(%d%%)\n",
+ free, (free * 100) / __LOG_BUF_LEN);
+}
+
+static bool __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+ ignore_loglevel = 1;
+ pr_info("debug: ignoring loglevel setting.\n");
+
+ return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
+ "print all kernel messages to the console.");
+
+#ifdef CONFIG_BOOT_PRINTK_DELAY
+
+static int boot_delay; /* msecs delay after each printk during bootup */
+static unsigned long long loops_per_msec; /* based on boot_delay */
+
+static int __init boot_delay_setup(char *str)
+{
+ unsigned long lpj;
+
+ lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
+ loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
+
+ get_option(&str, &boot_delay);
+ if (boot_delay > 10 * 1000)
+ boot_delay = 0;
+
+ pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
+ "HZ: %d, loops_per_msec: %llu\n",
+ boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
+ return 0;
+}
+early_param("boot_delay", boot_delay_setup);
+
+static void boot_delay_msec(int level)
+{
+ unsigned long long k;
+ unsigned long timeout;
+
+ if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ || (level >= console_loglevel && !ignore_loglevel)) {
+ return;
+ }
+
+ k = (unsigned long long)loops_per_msec * boot_delay;
+
+ timeout = jiffies + msecs_to_jiffies(boot_delay);
+ while (k) {
+ k--;
+ cpu_relax();
+ /*
+ * use (volatile) jiffies to prevent
+ * compiler reduction; loop termination via jiffies
+ * is secondary and may or may not happen.
+ */
+ if (time_after(jiffies, timeout))
+ break;
+ touch_nmi_watchdog();
+ }
+}
+#else
+static inline void boot_delay_msec(int level)
+{
+}
+#endif
+
+#if defined(CONFIG_PRINTK_TIME)
+static bool printk_time = 1;
+#else
+static bool printk_time;
+#endif
+module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+
+static size_t print_time(u64 ts, char *buf)
+{
+ unsigned long rem_nsec;
+
+ if (!printk_time)
+ return 0;
+
+ rem_nsec = do_div(ts, 1000000000);
+
+ if (!buf)
+ return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
+
+ return sprintf(buf, "[%5lu.%06lu] ",
+ (unsigned long)ts, rem_nsec / 1000);
+}
+
+static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
+{
+ size_t len = 0;
+ unsigned int prefix = (msg->facility << 3) | msg->level;
+
+ if (syslog) {
+ if (buf) {
+ len += sprintf(buf, "<%u>", prefix);
+ } else {
+ len += 3;
+ if (prefix > 999)
+ len += 3;
+ else if (prefix > 99)
+ len += 2;
+ else if (prefix > 9)
+ len++;
+ }
+ }
+
+ len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+ return len;
+}
+
+static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size)
+{
+ const char *text = log_text(msg);
+ size_t text_size = msg->text_len;
+ bool prefix = true;
+ bool newline = true;
+ size_t len = 0;
+
+ if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
+ prefix = false;
+
+ if (msg->flags & LOG_CONT) {
+ if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
+ prefix = false;
+
+ if (!(msg->flags & LOG_NEWLINE))
+ newline = false;
+ }
+
+ do {
+ const char *next = memchr(text, '\n', text_size);
+ size_t text_len;
+
+ if (next) {
+ text_len = next - text;
+ next++;
+ text_size -= next - text;
+ } else {
+ text_len = text_size;
+ }
+
+ if (buf) {
+ if (print_prefix(msg, syslog, NULL) +
+ text_len + 1 >= size - len)
+ break;
+
+ if (prefix)
+ len += print_prefix(msg, syslog, buf + len);
+ memcpy(buf + len, text, text_len);
+ len += text_len;
+ if (next || newline)
+ buf[len++] = '\n';
+ } else {
+ /* SYSLOG_ACTION_* buffer size only calculation */
+ if (prefix)
+ len += print_prefix(msg, syslog, NULL);
+ len += text_len;
+ if (next || newline)
+ len++;
+ }
+
+ prefix = true;
+ text = next;
+ } while (text);
+
+ return len;
+}
+
+static int syslog_print(char __user *buf, int size)
+{
+ char *text;
+ struct printk_log *msg;
+ int len = 0;
+
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ while (size > 0) {
+ size_t n;
+ size_t skip;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ syslog_prev = 0;
+ syslog_partial = 0;
+ }
+ if (syslog_seq == log_next_seq) {
+ raw_spin_unlock_irq(&logbuf_lock);
+ break;
+ }
+
+ skip = syslog_partial;
+ msg = log_from_idx(syslog_idx);
+ n = msg_print_text(msg, syslog_prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
+ if (n - syslog_partial <= size) {
+ /* message fits into buffer, move forward */
+ syslog_idx = log_next(syslog_idx);
+ syslog_seq++;
+ syslog_prev = msg->flags;
+ n -= syslog_partial;
+ syslog_partial = 0;
+ } else if (!len){
+ /* partial read(), remember position */
+ n = size;
+ syslog_partial += n;
+ } else
+ n = 0;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ if (!n)
+ break;
+
+ if (copy_to_user(buf, text + skip, n)) {
+ if (!len)
+ len = -EFAULT;
+ break;
+ }
+
+ len += n;
+ size -= n;
+ buf += n;
+ }
+
+ kfree(text);
+ return len;
+}
+
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+ char *text;
+ int len = 0;
+
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (buf) {
+ u64 next_seq;
+ u64 seq;
+ u32 idx;
+ enum log_flags prev;
+
+ if (clear_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+ seq = clear_seq;
+ idx = clear_idx;
+ prev = 0;
+ while (seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ len += msg_print_text(msg, prev, true, NULL, 0);
+ prev = msg->flags;
+ idx = log_next(idx);
+ seq++;
+ }
+
+ /* move first record forward until length fits into the buffer */
+ seq = clear_seq;
+ idx = clear_idx;
+ prev = 0;
+ while (len > size && seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ len -= msg_print_text(msg, prev, true, NULL, 0);
+ prev = msg->flags;
+ idx = log_next(idx);
+ seq++;
+ }
+
+ /* last message fitting into this dump */
+ next_seq = log_next_seq;
+
+ len = 0;
+ while (len >= 0 && seq < next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+ int textlen;
+
+ textlen = msg_print_text(msg, prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+
+ raw_spin_unlock_irq(&logbuf_lock);
+ if (copy_to_user(buf + len, text, textlen))
+ len = -EFAULT;
+ else
+ len += textlen;
+ raw_spin_lock_irq(&logbuf_lock);
+
+ if (seq < log_first_seq) {
+ /* messages are gone, move to next one */
+ seq = log_first_seq;
+ idx = log_first_idx;
+ prev = 0;
+ }
+ }
+ }
+
+ if (clear) {
+ clear_seq = log_next_seq;
+ clear_idx = log_next_idx;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ kfree(text);
+ return len;
+}
+
+int do_syslog(int type, char __user *buf, int len, bool from_file)
+{
+ bool clear = false;
+ static int saved_console_loglevel = -1;
+ int error;
+
+ error = check_syslog_permissions(type, from_file);
+ if (error)
+ goto out;
+
+ error = security_syslog(type);
+ if (error)
+ return error;
+
+ switch (type) {
+ case SYSLOG_ACTION_CLOSE: /* Close log */
+ break;
+ case SYSLOG_ACTION_OPEN: /* Open log */
+ break;
+ case SYSLOG_ACTION_READ: /* Read from log */
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = wait_event_interruptible(log_wait,
+ syslog_seq != log_next_seq);
+ if (error)
+ goto out;
+ error = syslog_print(buf, len);
+ break;
+ /* Read/clear last kernel messages */
+ case SYSLOG_ACTION_READ_CLEAR:
+ clear = true;
+ /* FALL THRU */
+ /* Read last kernel messages */
+ case SYSLOG_ACTION_READ_ALL:
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = syslog_print_all(buf, len, clear);
+ break;
+ /* Clear ring buffer */
+ case SYSLOG_ACTION_CLEAR:
+ syslog_print_all(NULL, 0, true);
+ break;
+ /* Disable logging to console */
+ case SYSLOG_ACTION_CONSOLE_OFF:
+ if (saved_console_loglevel == -1)
+ saved_console_loglevel = console_loglevel;
+ console_loglevel = minimum_console_loglevel;
+ break;
+ /* Enable logging to console */
+ case SYSLOG_ACTION_CONSOLE_ON:
+ if (saved_console_loglevel != -1) {
+ console_loglevel = saved_console_loglevel;
+ saved_console_loglevel = -1;
+ }
+ break;
+ /* Set level of messages printed to console */
+ case SYSLOG_ACTION_CONSOLE_LEVEL:
+ error = -EINVAL;
+ if (len < 1 || len > 8)
+ goto out;
+ if (len < minimum_console_loglevel)
+ len = minimum_console_loglevel;
+ console_loglevel = len;
+ /* Implicitly re-enable logging to console */
+ saved_console_loglevel = -1;
+ error = 0;
+ break;
+ /* Number of chars in the log buffer */
+ case SYSLOG_ACTION_SIZE_UNREAD:
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ syslog_prev = 0;
+ syslog_partial = 0;
+ }
+ if (from_file) {
+ /*
+ * Short-cut for poll(/"proc/kmsg") which simply checks
+ * for pending data, not the size; return the count of
+ * records, not the length.
+ */
+ error = log_next_idx - syslog_idx;
+ } else {
+ u64 seq = syslog_seq;
+ u32 idx = syslog_idx;
+ enum log_flags prev = syslog_prev;
+
+ error = 0;
+ while (seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ error += msg_print_text(msg, prev, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+ error -= syslog_partial;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+ break;
+ /* Size of the log buffer */
+ case SYSLOG_ACTION_SIZE_BUFFER:
+ error = log_buf_len;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
+{
+ return do_syslog(type, buf, len, SYSLOG_FROM_READER);
+}
+
+/*
+ * Call the console drivers, asking them to write out
+ * log_buf[start] to log_buf[end - 1].
+ * The console_lock must be held.
+ */
+static void call_console_drivers(int level, const char *text, size_t len)
+{
+ struct console *con;
+
+ trace_console(text, len);
+
+ if (level >= console_loglevel && !ignore_loglevel)
+ return;
+ if (!console_drivers)
+ return;
+
+ for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (!con->write)
+ continue;
+ if (!cpu_online(smp_processor_id()) &&
+ !(con->flags & CON_ANYTIME))
+ continue;
+ con->write(con, text, len);
+ }
+}
+
+/*
+ * Zap console related locks when oopsing. Only zap at most once
+ * every 10 seconds, to leave time for slow consoles to print a
+ * full oops.
+ */
+static void zap_locks(void)
+{
+ static unsigned long oops_timestamp;
+
+ if (time_after_eq(jiffies, oops_timestamp) &&
+ !time_after(jiffies, oops_timestamp + 30 * HZ))
+ return;
+
+ oops_timestamp = jiffies;
+
+ debug_locks_off();
+ /* If a crash is occurring, make sure we can't deadlock */
+ raw_spin_lock_init(&logbuf_lock);
+ /* And make sure that we print immediately */
+ sema_init(&console_sem, 1);
+}
+
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for_each_console(con)
+ if (con->flags & CON_ANYTIME)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
+ */
+static inline int can_use_console(unsigned int cpu)
+{
+ return cpu_online(cpu) || have_callable_console();
+}
+
+/*
+ * Try to get console ownership to actually show the kernel
+ * messages from a 'printk'. Return true (and with the
+ * console_lock held, and 'console_locked' set) if it
+ * is successful, false otherwise.
+ */
+static int console_trylock_for_printk(unsigned int cpu)
+{
+ if (!console_trylock())
+ return 0;
+ /*
+ * If we can't use the console, we need to release the console
+ * semaphore by hand to avoid flushing the buffer. We need to hold the
+ * console semaphore in order to do this test safely.
+ */
+ if (!can_use_console(cpu)) {
+ console_locked = 0;
+ up_console_sem();
+ return 0;
+ }
+ return 1;
+}
+
+int printk_delay_msec __read_mostly;
+
+static inline void printk_delay(void)
+{
+ if (unlikely(printk_delay_msec)) {
+ int m = printk_delay_msec;
+
+ while (m--) {
+ mdelay(1);
+ touch_nmi_watchdog();
+ }
+ }
+}
+
+/*
+ * Continuation lines are buffered, and not committed to the record buffer
+ * until the line is complete, or a race forces it. The line fragments
+ * though, are printed immediately to the consoles to ensure everything has
+ * reached the console in case of a kernel crash.
+ */
+static struct cont {
+ char buf[LOG_LINE_MAX];
+ size_t len; /* length == 0 means unused buffer */
+ size_t cons; /* bytes written to console */
+ struct task_struct *owner; /* task of first print*/
+ u64 ts_nsec; /* time of first print */
+ u8 level; /* log level of first message */
+ u8 facility; /* log level of first message */
+ enum log_flags flags; /* prefix, newline flags */
+ bool flushed:1; /* buffer sealed and committed */
+} cont;
+
+static void cont_flush(enum log_flags flags)
+{
+ if (cont.flushed)
+ return;
+ if (cont.len == 0)
+ return;
+
+ if (cont.cons) {
+ /*
+ * If a fragment of this line was directly flushed to the
+ * console; wait for the console to pick up the rest of the
+ * line. LOG_NOCONS suppresses a duplicated output.
+ */
+ log_store(cont.facility, cont.level, flags | LOG_NOCONS,
+ cont.ts_nsec, NULL, 0, cont.buf, cont.len);
+ cont.flags = flags;
+ cont.flushed = true;
+ } else {
+ /*
+ * If no fragment of this line ever reached the console,
+ * just submit it to the store and free the buffer.
+ */
+ log_store(cont.facility, cont.level, flags, 0,
+ NULL, 0, cont.buf, cont.len);
+ cont.len = 0;
+ }
+}
+
+static bool cont_add(int facility, int level, const char *text, size_t len)
+{
+ if (cont.len && cont.flushed)
+ return false;
+
+ if (cont.len + len > sizeof(cont.buf)) {
+ /* the line gets too long, split it up in separate records */
+ cont_flush(LOG_CONT);
+ return false;
+ }
+
+ if (!cont.len) {
+ cont.facility = facility;
+ cont.level = level;
+ cont.owner = current;
+ cont.ts_nsec = local_clock();
+ cont.flags = 0;
+ cont.cons = 0;
+ cont.flushed = false;
+ }
+
+ memcpy(cont.buf + cont.len, text, len);
+ cont.len += len;
+
+ if (cont.len > (sizeof(cont.buf) * 80) / 100)
+ cont_flush(LOG_CONT);
+
+ return true;
+}
+
+static size_t cont_print_text(char *text, size_t size)
+{
+ size_t textlen = 0;
+ size_t len;
+
+ if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
+ textlen += print_time(cont.ts_nsec, text);
+ size -= textlen;
+ }
+
+ len = cont.len - cont.cons;
+ if (len > 0) {
+ if (len+1 > size)
+ len = size-1;
+ memcpy(text + textlen, cont.buf + cont.cons, len);
+ textlen += len;
+ cont.cons = cont.len;
+ }
+
+ if (cont.flushed) {
+ if (cont.flags & LOG_NEWLINE)
+ text[textlen++] = '\n';
+ /* got everything, release buffer */
+ cont.len = 0;
+ }
+ return textlen;
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
+{
+ static int recursion_bug;
+ static char textbuf[LOG_LINE_MAX];
+ char *text = textbuf;
+ size_t text_len = 0;
+ enum log_flags lflags = 0;
+ unsigned long flags;
+ int this_cpu;
+ int printed_len = 0;
+ bool in_sched = false;
+ /* cpu currently holding logbuf_lock in this function */
+ static volatile unsigned int logbuf_cpu = UINT_MAX;
+
+ if (level == SCHED_MESSAGE_LOGLEVEL) {
+ level = -1;
+ in_sched = true;
+ }
+
+ boot_delay_msec(level);
+ printk_delay();
+
+ /* This stops the holder of console_sem just where we want him */
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+
+ /*
+ * Ouch, printk recursed into itself!
+ */
+ if (unlikely(logbuf_cpu == this_cpu)) {
+ /*
+ * If a crash is occurring during printk() on this CPU,
+ * then try to get the crash message out but make sure
+ * we can't deadlock. Otherwise just return to avoid the
+ * recursion and return - but flag the recursion so that
+ * it can be printed at the next appropriate moment:
+ */
+ if (!oops_in_progress && !lockdep_recursing(current)) {
+ recursion_bug = 1;
+ goto out_restore_irqs;
+ }
+ zap_locks();
+ }
+
+ lockdep_off();
+ raw_spin_lock(&logbuf_lock);
+ logbuf_cpu = this_cpu;
+
+ if (recursion_bug) {
+ static const char recursion_msg[] =
+ "BUG: recent printk recursion!";
+
+ recursion_bug = 0;
+ text_len = strlen(recursion_msg);
+ /* emit KERN_CRIT message */
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, recursion_msg, text_len);
+ }
+
+ /*
+ * The printf needs to come first; we need the syslog
+ * prefix which might be passed-in as a parameter.
+ */
+ if (in_sched)
+ text_len = scnprintf(text, sizeof(textbuf),
+ KERN_WARNING "[sched_delayed] ");
+
+ text_len += vscnprintf(text + text_len,
+ sizeof(textbuf) - text_len, fmt, args);
+
+ /* mark and strip a trailing newline */
+ if (text_len && text[text_len-1] == '\n') {
+ text_len--;
+ lflags |= LOG_NEWLINE;
+ }
+
+ /* strip kernel syslog prefix and extract log level or control flags */
+ if (facility == 0) {
+ int kern_level = printk_get_level(text);
+
+ if (kern_level) {
+ const char *end_of_header = printk_skip_level(text);
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level == -1)
+ level = kern_level - '0';
+ case 'd': /* KERN_DEFAULT */
+ lflags |= LOG_PREFIX;
+ }
+ /*
+ * No need to check length here because vscnprintf
+ * put '\0' at the end of the string. Only valid and
+ * newly printed level is detected.
+ */
+ text_len -= end_of_header - text;
+ text = (char *)end_of_header;
+ }
+ }
+
+ if (level == -1)
+ level = default_message_loglevel;
+
+ if (dict)
+ lflags |= LOG_PREFIX|LOG_NEWLINE;
+
+ if (!(lflags & LOG_NEWLINE)) {
+ /*
+ * Flush the conflicting buffer. An earlier newline was missing,
+ * or another task also prints continuation lines.
+ */
+ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
+ cont_flush(LOG_NEWLINE);
+
+ /* buffer line if possible, otherwise store it right away */
+ if (cont_add(facility, level, text, text_len))
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level,
+ lflags | LOG_CONT, 0,
+ dict, dictlen, text, text_len);
+ } else {
+ bool stored = false;
+
+ /*
+ * If an earlier newline was missing and it was the same task,
+ * either merge it with the current buffer and flush, or if
+ * there was a race with interrupts (prefix == true) then just
+ * flush it out and store this line separately.
+ * If the preceding printk was from a different task and missed
+ * a newline, flush and append the newline.
+ */
+ if (cont.len) {
+ if (cont.owner == current && !(lflags & LOG_PREFIX))
+ stored = cont_add(facility, level, text,
+ text_len);
+ cont_flush(LOG_NEWLINE);
+ }
+
+ if (stored)
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level, lflags, 0,
+ dict, dictlen, text, text_len);
+ }
+
+ logbuf_cpu = UINT_MAX;
+ raw_spin_unlock(&logbuf_lock);
+
+ /* If called from the scheduler, we can not call up(). */
+ if (!in_sched) {
+ /*
+ * Try to acquire and then immediately release the console
+ * semaphore. The release will print out buffers and wake up
+ * /dev/kmsg and syslog() users.
+ */
+ if (console_trylock_for_printk(this_cpu))
+ console_unlock();
+ }
+
+ lockdep_on();
+out_restore_irqs:
+ local_irq_restore(flags);
+ return printed_len;
+}
+EXPORT_SYMBOL(vprintk_emit);
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, -1, NULL, 0, fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
+
+asmlinkage int printk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(printk_emit);
+
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
+ * This is printk(). It can be called from any context. We want it to work.
+ *
+ * We try to grab the console_lock. If we succeed, it's easy - we log the
+ * output and call the console drivers. If we fail to get the semaphore, we
+ * place the output into the log buffer and return. The current holder of
+ * the console_sem will notice the new output in console_unlock(); and will
+ * send it to the consoles before releasing the lock.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
+ */
+asmlinkage __visible int printk(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ va_start(args, fmt);
+ r = vkdb_printf(fmt, args);
+ va_end(args);
+ return r;
+ }
+#endif
+ va_start(args, fmt);
+ r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(printk);
+
+#else /* CONFIG_PRINTK */
+
+#define LOG_LINE_MAX 0
+#define PREFIX_MAX 0
+#define LOG_LINE_MAX 0
+static u64 syslog_seq;
+static u32 syslog_idx;
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags syslog_prev;
+static u64 log_first_seq;
+static u32 log_first_idx;
+static u64 log_next_seq;
+static enum log_flags console_prev;
+static struct cont {
+ size_t len;
+ size_t cons;
+ u8 level;
+ bool flushed:1;
+} cont;
+static struct printk_log *log_from_idx(u32 idx) { return NULL; }
+static u32 log_next(u32 idx) { return 0; }
+static void call_console_drivers(int level, const char *text, size_t len) {}
+static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size) { return 0; }
+static size_t cont_print_text(char *text, size_t size) { return 0; }
+
+#endif /* CONFIG_PRINTK */
+
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+void early_vprintk(const char *fmt, va_list ap)
+{
+ if (early_console) {
+ char buf[512];
+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
+
+ early_console->write(early_console, buf, n);
+ }
+}
+
+asmlinkage __visible void early_printk(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ early_vprintk(fmt, ap);
+ va_end(ap);
+}
+#endif
+
+static int __add_preferred_console(char *name, int idx, char *options,
+ char *brl_options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ /*
+ * See if this tty is not yet registered, and
+ * if we have a slot free.
+ */
+ for (i = 0, c = console_cmdline;
+ i < MAX_CMDLINECONSOLES && c->name[0];
+ i++, c++) {
+ if (strcmp(c->name, name) == 0 && c->index == idx) {
+ if (!brl_options)
+ selected_console = i;
+ return 0;
+ }
+ }
+ if (i == MAX_CMDLINECONSOLES)
+ return -E2BIG;
+ if (!brl_options)
+ selected_console = i;
+ strlcpy(c->name, name, sizeof(c->name));
+ c->options = options;
+ braille_set_options(c, brl_options);
+
+ c->index = idx;
+ return 0;
+}
+/*
+ * Set up a list of consoles. Called from init/main.c
+ */
+static int __init console_setup(char *str)
+{
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
+ char *s, *options, *brl_options = NULL;
+ int idx;
+
+ if (_braille_console_setup(&str, &brl_options))
+ return 1;
+
+ /*
+ * Decode str into name, index, options.
+ */
+ if (str[0] >= '0' && str[0] <= '9') {
+ strcpy(buf, "ttyS");
+ strncpy(buf + 4, str, sizeof(buf) - 5);
+ } else {
+ strncpy(buf, str, sizeof(buf) - 1);
+ }
+ buf[sizeof(buf) - 1] = 0;
+ if ((options = strchr(str, ',')) != NULL)
+ *(options++) = 0;
+#ifdef __sparc__
+ if (!strcmp(str, "ttya"))
+ strcpy(buf, "ttyS0");
+ if (!strcmp(str, "ttyb"))
+ strcpy(buf, "ttyS1");
+#endif
+ for (s = buf; *s; s++)
+ if ((*s >= '0' && *s <= '9') || *s == ',')
+ break;
+ idx = simple_strtoul(s, NULL, 10);
+ *s = 0;
+
+ __add_preferred_console(buf, idx, options, brl_options);
+ console_set_on_cmdline = 1;
+ return 1;
+}
+__setup("console=", console_setup);
+
+/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ * @name: device name
+ * @idx: device index
+ * @options: options for this console
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init. Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int add_preferred_console(char *name, int idx, char *options)
+{
+ return __add_preferred_console(name, idx, options, NULL);
+}
+
+int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ for (i = 0, c = console_cmdline;
+ i < MAX_CMDLINECONSOLES && c->name[0];
+ i++, c++)
+ if (strcmp(c->name, name) == 0 && c->index == idx) {
+ strlcpy(c->name, name_new, sizeof(c->name));
+ c->name[sizeof(c->name) - 1] = 0;
+ c->options = options;
+ c->index = idx_new;
+ return i;
+ }
+ /* not found */
+ return -1;
+}
+
+bool console_suspend_enabled = 1;
+EXPORT_SYMBOL(console_suspend_enabled);
+
+static int __init console_suspend_disable(char *str)
+{
+ console_suspend_enabled = 0;
+ return 1;
+}
+__setup("no_console_suspend", console_suspend_disable);
+module_param_named(console_suspend, console_suspend_enabled,
+ bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
+ " and hibernate operations");
+
+/**
+ * suspend_console - suspend the console subsystem
+ *
+ * This disables printk() while we go into suspend states
+ */
+void suspend_console(void)
+{
+ if (!console_suspend_enabled)
+ return;
+ printk("Suspending console(s) (use no_console_suspend to debug)\n");
+ console_lock();
+ console_suspended = 1;
+ up_console_sem();
+}
+
+void resume_console(void)
+{
+ if (!console_suspend_enabled)
+ return;
+ down_console_sem();
+ console_suspended = 0;
+ console_unlock();
+}
+
+/**
+ * console_cpu_notify - print deferred console messages after CPU hotplug
+ * @self: notifier struct
+ * @action: CPU hotplug event
+ * @hcpu: unused
+ *
+ * If printk() is called from a CPU that is not online yet, the messages
+ * will be spooled but will not show up on the console. This function is
+ * called when a new CPU comes online (or fails to come up), and ensures
+ * that any such output gets printed.
+ */
+static int console_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ case CPU_DOWN_FAILED:
+ case CPU_UP_CANCELED:
+ console_lock();
+ console_unlock();
+ }
+ return NOTIFY_OK;
+}
+
+/**
+ * console_lock - lock the console system for exclusive use.
+ *
+ * Acquires a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * Can sleep, returns nothing.
+ */
+void console_lock(void)
+{
+ might_sleep();
+
+ down_console_sem();
+ if (console_suspended)
+ return;
+ console_locked = 1;
+ console_may_schedule = 1;
+}
+EXPORT_SYMBOL(console_lock);
+
+/**
+ * console_trylock - try to lock the console system for exclusive use.
+ *
+ * Tried to acquire a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * returns 1 on success, and 0 on failure to acquire the lock.
+ */
+int console_trylock(void)
+{
+ if (down_trylock_console_sem())
+ return 0;
+ if (console_suspended) {
+ up_console_sem();
+ return 0;
+ }
+ console_locked = 1;
+ console_may_schedule = 0;
+ return 1;
+}
+EXPORT_SYMBOL(console_trylock);
+
+int is_console_locked(void)
+{
+ return console_locked;
+}
+
+static void console_cont_flush(char *text, size_t size)
+{
+ unsigned long flags;
+ size_t len;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+
+ if (!cont.len)
+ goto out;
+
+ /*
+ * We still queue earlier records, likely because the console was
+ * busy. The earlier ones need to be printed before this one, we
+ * did not flush any fragment so far, so just let it queue up.
+ */
+ if (console_seq < log_next_seq && !cont.cons)
+ goto out;
+
+ len = cont_print_text(text, size);
+ raw_spin_unlock(&logbuf_lock);
+ stop_critical_timings();
+ call_console_drivers(cont.level, text, len);
+ start_critical_timings();
+ local_irq_restore(flags);
+ return;
+out:
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
+
+/**
+ * console_unlock - unlock the console system
+ *
+ * Releases the console_lock which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ static char text[LOG_LINE_MAX + PREFIX_MAX];
+ static u64 seen_seq;
+ unsigned long flags;
+ bool wake_klogd = false;
+ bool retry;
+
+ if (console_suspended) {
+ up_console_sem();
+ return;
+ }
+
+ console_may_schedule = 0;
+
+ /* flush buffered message fragment immediately to console */
+ console_cont_flush(text, sizeof(text));
+again:
+ for (;;) {
+ struct printk_log *msg;
+ size_t len;
+ int level;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ if (seen_seq != log_next_seq) {
+ wake_klogd = true;
+ seen_seq = log_next_seq;
+ }
+
+ if (console_seq < log_first_seq) {
+ len = sprintf(text, "** %u printk messages dropped ** ",
+ (unsigned)(log_first_seq - console_seq));
+
+ /* messages are gone, move to first one */
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ console_prev = 0;
+ } else {
+ len = 0;
+ }
+skip:
+ if (console_seq == log_next_seq)
+ break;
+
+ msg = log_from_idx(console_idx);
+ if (msg->flags & LOG_NOCONS) {
+ /*
+ * Skip record we have buffered and already printed
+ * directly to the console when we received it.
+ */
+ console_idx = log_next(console_idx);
+ console_seq++;
+ /*
+ * We will get here again when we register a new
+ * CON_PRINTBUFFER console. Clear the flag so we
+ * will properly dump everything later.
+ */
+ msg->flags &= ~LOG_NOCONS;
+ console_prev = msg->flags;
+ goto skip;
+ }
+
+ level = msg->level;
+ len += msg_print_text(msg, console_prev, false,
+ text + len, sizeof(text) - len);
+ console_idx = log_next(console_idx);
+ console_seq++;
+ console_prev = msg->flags;
+ raw_spin_unlock(&logbuf_lock);
+
+ stop_critical_timings(); /* don't trace print latency */
+ call_console_drivers(level, text, len);
+ start_critical_timings();
+ local_irq_restore(flags);
+ }
+ console_locked = 0;
+
+ /* Release the exclusive_console once it is used */
+ if (unlikely(exclusive_console))
+ exclusive_console = NULL;
+
+ raw_spin_unlock(&logbuf_lock);
+
+ up_console_sem();
+
+ /*
+ * Someone could have filled up the buffer again, so re-check if there's
+ * something to flush. In case we cannot trylock the console_sem again,
+ * there's a new owner and the console_unlock() from them will do the
+ * flush, no worries.
+ */
+ raw_spin_lock(&logbuf_lock);
+ retry = console_seq != log_next_seq;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ if (retry && console_trylock())
+ goto again;
+
+ if (wake_klogd)
+ wake_up_klogd();
+}
+EXPORT_SYMBOL(console_unlock);
+
+/**
+ * console_conditional_schedule - yield the CPU if required
+ *
+ * If the console code is currently allowed to sleep, and
+ * if this CPU should yield the CPU to another task, do
+ * so here.
+ *
+ * Must be called within console_lock();.
+ */
+void __sched console_conditional_schedule(void)
+{
+ if (console_may_schedule)
+ cond_resched();
+}
+EXPORT_SYMBOL(console_conditional_schedule);
+
+void console_unblank(void)
+{
+ struct console *c;
+
+ /*
+ * console_unblank can no longer be called in interrupt context unless
+ * oops_in_progress is set to 1..
+ */
+ if (oops_in_progress) {
+ if (down_trylock_console_sem() != 0)
+ return;
+ } else
+ console_lock();
+
+ console_locked = 1;
+ console_may_schedule = 0;
+ for_each_console(c)
+ if ((c->flags & CON_ENABLED) && c->unblank)
+ c->unblank();
+ console_unlock();
+}
+
+/*
+ * Return the console tty driver structure and its associated index
+ */
+struct tty_driver *console_device(int *index)
+{
+ struct console *c;
+ struct tty_driver *driver = NULL;
+
+ console_lock();
+ for_each_console(c) {
+ if (!c->device)
+ continue;
+ driver = c->device(c, index);
+ if (driver)
+ break;
+ }
+ console_unlock();
+ return driver;
+}
+
+/*
+ * Prevent further output on the passed console device so that (for example)
+ * serial drivers can disable console output before suspending a port, and can
+ * re-enable output afterwards.
+ */
+void console_stop(struct console *console)
+{
+ console_lock();
+ console->flags &= ~CON_ENABLED;
+ console_unlock();
+}
+EXPORT_SYMBOL(console_stop);
+
+void console_start(struct console *console)
+{
+ console_lock();
+ console->flags |= CON_ENABLED;
+ console_unlock();
+}
+EXPORT_SYMBOL(console_start);
+
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+ keep_bootcon = 1;
+ pr_info("debug: skip boot console de-registration.\n");
+
+ return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
+/*
+ * The console driver calls this routine during kernel initialization
+ * to register the console printing procedure with printk() and to
+ * print any messages that were printed by the kernel before the
+ * console driver was initialized.
+ *
+ * This can happen pretty early during the boot process (because of
+ * early_printk) - sometimes before setup_arch() completes - be careful
+ * of what kernel features are used - they may not be initialised yet.
+ *
+ * There are two types of consoles - bootconsoles (early_printk) and
+ * "real" consoles (everything which is not a bootconsole) which are
+ * handled differently.
+ * - Any number of bootconsoles can be registered at any time.
+ * - As soon as a "real" console is registered, all bootconsoles
+ * will be unregistered automatically.
+ * - Once a "real" console is registered, any attempt to register a
+ * bootconsoles will be rejected
+ */
+void register_console(struct console *newcon)
+{
+ int i;
+ unsigned long flags;
+ struct console *bcon = NULL;
+ struct console_cmdline *c;
+
+ if (console_drivers)
+ for_each_console(bcon)
+ if (WARN(bcon == newcon,
+ "console '%s%d' already registered\n",
+ bcon->name, bcon->index))
+ return;
+
+ /*
+ * before we register a new CON_BOOT console, make sure we don't
+ * already have a valid console
+ */
+ if (console_drivers && newcon->flags & CON_BOOT) {
+ /* find the last or real console */
+ for_each_console(bcon) {
+ if (!(bcon->flags & CON_BOOT)) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ return;
+ }
+ }
+ }
+
+ if (console_drivers && console_drivers->flags & CON_BOOT)
+ bcon = console_drivers;
+
+ if (preferred_console < 0 || bcon || !console_drivers)
+ preferred_console = selected_console;
+
+ if (newcon->early_setup)
+ newcon->early_setup();
+
+ /*
+ * See if we want to use this console driver. If we
+ * didn't select a console we take the first one
+ * that registers here.
+ */
+ if (preferred_console < 0) {
+ if (newcon->index < 0)
+ newcon->index = 0;
+ if (newcon->setup == NULL ||
+ newcon->setup(newcon, NULL) == 0) {
+ newcon->flags |= CON_ENABLED;
+ if (newcon->device) {
+ newcon->flags |= CON_CONSDEV;
+ preferred_console = 0;
+ }
+ }
+ }
+
+ /*
+ * See if this console matches one we selected on
+ * the command line.
+ */
+ for (i = 0, c = console_cmdline;
+ i < MAX_CMDLINECONSOLES && c->name[0];
+ i++, c++) {
+ if (strcmp(c->name, newcon->name) != 0)
+ continue;
+ if (newcon->index >= 0 &&
+ newcon->index != c->index)
+ continue;
+ if (newcon->index < 0)
+ newcon->index = c->index;
+
+ if (_braille_register_console(newcon, c))
+ return;
+
+ if (newcon->setup &&
+ newcon->setup(newcon, console_cmdline[i].options) != 0)
+ break;
+ newcon->flags |= CON_ENABLED;
+ newcon->index = c->index;
+ if (i == selected_console) {
+ newcon->flags |= CON_CONSDEV;
+ preferred_console = selected_console;
+ }
+ break;
+ }
+
+ if (!(newcon->flags & CON_ENABLED))
+ return;
+
+ /*
+ * If we have a bootconsole, and are switching to a real console,
+ * don't print everything out again, since when the boot console, and
+ * the real console are the same physical device, it's annoying to
+ * see the beginning boot messages twice
+ */
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ newcon->flags &= ~CON_PRINTBUFFER;
+
+ /*
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
+ */
+ console_lock();
+ if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
+ newcon->next = console_drivers;
+ console_drivers = newcon;
+ if (newcon->next)
+ newcon->next->flags &= ~CON_CONSDEV;
+ } else {
+ newcon->next = console_drivers->next;
+ console_drivers->next = newcon;
+ }
+ if (newcon->flags & CON_PRINTBUFFER) {
+ /*
+ * console_unlock(); will print out the buffered messages
+ * for us.
+ */
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
+ console_prev = syslog_prev;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ /*
+ * We're about to replay the log buffer. Only do this to the
+ * just-registered console to avoid excessive message spam to
+ * the already-registered consoles.
+ */
+ exclusive_console = newcon;
+ }
+ console_unlock();
+ console_sysfs_notify();
+
+ /*
+ * By unregistering the bootconsoles after we enable the real console
+ * we get the "console xxx enabled" message on all the consoles -
+ * boot consoles, real consoles, etc - this is to ensure that end
+ * users know there might be something in the kernel's log buffer that
+ * went to the bootconsole (that they do not see on the real console)
+ */
+ pr_info("%sconsole [%s%d] enabled\n",
+ (newcon->flags & CON_BOOT) ? "boot" : "" ,
+ newcon->name, newcon->index);
+ if (bcon &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+ !keep_bootcon) {
+ /* We need to iterate through all boot consoles, to make
+ * sure we print everything out, before we unregister them.
+ */
+ for_each_console(bcon)
+ if (bcon->flags & CON_BOOT)
+ unregister_console(bcon);
+ }
+}
+EXPORT_SYMBOL(register_console);
+
+int unregister_console(struct console *console)
+{
+ struct console *a, *b;
+ int res;
+
+ pr_info("%sconsole [%s%d] disabled\n",
+ (console->flags & CON_BOOT) ? "boot" : "" ,
+ console->name, console->index);
+
+ res = _braille_unregister_console(console);
+ if (res)
+ return res;
+
+ res = 1;
+ console_lock();
+ if (console_drivers == console) {
+ console_drivers=console->next;
+ res = 0;
+ } else if (console_drivers) {
+ for (a=console_drivers->next, b=console_drivers ;
+ a; b=a, a=b->next) {
+ if (a == console) {
+ b->next = a->next;
+ res = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If this isn't the last console and it has CON_CONSDEV set, we
+ * need to set it on the next preferred console.
+ */
+ if (console_drivers != NULL && console->flags & CON_CONSDEV)
+ console_drivers->flags |= CON_CONSDEV;
+
+ console->flags &= ~CON_ENABLED;
+ console_unlock();
+ console_sysfs_notify();
+ return res;
+}
+EXPORT_SYMBOL(unregister_console);
+
+static int __init printk_late_init(void)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!keep_bootcon && con->flags & CON_BOOT) {
+ unregister_console(con);
+ }
+ }
+ hotcpu_notifier(console_cpu_notify, 0);
+ return 0;
+}
+late_initcall(printk_late_init);
+
+#if defined CONFIG_PRINTK
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_OUTPUT 0x02
+
+static DEFINE_PER_CPU(int, printk_pending);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
+{
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_OUTPUT) {
+ /* If trylock fails, someone else is doing the printing */
+ if (console_trylock())
+ console_unlock();
+ }
+
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
+}
+
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
+
+void wake_up_klogd(void)
+{
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
+ this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ }
+ preempt_enable();
+}
+
+int printk_deferred(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ preempt_disable();
+ va_start(args, fmt);
+ r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
+ va_end(args);
+
+ __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ preempt_enable();
+
+ return r;
+}
+
+/*
+ * printk rate limiting, lifted from the networking subsystem.
+ *
+ * This enforces a rate limit: not more than 10 kernel messages
+ * every 5s to make a denial-of-service attack impossible.
+ */
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
+
+int __printk_ratelimit(const char *func)
+{
+ return ___ratelimit(&printk_ratelimit_state, func);
+}
+EXPORT_SYMBOL(__printk_ratelimit);
+
+/**
+ * printk_timed_ratelimit - caller-controlled printk ratelimiting
+ * @caller_jiffies: pointer to caller's state
+ * @interval_msecs: minimum interval between prints
+ *
+ * printk_timed_ratelimit() returns true if more than @interval_msecs
+ * milliseconds have elapsed since the last time printk_timed_ratelimit()
+ * returned true.
+ */
+bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+ unsigned int interval_msecs)
+{
+ if (*caller_jiffies == 0
+ || !time_in_range(jiffies, *caller_jiffies,
+ *caller_jiffies
+ + msecs_to_jiffies(interval_msecs))) {
+ *caller_jiffies = jiffies;
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(printk_timed_ratelimit);
+
+static DEFINE_SPINLOCK(dump_list_lock);
+static LIST_HEAD(dump_list);
+
+/**
+ * kmsg_dump_register - register a kernel log dumper.
+ * @dumper: pointer to the kmsg_dumper structure
+ *
+ * Adds a kernel log dumper to the system. The dump callback in the
+ * structure will be called when the kernel oopses or panics and must be
+ * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
+ */
+int kmsg_dump_register(struct kmsg_dumper *dumper)
+{
+ unsigned long flags;
+ int err = -EBUSY;
+
+ /* The dump callback needs to be set */
+ if (!dumper->dump)
+ return -EINVAL;
+
+ spin_lock_irqsave(&dump_list_lock, flags);
+ /* Don't allow registering multiple times */
+ if (!dumper->registered) {
+ dumper->registered = 1;
+ list_add_tail_rcu(&dumper->list, &dump_list);
+ err = 0;
+ }
+ spin_unlock_irqrestore(&dump_list_lock, flags);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_register);
+
+/**
+ * kmsg_dump_unregister - unregister a kmsg dumper.
+ * @dumper: pointer to the kmsg_dumper structure
+ *
+ * Removes a dump device from the system. Returns zero on success and
+ * %-EINVAL otherwise.
+ */
+int kmsg_dump_unregister(struct kmsg_dumper *dumper)
+{
+ unsigned long flags;
+ int err = -EINVAL;
+
+ spin_lock_irqsave(&dump_list_lock, flags);
+ if (dumper->registered) {
+ dumper->registered = 0;
+ list_del_rcu(&dumper->list);
+ err = 0;
+ }
+ spin_unlock_irqrestore(&dump_list_lock, flags);
+ synchronize_rcu();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
+
+static bool always_kmsg_dump;
+module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
+
+/**
+ * kmsg_dump - dump kernel log to kernel message dumpers.
+ * @reason: the reason (oops, panic etc) for dumping
+ *
+ * Call each of the registered dumper's dump() callback, which can
+ * retrieve the kmsg records with kmsg_dump_get_line() or
+ * kmsg_dump_get_buffer().
+ */
+void kmsg_dump(enum kmsg_dump_reason reason)
+{
+ struct kmsg_dumper *dumper;
+ unsigned long flags;
+
+ if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(dumper, &dump_list, list) {
+ if (dumper->max_reason && reason > dumper->max_reason)
+ continue;
+
+ /* initialize iterator with data about the stored records */
+ dumper->active = true;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ dumper->cur_seq = clear_seq;
+ dumper->cur_idx = clear_idx;
+ dumper->next_seq = log_next_seq;
+ dumper->next_idx = log_next_idx;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ /* invoke dumper which will iterate over records */
+ dumper->dump(dumper, reason);
+
+ /* reset iterator */
+ dumper->active = false;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ *
+ * The function is similar to kmsg_dump_get_line(), but grabs no locks.
+ */
+bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+ char *line, size_t size, size_t *len)
+{
+ struct printk_log *msg;
+ size_t l = 0;
+ bool ret = false;
+
+ if (!dumper->active)
+ goto out;
+
+ if (dumper->cur_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ dumper->cur_seq = log_first_seq;
+ dumper->cur_idx = log_first_idx;
+ }
+
+ /* last entry */
+ if (dumper->cur_seq >= log_next_seq)
+ goto out;
+
+ msg = log_from_idx(dumper->cur_idx);
+ l = msg_print_text(msg, 0, syslog, line, size);
+
+ dumper->cur_idx = log_next(dumper->cur_idx);
+ dumper->cur_seq++;
+ ret = true;
+out:
+ if (len)
+ *len = l;
+ return ret;
+}
+
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+ char *line, size_t size, size_t *len)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
+
+/**
+ * kmsg_dump_get_buffer - copy kmsg log lines
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @buf: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the end of the kmsg buffer and fill the provided buffer
+ * with as many of the the *youngest* kmsg records that fit into it.
+ * If the buffer is large enough, all available kmsg records will be
+ * copied with a single call.
+ *
+ * Consecutive calls will fill the buffer with the next block of
+ * available older records, not including the earlier retrieved ones.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+ char *buf, size_t size, size_t *len)
+{
+ unsigned long flags;
+ u64 seq;
+ u32 idx;
+ u64 next_seq;
+ u32 next_idx;
+ enum log_flags prev;
+ size_t l = 0;
+ bool ret = false;
+
+ if (!dumper->active)
+ goto out;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ if (dumper->cur_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ dumper->cur_seq = log_first_seq;
+ dumper->cur_idx = log_first_idx;
+ }
+
+ /* last entry */
+ if (dumper->cur_seq >= dumper->next_seq) {
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ goto out;
+ }
+
+ /* calculate length of entire buffer */
+ seq = dumper->cur_seq;
+ idx = dumper->cur_idx;
+ prev = 0;
+ while (seq < dumper->next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ l += msg_print_text(msg, prev, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+
+ /* move first record forward until length fits into the buffer */
+ seq = dumper->cur_seq;
+ idx = dumper->cur_idx;
+ prev = 0;
+ while (l > size && seq < dumper->next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ l -= msg_print_text(msg, prev, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+
+ /* last message in next interation */
+ next_seq = seq;
+ next_idx = idx;
+
+ l = 0;
+ while (seq < dumper->next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ l += msg_print_text(msg, prev, syslog, buf + l, size - l);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+
+ dumper->next_seq = next_seq;
+ dumper->next_idx = next_idx;
+ ret = true;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+ if (len)
+ *len = l;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
+
+/**
+ * kmsg_dump_rewind_nolock - reset the interator (unlocked version)
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ *
+ * The function is similar to kmsg_dump_rewind(), but grabs no locks.
+ */
+void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+{
+ dumper->cur_seq = clear_seq;
+ dumper->cur_idx = clear_idx;
+ dumper->next_seq = log_next_seq;
+ dumper->next_idx = log_next_idx;
+}
+
+/**
+ * kmsg_dump_rewind - reset the interator
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ kmsg_dump_rewind_nolock(dumper);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+
+static char dump_stack_arch_desc_str[128];
+
+/**
+ * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * The configured string will be printed right after utsname during task
+ * dumps. Usually used to add arch-specific system identifiers. If an
+ * arch wants to make use of such an ID string, it should initialize this
+ * as soon as possible during boot.
+ */
+void __init dump_stack_set_arch_desc(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
+ fmt, args);
+ va_end(args);
+}
+
+/**
+ * dump_stack_print_info - print generic debug info for dump_stack()
+ * @log_lvl: log level
+ *
+ * Arch-specific dump_stack() implementations can use this function to
+ * print out the same debug information as the generic dump_stack().
+ */
+void dump_stack_print_info(const char *log_lvl)
+{
+ printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+ log_lvl, raw_smp_processor_id(), current->pid, current->comm,
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ if (dump_stack_arch_desc_str[0] != '\0')
+ printk("%sHardware name: %s\n",
+ log_lvl, dump_stack_arch_desc_str);
+
+ print_worker_info(log_lvl, current);
+}
+
+/**
+ * show_regs_print_info - print generic debug info for show_regs()
+ * @log_lvl: log level
+ *
+ * show_regs() implementations can use this function to print out generic
+ * debug information.
+ */
+void show_regs_print_info(const char *log_lvl)
+{
+ dump_stack_print_info(log_lvl);
+
+ printk("%stask: %p ti: %p task.ti: %p\n",
+ log_lvl, current, current_thread_info(),
+ task_thread_info(current));
+}
+
+#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 76b8e77773e..54bf5ba2642 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,9 +8,10 @@
* Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
* Red Hat, July 2004
* Consolidation of architecture support code for profiling,
- * William Irwin, Oracle, July 2004
+ * Nadia Yvette Chambers, Oracle, July 2004
* Amortized hit count accounting via per-cpu open-addressed hashtables
- * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
+ * to resolve timer interrupt livelocks, Nadia Yvette Chambers,
+ * Oracle, 2004
*/
#include <linux/export.h>
@@ -36,9 +37,6 @@ struct profile_hit {
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
-/* Oprofile timer tick hook */
-static int (*timer_hook)(struct pt_regs *) __read_mostly;
-
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
@@ -54,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);
int profile_setup(char *str)
{
- static char schedstr[] = "schedule";
- static char sleepstr[] = "sleep";
- static char kvmstr[] = "kvm";
+ static const char schedstr[] = "schedule";
+ static const char sleepstr[] = "sleep";
+ static const char kvmstr[] = "kvm";
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -66,12 +64,10 @@ int profile_setup(char *str)
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
- printk(KERN_INFO
- "kernel sleep profiling enabled (shift: %ld)\n",
+ pr_info("kernel sleep profiling enabled (shift: %ld)\n",
prof_shift);
#else
- printk(KERN_WARNING
- "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
+ pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
@@ -79,8 +75,7 @@ int profile_setup(char *str)
str += strlen(schedstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
- printk(KERN_INFO
- "kernel schedule profiling enabled (shift: %ld)\n",
+ pr_info("kernel schedule profiling enabled (shift: %ld)\n",
prof_shift);
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
@@ -88,13 +83,12 @@ int profile_setup(char *str)
str += strlen(kvmstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
- printk(KERN_INFO
- "kernel KVM profiling enabled (shift: %ld)\n",
+ pr_info("kernel KVM profiling enabled (shift: %ld)\n",
prof_shift);
} else if (get_option(&str, &par)) {
prof_shift = par;
prof_on = CPU_PROFILING;
- printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %ld)\n",
prof_shift);
}
return 1;
@@ -207,25 +201,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-int register_timer_hook(int (*hook)(struct pt_regs *))
-{
- if (timer_hook)
- return -EBUSY;
- timer_hook = hook;
- return 0;
-}
-EXPORT_SYMBOL_GPL(register_timer_hook);
-
-void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
- WARN_ON(hook != timer_hook);
- timer_hook = NULL;
- /* make sure all CPUs see the NULL hook */
- synchronize_sched(); /* Allow ongoing interrupts to complete. */
-}
-EXPORT_SYMBOL_GPL(unregister_timer_hook);
-
-
#ifdef CONFIG_SMP
/*
* Each cpu has a pair of open-addressed hashtables for pending
@@ -256,7 +231,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
* pagetable hash functions, but uses a full hashtable full of finite
* collision chains, not just pairs of them.
*
- * -- wli
+ * -- nyc
*/
static void __profile_flip_buffers(void *unused)
{
@@ -352,7 +327,7 @@ out:
put_cpu();
}
-static int __cpuinit profile_cpu_callback(struct notifier_block *info,
+static int profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
int node, cpu = (unsigned long)__cpu;
@@ -435,8 +410,6 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (type == CPU_PROFILING && timer_hook)
- timer_hook(regs);
if (!user_mode(regs) && prof_cpu_mask != NULL &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
@@ -485,10 +458,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = {
.write = prof_cpu_mask_proc_write,
};
-void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
+void create_prof_cpu_mask(void)
{
/* create /proc/irq/prof_cpu_mask */
- proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
+ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
}
/*
@@ -572,14 +545,14 @@ static int create_hash_tables(void)
struct page *page;
page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
@@ -614,18 +587,28 @@ out_cleanup:
int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
{
struct proc_dir_entry *entry;
+ int err = 0;
if (!prof_on)
return 0;
- if (create_hash_tables())
- return -ENOMEM;
+
+ cpu_notifier_register_begin();
+
+ if (create_hash_tables()) {
+ err = -ENOMEM;
+ goto out;
+ }
+
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &proc_profile_operations);
if (!entry)
- return 0;
- entry->size = (1+prof_len) * sizeof(atomic_t);
- hotcpu_notifier(profile_cpu_callback, 0);
- return 0;
+ goto out;
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
+ __hotcpu_notifier(profile_cpu_callback, 0);
+
+out:
+ cpu_notifier_register_done();
+ return err;
}
-module_init(create_proc_profile);
+subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed1..adf98622cb3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
+#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
@@ -24,6 +25,7 @@
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
+#include <linux/compat.h>
static int ptrace_trapping_sleep_fn(void *flags)
@@ -117,11 +119,45 @@ void __ptrace_unlink(struct task_struct *child)
* TASK_KILLABLE sleeps.
*/
if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
- signal_wake_up(child, task_is_traced(child));
+ ptrace_signal_wake_up(child, true);
spin_unlock(&child->sighand->siglock);
}
+/* Ensure that nothing can wake it up, even SIGKILL */
+static bool ptrace_freeze_traced(struct task_struct *task)
+{
+ bool ret = false;
+
+ /* Lockless, nobody but us can set this flag */
+ if (task->jobctl & JOBCTL_LISTENING)
+ return ret;
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ task->state = __TASK_TRACED;
+ ret = true;
+ }
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+
+static void ptrace_unfreeze_traced(struct task_struct *task)
+{
+ if (task->state != __TASK_TRACED)
+ return;
+
+ WARN_ON(!task->ptrace || task->parent != current);
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ spin_unlock_irq(&task->sighand->siglock);
+}
+
/**
* ptrace_check_attach - check whether ptracee is ready for ptrace operation
* @child: ptracee to check for
@@ -139,7 +175,7 @@ void __ptrace_unlink(struct task_struct *child)
* RETURNS:
* 0 on success, -ESRCH if %child is not ready.
*/
-int ptrace_check_attach(struct task_struct *child, bool ignore_state)
+static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
int ret = -ESRCH;
@@ -151,24 +187,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
- if ((child->ptrace & PT_PTRACED) && child->parent == current) {
+ if (child->ptrace && child->parent == current) {
+ WARN_ON(child->state == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
- spin_lock_irq(&child->sighand->siglock);
- WARN_ON_ONCE(task_is_stopped(child));
- if (ignore_state || (task_is_traced(child) &&
- !(child->jobctl & JOBCTL_LISTENING)))
+ if (ignore_state || ptrace_freeze_traced(child))
ret = 0;
- spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state)
- ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
+ if (!ret && !ignore_state) {
+ if (!wait_task_inactive(child, __TASK_TRACED)) {
+ /*
+ * This can only happen if may_ptrace_stop() fails and
+ * ptrace_stop() changes ->state back to TASK_RUNNING,
+ * so we should not worry about leaking __TASK_TRACED.
+ */
+ WARN_ON(child->state == __TASK_TRACED);
+ ret = -ESRCH;
+ }
+ }
- /* All systems go.. */
return ret;
}
@@ -180,7 +221,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
return has_ns_capability(current, ns, CAP_SYS_PTRACE);
}
-int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+/* Returns 0 on success, -errno on denial. */
+static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
@@ -194,19 +236,18 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
*/
int dumpable = 0;
/* Don't let security modules deny introspection */
- if (task == current)
+ if (same_thread_group(task, current))
return 0;
rcu_read_lock();
tcred = __task_cred(task);
- if (cred->user->user_ns == tcred->user->user_ns &&
- (cred->uid == tcred->euid &&
- cred->uid == tcred->suid &&
- cred->uid == tcred->uid &&
- cred->gid == tcred->egid &&
- cred->gid == tcred->sgid &&
- cred->gid == tcred->gid))
+ if (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user->user_ns, mode))
+ if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -215,8 +256,13 @@ ok:
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
+ rcu_read_lock();
+ if (dumpable != SUID_DUMP_USER &&
+ !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ rcu_read_unlock();
return -EPERM;
+ }
+ rcu_read_unlock();
return security_ptrace_access_check(task, mode);
}
@@ -231,26 +277,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
}
static int ptrace_attach(struct task_struct *task, long request,
+ unsigned long addr,
unsigned long flags)
{
bool seize = (request == PTRACE_SEIZE);
int retval;
- /*
- * SEIZE will enable new ptrace behaviors which will be implemented
- * gradually. SEIZE_DEVEL is used to prevent applications
- * expecting full SEIZE behaviors trapping on kernel commits which
- * are still in the process of implementing them.
- *
- * Only test programs for new ptrace behaviors being implemented
- * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
- *
- * Once SEIZE behaviors are completely implemented, this flag and
- * the following test will be removed.
- */
retval = -EIO;
- if (seize && !(flags & PTRACE_SEIZE_DEVEL))
- goto out;
+ if (seize) {
+ if (addr != 0)
+ goto out;
+ if (flags & ~(unsigned long)PTRACE_O_MASK)
+ goto out;
+ flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
+ } else {
+ flags = PT_PTRACED;
+ }
audit_ptrace(task);
@@ -262,7 +304,7 @@ static int ptrace_attach(struct task_struct *task, long request,
/*
* Protect exec's credential calculations against our interference;
- * interference; SUID, SGID and LSM creds get determined differently
+ * SUID, SGID and LSM creds get determined differently
* under ptrace.
*/
retval = -ERESTARTNOINTR;
@@ -282,11 +324,13 @@ static int ptrace_attach(struct task_struct *task, long request,
if (task->ptrace)
goto unlock_tasklist;
- task->ptrace = PT_PTRACED;
if (seize)
- task->ptrace |= PT_SEIZED;
- if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
- task->ptrace |= PT_PTRACE_CAP;
+ flags |= PT_SEIZED;
+ rcu_read_lock();
+ if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
+ flags |= PT_PTRACE_CAP;
+ rcu_read_unlock();
+ task->ptrace = flags;
__ptrace_link(task, current);
@@ -315,7 +359,7 @@ static int ptrace_attach(struct task_struct *task, long request,
*/
if (task_is_stopped(task) &&
task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
- signal_wake_up(task, 1);
+ signal_wake_up_state(task, __TASK_STOPPED);
spin_unlock(&task->sighand->siglock);
@@ -461,6 +505,9 @@ void exit_ptrace(struct task_struct *tracer)
return;
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+ if (unlikely(p->ptrace & PT_EXITKILL))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
+
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, &ptrace_dead);
}
@@ -528,30 +575,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
- child->ptrace &= ~PT_TRACE_MASK;
-
- if (data & PTRACE_O_TRACESYSGOOD)
- child->ptrace |= PT_TRACESYSGOOD;
-
- if (data & PTRACE_O_TRACEFORK)
- child->ptrace |= PT_TRACE_FORK;
+ unsigned flags;
- if (data & PTRACE_O_TRACEVFORK)
- child->ptrace |= PT_TRACE_VFORK;
-
- if (data & PTRACE_O_TRACECLONE)
- child->ptrace |= PT_TRACE_CLONE;
-
- if (data & PTRACE_O_TRACEEXEC)
- child->ptrace |= PT_TRACE_EXEC;
-
- if (data & PTRACE_O_TRACEVFORKDONE)
- child->ptrace |= PT_TRACE_VFORK_DONE;
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
- if (data & PTRACE_O_TRACEEXIT)
- child->ptrace |= PT_TRACE_EXIT;
+ /* Avoid intermediate state when all opts are cleared */
+ flags = child->ptrace;
+ flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
+ flags |= (data << PT_OPT_FLAG_SHIFT);
+ child->ptrace = flags;
- return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
+ return 0;
}
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -586,6 +621,83 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
return error;
}
+static int ptrace_peek_siginfo(struct task_struct *child,
+ unsigned long addr,
+ unsigned long data)
+{
+ struct ptrace_peeksiginfo_args arg;
+ struct sigpending *pending;
+ struct sigqueue *q;
+ int ret, i;
+
+ ret = copy_from_user(&arg, (void __user *) addr,
+ sizeof(struct ptrace_peeksiginfo_args));
+ if (ret)
+ return -EFAULT;
+
+ if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
+ return -EINVAL; /* unknown flags */
+
+ if (arg.nr < 0)
+ return -EINVAL;
+
+ if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
+ pending = &child->signal->shared_pending;
+ else
+ pending = &child->pending;
+
+ for (i = 0; i < arg.nr; ) {
+ siginfo_t info;
+ s32 off = arg.off + i;
+
+ spin_lock_irq(&child->sighand->siglock);
+ list_for_each_entry(q, &pending->list, list) {
+ if (!off--) {
+ copy_siginfo(&info, &q->info);
+ break;
+ }
+ }
+ spin_unlock_irq(&child->sighand->siglock);
+
+ if (off >= 0) /* beyond the end of the list */
+ break;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(is_compat_task())) {
+ compat_siginfo_t __user *uinfo = compat_ptr(data);
+
+ if (copy_siginfo_to_user32(uinfo, &info) ||
+ __put_user(info.si_code, &uinfo->si_code)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ } else
+#endif
+ {
+ siginfo_t __user *uinfo = (siginfo_t __user *) data;
+
+ if (copy_siginfo_to_user(uinfo, &info) ||
+ __put_user(info.si_code, &uinfo->si_code)) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ data += sizeof(siginfo_t);
+ i++;
+
+ if (signal_pending(current))
+ break;
+
+ cond_resched();
+ }
+
+ if (i > 0)
+ return i;
+
+ return ret;
+}
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -680,6 +792,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
kiov->iov_len, kiov->iov_base);
}
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code. We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
#endif
int ptrace_request(struct task_struct *child, long request,
@@ -710,6 +828,10 @@ int ptrace_request(struct task_struct *child, long request,
ret = put_user(child->ptrace_message, datalp);
break;
+ case PTRACE_PEEKSIGINFO:
+ ret = ptrace_peek_siginfo(child, addr, data);
+ break;
+
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, &siginfo);
if (!ret)
@@ -723,6 +845,47 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
+ case PTRACE_GETSIGMASK:
+ if (addr != sizeof(sigset_t)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ break;
+
+ case PTRACE_SETSIGMASK: {
+ sigset_t new_set;
+
+ if (addr != sizeof(sigset_t)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ /*
+ * Every thread does recalc_sigpending() after resume, so
+ * retarget_shared_pending() and recalc_sigpending() are not
+ * called here.
+ */
+ spin_lock_irq(&child->sighand->siglock);
+ child->blocked = new_set;
+ spin_unlock_irq(&child->sighand->siglock);
+
+ ret = 0;
+ break;
+ }
+
case PTRACE_INTERRUPT:
/*
* Stop tracee without any side-effect on signal or job
@@ -744,7 +907,7 @@ int ptrace_request(struct task_struct *child, long request,
* tracee into STOP.
*/
if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
- signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+ ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
unlock_task_sighand(child, &flags);
ret = 0;
@@ -770,7 +933,7 @@ int ptrace_request(struct task_struct *child, long request,
* start of this trap and now. Trigger re-trap.
*/
if (child->jobctl & JOBCTL_TRAP_NOTIFY)
- signal_wake_up(child, true);
+ ptrace_signal_wake_up(child, true);
ret = 0;
}
unlock_task_sighand(child, &flags);
@@ -827,8 +990,7 @@ int ptrace_request(struct task_struct *child, long request,
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
case PTRACE_GETREGSET:
- case PTRACE_SETREGSET:
- {
+ case PTRACE_SETREGSET: {
struct iovec kiov;
struct iovec __user *uiov = datavp;
@@ -891,7 +1053,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
}
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
- ret = ptrace_attach(child, request, data);
+ ret = ptrace_attach(child, request, addr, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -907,6 +1069,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
out_put_task_struct:
put_task_struct(child);
@@ -1016,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
return ret;
}
-asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
- compat_long_t addr, compat_long_t data)
+COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
+ compat_long_t, addr, compat_long_t, data)
{
struct task_struct *child;
long ret;
@@ -1034,7 +1198,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
}
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
- ret = ptrace_attach(child, request, data);
+ ret = ptrace_attach(child, request, addr, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -1046,8 +1210,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
ret = ptrace_check_attach(child, request == PTRACE_KILL ||
request == PTRACE_INTERRUPT);
- if (!ret)
+ if (!ret) {
ret = compat_arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
+ }
out_put_task_struct:
put_task_struct(child);
@@ -1055,19 +1222,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
return ret;
}
#endif /* CONFIG_COMPAT */
-
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-int ptrace_get_breakpoints(struct task_struct *tsk)
-{
- if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
- return 0;
-
- return -1;
-}
-
-void ptrace_put_breakpoints(struct task_struct *tsk)
-{
- if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
- flush_ptrace_hw_breakpoint(tsk);
-}
-#endif /* CONFIG_HAVE_HW_BREAKPOINT */
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed6..322ea8e93e4 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sort.h>
-
+#include <linux/string.h>
#include <linux/range.h>
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
if (start >= end)
return nr_range;
- /* Try to merge it with old one: */
+ /* get new start/end: */
for (i = 0; i < nr_range; i++) {
- u64 final_start, final_end;
u64 common_start, common_end;
if (!range[i].end)
@@ -45,12 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
if (common_start > common_end)
continue;
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
+ /* new start/end, will add it back at last */
+ start = min(range[i].start, start);
+ end = max(range[i].end, end);
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
+ memmove(&range[i], &range[i + 1],
+ (nr_range - (i + 1)) * sizeof(range[i]));
+ range[nr_range - 1].start = 0;
+ range[nr_range - 1].end = 0;
+ nr_range--;
+ i--;
}
/* Need to add it: */
@@ -97,7 +100,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
range[i].end = range[j].end;
range[i].start = end;
} else {
- printk(KERN_ERR "run of slot in ranges\n");
+ pr_err("%s: run out of slot in ranges\n",
+ __func__);
}
range[j].end = start;
continue;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 00000000000..807ccfbf69b
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
+obj-y += update.o srcu.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_TREE_RCU) += tree.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
+obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index aa88baab5f7..bfda2726ca4 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2011
*
@@ -23,6 +23,7 @@
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <trace/events/rcu.h>
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -33,8 +34,27 @@
* Process-level increment to ->dynticks_nesting field. This allows for
* architectures that use half-interrupts and half-exceptions from
* process context.
+ *
+ * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
+ * that counts the number of process-based reasons why RCU cannot
+ * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
+ * is the value used to increment or decrement this field.
+ *
+ * The rest of the bits could in principle be used to count interrupts,
+ * but this would mean that a negative-one value in the interrupt
+ * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
+ * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
+ * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
+ * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
+ * initial exit from idle.
*/
-#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
+#define DYNTICK_TASK_NEST_WIDTH 7
+#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
+#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
+#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
+#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
+#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
+ DYNTICK_TASK_FLAG)
/*
* debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
@@ -48,13 +68,15 @@
extern struct debug_obj_descr rcuhead_debug_descr;
-static inline void debug_rcu_head_queue(struct rcu_head *head)
+static inline int debug_rcu_head_queue(struct rcu_head *head)
{
- WARN_ON_ONCE((unsigned long)head & 0x3);
- debug_object_activate(head, &rcuhead_debug_descr);
+ int r1;
+
+ r1 = debug_object_activate(head, &rcuhead_debug_descr);
debug_object_active_state(head, &rcuhead_debug_descr,
STATE_RCU_HEAD_READY,
STATE_RCU_HEAD_QUEUED);
+ return r1;
}
static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -65,8 +87,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
debug_object_deactivate(head, &rcuhead_debug_descr);
}
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-static inline void debug_rcu_head_queue(struct rcu_head *head)
+static inline int debug_rcu_head_queue(struct rcu_head *head)
{
+ return 0;
}
static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -74,19 +97,38 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-extern void kfree(const void *);
+void kfree(const void *);
-static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
+static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
+ rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
+ rcu_lock_release(&rcu_callback_map);
+ return 1;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
+ rcu_lock_release(&rcu_callback_map);
+ return 0;
}
}
+#ifdef CONFIG_RCU_STALL_COMMON
+
+extern int rcu_cpu_stall_suppress;
+int rcu_jiffies_till_stall_check(void);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+/*
+ * Strings used in tracepoints need to be exported via the
+ * tracing system such that tools like perf and trace-cmd can
+ * translate the string address pointers to actual text.
+ */
+#define TPS(x) tracepoint_string(x)
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
new file mode 100644
index 00000000000..948a7693748
--- /dev/null
+++ b/kernel/rcu/rcutorture.c
@@ -0,0 +1,1707 @@
+/*
+ * Read-Copy Update module-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2005, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Josh Triplett <josh@joshtriplett.org>
+ *
+ * See also: Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+
+
+torture_param(int, fqs_duration, 0,
+ "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
+torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(bool, gp_normal, false,
+ "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
+torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
+torture_param(int, n_barrier_cbs, 0,
+ "# of callbacks/kthreads for barrier testing");
+torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, object_debug, 0,
+ "Enable debug-object double call_rcu() testing");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
+torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
+torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
+torture_param(int, stall_cpu_holdoff, 10,
+ "Time to wait before starting stall (s).");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of seconds to run/halt test");
+torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+torture_param(int, test_boost_duration, 4,
+ "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7,
+ "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true,
+ "Test support for tickless idle CPUs");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "rcu";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
+
+static int nrealreaders;
+static struct task_struct *writer_task;
+static struct task_struct **fakewriter_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
+static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+ struct rcu_head rtort_rcu;
+ int rtort_pipe_count;
+ struct list_head rtort_free;
+ int rtort_mbtest;
+};
+
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture __rcu *rcu_torture_current;
+static unsigned long rcu_torture_current_version;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_batch) = { 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static atomic_t n_rcu_torture_alloc;
+static atomic_t n_rcu_torture_alloc_fail;
+static atomic_t n_rcu_torture_free;
+static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
+static long n_rcu_torture_timers;
+static long n_barrier_attempts;
+static long n_barrier_successes;
+static struct list_head rcu_torture_removed;
+
+static int rcu_torture_writer_state;
+#define RTWS_FIXED_DELAY 0
+#define RTWS_DELAY 1
+#define RTWS_REPLACE 2
+#define RTWS_DEF_FREE 3
+#define RTWS_EXP_SYNC 4
+#define RTWS_COND_GET 5
+#define RTWS_COND_SYNC 6
+#define RTWS_SYNC 7
+#define RTWS_STUTTER 8
+#define RTWS_STOPPING 9
+
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(rcutorture_runnable, int, 0444);
+MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
+
+#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
+#define rcu_can_boost() 1
+#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#define rcu_can_boost() 0
+#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+ u64 ts = trace_clock_local();
+ unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+ return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+ return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
+static bool barrier_phase; /* Test phase. */
+static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+static struct rcu_torture *
+rcu_torture_alloc(void)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&rcu_torture_lock);
+ if (list_empty(&rcu_torture_freelist)) {
+ atomic_inc(&n_rcu_torture_alloc_fail);
+ spin_unlock_bh(&rcu_torture_lock);
+ return NULL;
+ }
+ atomic_inc(&n_rcu_torture_alloc);
+ p = rcu_torture_freelist.next;
+ list_del_init(p);
+ spin_unlock_bh(&rcu_torture_lock);
+ return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+ atomic_inc(&n_rcu_torture_free);
+ spin_lock_bh(&rcu_torture_lock);
+ list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+ spin_unlock_bh(&rcu_torture_lock);
+}
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+ int ttype;
+ void (*init)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *rrsp);
+ void (*readunlock)(int idx);
+ int (*completed)(void);
+ void (*deferred_free)(struct rcu_torture *p);
+ void (*sync)(void);
+ void (*exp_sync)(void);
+ unsigned long (*get_state)(void);
+ void (*cond_sync)(unsigned long oldstate);
+ void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+ void (*cb_barrier)(void);
+ void (*fqs)(void);
+ void (*stats)(char *page);
+ int irq_capable;
+ int can_boost;
+ const char *name;
+};
+
+static struct rcu_torture_ops *cur_ops;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_read_delay(struct torture_random_state *rrsp)
+{
+ const unsigned long shortdelay_us = 200;
+ const unsigned long longdelay_ms = 50;
+
+ /* We want a short delay sometimes to make a reader delay the grace
+ * period, and we want a long delay occasionally to trigger
+ * force_quiescent_state. */
+
+ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!preempt_count() &&
+ !(torture_random(rrsp) % (nrealreaders * 20000)))
+ preempt_schedule(); /* No QS if preempt_disable() in effect */
+#endif
+}
+
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+ return rcu_batches_completed();
+}
+
+/*
+ * Update callback in the pipe. This should be invoked after a grace period.
+ */
+static bool
+rcu_torture_pipe_update_one(struct rcu_torture *rp)
+{
+ int i;
+
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update all callbacks in the pipe. Suitable for synchronous grace-period
+ * primitives.
+ */
+static void
+rcu_torture_pipe_update(struct rcu_torture *old_rp)
+{
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ if (old_rp)
+ list_add(&old_rp->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ if (rcu_torture_pipe_update_one(rp)) {
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (torture_must_stop_irq()) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ if (rcu_torture_pipe_update_one(rp))
+ rcu_torture_free(rp);
+ else
+ cur_ops->deferred_free(rp);
+}
+
+static int rcu_no_completed(void)
+{
+ return 0;
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static void rcu_sync_torture_init(void)
+{
+ INIT_LIST_HEAD(&rcu_torture_removed);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .get_state = get_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .can_boost = rcu_can_boost(),
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+ return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+ .ttype = RCU_BH_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_bh_torture_deferred_free,
+ .sync = synchronize_rcu_bh,
+ .exp_sync = synchronize_rcu_bh_expedited,
+ .call = call_rcu_bh,
+ .cb_barrier = rcu_barrier_bh,
+ .fqs = rcu_bh_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh"
+};
+
+/*
+ * Don't even think about trying any of these in real life!!!
+ * The names includes "busted", and they really means it!
+ * The only purpose of these functions is to provide a buggy RCU
+ * implementation to make sure that rcutorture correctly emits
+ * buggy-RCU error messages.
+ */
+static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
+{
+ /* This is a deliberate bug for testing purposes only! */
+ rcu_torture_cb(&p->rtort_rcu);
+}
+
+static void synchronize_rcu_busted(void)
+{
+ /* This is a deliberate bug for testing purposes only! */
+}
+
+static void
+call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ /* This is a deliberate bug for testing purposes only! */
+ func(head);
+}
+
+static struct rcu_torture_ops rcu_busted_ops = {
+ .ttype = INVALID_RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_busted_torture_deferred_free,
+ .sync = synchronize_rcu_busted,
+ .exp_sync = synchronize_rcu_busted,
+ .call = call_rcu_busted,
+ .cb_barrier = NULL,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_busted"
+};
+
+/*
+ * Definitions for srcu torture testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl);
+
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+{
+ return srcu_read_lock(&srcu_ctl);
+}
+
+static void srcu_read_delay(struct torture_random_state *rrsp)
+{
+ long delay;
+ const long uspertick = 1000000 / HZ;
+ const long longdelay = 10;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = torture_random(rrsp) %
+ (nrealreaders * 2 * longdelay * uspertick);
+ if (!delay)
+ schedule_timeout_interruptible(longdelay);
+ else
+ rcu_read_delay(rrsp);
+}
+
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+{
+ srcu_read_unlock(&srcu_ctl, idx);
+}
+
+static int srcu_torture_completed(void)
+{
+ return srcu_batches_completed(&srcu_ctl);
+}
+
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+ call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
+static void srcu_torture_synchronize(void)
+{
+ synchronize_srcu(&srcu_ctl);
+}
+
+static void srcu_torture_call(struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ call_srcu(&srcu_ctl, head, func);
+}
+
+static void srcu_torture_barrier(void)
+{
+ srcu_barrier(&srcu_ctl);
+}
+
+static void srcu_torture_stats(char *page)
+{
+ int cpu;
+ int idx = srcu_ctl.completed & 0x1;
+
+ page += sprintf(page, "%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
+ for_each_possible_cpu(cpu) {
+ long c0, c1;
+
+ c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
+ }
+ sprintf(page, "\n");
+}
+
+static void srcu_torture_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(&srcu_ctl);
+}
+
+static struct rcu_torture_ops srcu_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched torture testing.
+ */
+
+static int sched_torture_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void sched_torture_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops sched_ops = {
+ .ttype = RCU_SCHED_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_sched_torture_deferred_free,
+ .sync = synchronize_sched,
+ .exp_sync = synchronize_sched_expedited,
+ .call = call_rcu_sched,
+ .cb_barrier = rcu_barrier_sched,
+ .fqs = rcu_sched_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "sched"
+};
+
+/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ init_rcu_head_on_stack(&rbi.rcu);
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (ULONG_CMP_LT(jiffies, oldstarttime)) {
+ schedule_timeout_interruptible(oldstarttime - jiffies);
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (ULONG_CMP_LT(jiffies, endtime)) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched();
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime &&
+ !kthread_should_stop()) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: stutter_wait("rcu_torture_boost");
+ } while (!torture_must_stop());
+
+ /* Clean up and exit. */
+ while (!kthread_should_stop() || rbi.inflight) {
+ torture_shutdown_absorb("rcu_torture_boost");
+ schedule_timeout_uninterruptible(1);
+ }
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ destroy_rcu_head_on_stack(&rbi.rcu);
+ torture_kthread_stopping("rcu_torture_boost");
+ return 0;
+}
+
+/*
+ * RCU torture force-quiescent-state kthread. Repeatedly induces
+ * bursts of calls to force_quiescent_state(), increasing the probability
+ * of occurrence of some important types of race conditions.
+ */
+static int
+rcu_torture_fqs(void *arg)
+{
+ unsigned long fqs_resume_time;
+ int fqs_burst_remaining;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
+ do {
+ fqs_resume_time = jiffies + fqs_stutter * HZ;
+ while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+ !kthread_should_stop()) {
+ schedule_timeout_interruptible(1);
+ }
+ fqs_burst_remaining = fqs_duration;
+ while (fqs_burst_remaining > 0 &&
+ !kthread_should_stop()) {
+ cur_ops->fqs();
+ udelay(fqs_holdoff);
+ fqs_burst_remaining -= fqs_holdoff;
+ }
+ stutter_wait("rcu_torture_fqs");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_fqs");
+ return 0;
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ unsigned long gp_snap;
+ bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
+ bool gp_sync1 = gp_sync;
+ int i;
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
+ RTWS_COND_GET, RTWS_SYNC };
+ int nsynctypes = 0;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+
+ /* Initialize synctype[] array. If none set, take default. */
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ synctype[nsynctypes++] = RTWS_COND_GET;
+ else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
+ pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
+ if (gp_exp1 && cur_ops->exp_sync)
+ synctype[nsynctypes++] = RTWS_EXP_SYNC;
+ else if (gp_exp && !cur_ops->exp_sync)
+ pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
+ if (gp_normal1 && cur_ops->deferred_free)
+ synctype[nsynctypes++] = RTWS_DEF_FREE;
+ else if (gp_normal && !cur_ops->deferred_free)
+ pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
+ if (gp_sync1 && cur_ops->sync)
+ synctype[nsynctypes++] = RTWS_SYNC;
+ else if (gp_sync && !cur_ops->sync)
+ pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ if (WARN_ONCE(nsynctypes == 0,
+ "rcu_torture_writer: No update-side primitives.\n")) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ }
+
+ do {
+ rcu_torture_writer_state = RTWS_FIXED_DELAY;
+ schedule_timeout_uninterruptible(1);
+ rp = rcu_torture_alloc();
+ if (rp == NULL)
+ continue;
+ rp->rtort_pipe_count = 0;
+ rcu_torture_writer_state = RTWS_DELAY;
+ udelay(torture_random(&rand) & 0x3ff);
+ rcu_torture_writer_state = RTWS_REPLACE;
+ old_rp = rcu_dereference_check(rcu_torture_current,
+ current == writer_task);
+ rp->rtort_mbtest = 1;
+ rcu_assign_pointer(rcu_torture_current, rp);
+ smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
+ if (old_rp) {
+ i = old_rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ old_rp->rtort_pipe_count++;
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ rcu_torture_writer_state = RTWS_DEF_FREE;
+ cur_ops->deferred_free(old_rp);
+ break;
+ case RTWS_EXP_SYNC:
+ rcu_torture_writer_state = RTWS_EXP_SYNC;
+ cur_ops->exp_sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET:
+ rcu_torture_writer_state = RTWS_COND_GET;
+ gp_snap = cur_ops->get_state();
+ i = torture_random(&rand) % 16;
+ if (i != 0)
+ schedule_timeout_interruptible(i);
+ udelay(torture_random(&rand) % 1000);
+ rcu_torture_writer_state = RTWS_COND_SYNC;
+ cur_ops->cond_sync(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_SYNC:
+ rcu_torture_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ }
+ rcutorture_record_progress(++rcu_torture_current_version);
+ rcu_torture_writer_state = RTWS_STUTTER;
+ stutter_wait("rcu_torture_writer");
+ } while (!torture_must_stop());
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ return 0;
+}
+
+/*
+ * RCU torture fake writer kthread. Repeatedly calls sync, with a random
+ * delay between calls.
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+ DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
+ udelay(torture_random(&rand) & 0x3ff);
+ if (cur_ops->cb_barrier != NULL &&
+ torture_random(&rand) % (nfakewriters * 8) == 0) {
+ cur_ops->cb_barrier();
+ } else if (gp_normal == gp_exp) {
+ if (torture_random(&rand) & 0x80)
+ cur_ops->sync();
+ else
+ cur_ops->exp_sync();
+ } else if (gp_normal) {
+ cur_ops->sync();
+ } else {
+ cur_ops->exp_sync();
+ }
+ stutter_wait("rcu_torture_fakewriter");
+ } while (!torture_must_stop());
+
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+}
+
+static void rcutorture_trace_dump(void)
+{
+ static atomic_t beenhere = ATOMIC_INIT(0);
+
+ if (atomic_read(&beenhere))
+ return;
+ if (atomic_xchg(&beenhere, 1) != 0)
+ return;
+ ftrace_dump(DUMP_ALL);
+}
+
+/*
+ * RCU torture reader from timer handler. Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+ int idx;
+ int completed;
+ int completed_end;
+ static DEFINE_TORTURE_RANDOM(rand);
+ static DEFINE_SPINLOCK(rand_lock);
+ struct rcu_torture *p;
+ int pipe_count;
+ unsigned long long ts;
+
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
+ p = rcu_dereference_check(rcu_torture_current,
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_sched_held() ||
+ srcu_read_lock_held(&srcu_ctl));
+ if (p == NULL) {
+ /* Leave because rcu_torture_writer is not yet underway */
+ cur_ops->readunlock(idx);
+ return;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ spin_lock(&rand_lock);
+ cur_ops->read_delay(&rand);
+ n_rcu_torture_timers++;
+ spin_unlock(&rand_lock);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+ completed, completed_end);
+ rcutorture_trace_dump();
+ }
+ __this_cpu_inc(rcu_torture_count[pipe_count]);
+ completed = completed_end - completed;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ __this_cpu_inc(rcu_torture_batch[completed]);
+ preempt_enable();
+ cur_ops->readunlock(idx);
+}
+
+/*
+ * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+ int completed;
+ int completed_end;
+ int idx;
+ DEFINE_TORTURE_RANDOM(rand);
+ struct rcu_torture *p;
+ int pipe_count;
+ struct timer_list t;
+ unsigned long long ts;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+ if (irqreader && cur_ops->irq_capable)
+ setup_timer_on_stack(&t, rcu_torture_timer, 0);
+
+ do {
+ if (irqreader && cur_ops->irq_capable) {
+ if (!timer_pending(&t))
+ mod_timer(&t, jiffies + 1);
+ }
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
+ p = rcu_dereference_check(rcu_torture_current,
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_sched_held() ||
+ srcu_read_lock_held(&srcu_ctl));
+ if (p == NULL) {
+ /* Wait for rcu_torture_writer to get underway */
+ cur_ops->readunlock(idx);
+ schedule_timeout_interruptible(HZ);
+ continue;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ cur_ops->read_delay(&rand);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+ ts, completed, completed_end);
+ rcutorture_trace_dump();
+ }
+ __this_cpu_inc(rcu_torture_count[pipe_count]);
+ completed = completed_end - completed;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ __this_cpu_inc(rcu_torture_batch[completed]);
+ preempt_enable();
+ cur_ops->readunlock(idx);
+ cond_resched();
+ stutter_wait("rcu_torture_reader");
+ } while (!torture_must_stop());
+ if (irqreader && cur_ops->irq_capable) {
+ del_timer_sync(&t);
+ destroy_timer_on_stack(&t);
+ }
+ torture_kthread_stopping("rcu_torture_reader");
+ return 0;
+}
+
+/*
+ * Create an RCU-torture statistics message in the specified buffer.
+ */
+static void
+rcu_torture_printk(char *page)
+{
+ int cpu;
+ int i;
+ long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ static unsigned long rtcv_snap = ULONG_MAX;
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+ batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ }
+ }
+ for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ if (pipesummary[i] != 0)
+ break;
+ }
+ page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page,
+ "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ rcu_torture_current,
+ rcu_torture_current_version,
+ list_empty(&rcu_torture_freelist),
+ atomic_read(&n_rcu_torture_alloc),
+ atomic_read(&n_rcu_torture_alloc_fail),
+ atomic_read(&n_rcu_torture_free));
+ page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
+ atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror);
+ page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
+ n_rcu_torture_timers);
+ page = torture_onoff_stats(page);
+ page += sprintf(page, "barrier: %ld/%ld:%ld",
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_barrier_error != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_failure != 0 ||
+ i > 1) {
+ page += sprintf(page, "!!! ");
+ atomic_inc(&n_rcu_torture_error);
+ WARN_ON_ONCE(1);
+ }
+ page += sprintf(page, "Reader Pipe: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ page += sprintf(page, " %ld", pipesummary[i]);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "Reader Batch: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ page += sprintf(page, " %ld", batchsummary[i]);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "Free-Block Circulation: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ page += sprintf(page, " %d",
+ atomic_read(&rcu_torture_wcount[i]));
+ }
+ page += sprintf(page, "\n");
+ if (cur_ops->stats)
+ cur_ops->stats(page);
+ if (rtcv_snap == rcu_torture_current_version &&
+ rcu_torture_current != NULL) {
+ int __maybe_unused flags;
+ unsigned long __maybe_unused gpnum;
+ unsigned long __maybe_unused completed;
+
+ rcutorture_get_gp_data(cur_ops->ttype,
+ &flags, &gpnum, &completed);
+ page += sprintf(page,
+ "??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
+ show_rcu_gp_kthreads();
+ rcutorture_trace_dump();
+ }
+ rtcv_snap = rcu_torture_current_version;
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+ int size = nr_cpu_ids * 200 + 8192;
+ char *buf;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("rcu-torture: Out of memory, need: %d", size);
+ return;
+ }
+ rcu_torture_printk(buf);
+ pr_alert("%s", buf);
+ kfree(buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ rcu_torture_stats_print();
+ torture_shutdown_absorb("rcu_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_stats");
+ return 0;
+}
+
+static inline void
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nreaders=%d nfakewriters=%d "
+ "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+ "shuffle_interval=%d stutter=%d irqreader=%d "
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d shutdown_secs=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d "
+ "n_barrier_cbs=%d "
+ "onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealreaders, nfakewriters,
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration, shutdown_secs,
+ stall_cpu, stall_cpu_holdoff,
+ n_barrier_cbs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ torture_stop_kthread(rcu_torture_boost, t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+ cpu_to_node(cpu),
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+/*
+ * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
+ * induces a CPU stall for the time specified by stall_cpu.
+ */
+static int rcu_torture_stall(void *args)
+{
+ unsigned long stop_at;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (stall_cpu_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
+ schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
+ VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
+ }
+ if (!kthread_should_stop()) {
+ stop_at = get_seconds() + stall_cpu;
+ /* RCU CPU stall is expected behavior in following code. */
+ pr_alert("rcu_torture_stall start.\n");
+ rcu_read_lock();
+ preempt_disable();
+ while (ULONG_CMP_LT(get_seconds(), stop_at))
+ continue; /* Induce RCU CPU stall warning. */
+ preempt_enable();
+ rcu_read_unlock();
+ pr_alert("rcu_torture_stall end.\n");
+ }
+ torture_shutdown_absorb("rcu_torture_stall");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(10 * HZ);
+ return 0;
+}
+
+/* Spawn CPU-stall kthread, if stall_cpu specified. */
+static int __init rcu_torture_stall_init(void)
+{
+ if (stall_cpu <= 0)
+ return 0;
+ return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
+}
+
+/* Callback function for RCU barrier testing. */
+static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+ atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+ long myid = (long)arg;
+ bool lastphase = 0;
+ bool newphase;
+ struct rcu_head rcu;
+
+ init_rcu_head_on_stack(&rcu);
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, MAX_NICE);
+ do {
+ wait_event(barrier_cbs_wq[myid],
+ (newphase =
+ ACCESS_ONCE(barrier_phase)) != lastphase ||
+ torture_must_stop());
+ lastphase = newphase;
+ smp_mb(); /* ensure barrier_phase load before ->call(). */
+ if (torture_must_stop())
+ break;
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ if (atomic_dec_and_test(&barrier_cbs_count))
+ wake_up(&barrier_wq);
+ } while (!torture_must_stop());
+ cur_ops->cb_barrier();
+ destroy_rcu_head_on_stack(&rcu);
+ torture_kthread_stopping("rcu_torture_barrier_cbs");
+ return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+ int i;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
+ do {
+ atomic_set(&barrier_cbs_invoked, 0);
+ atomic_set(&barrier_cbs_count, n_barrier_cbs);
+ smp_mb(); /* Ensure barrier_phase after prior assignments. */
+ barrier_phase = !barrier_phase;
+ for (i = 0; i < n_barrier_cbs; i++)
+ wake_up(&barrier_cbs_wq[i]);
+ wait_event(barrier_wq,
+ atomic_read(&barrier_cbs_count) == 0 ||
+ torture_must_stop());
+ if (torture_must_stop())
+ break;
+ n_barrier_attempts++;
+ cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
+ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+ n_rcu_torture_barrier_error++;
+ WARN_ON_ONCE(1);
+ }
+ n_barrier_successes++;
+ schedule_timeout_interruptible(HZ / 10);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_barrier");
+ return 0;
+}
+
+/* Initialize RCU barrier testing. */
+static int rcu_torture_barrier_init(void)
+{
+ int i;
+ int ret;
+
+ if (n_barrier_cbs == 0)
+ return 0;
+ if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+ pr_alert("%s" TORTURE_FLAG
+ " Call or barrier ops missing for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " RCU barrier testing omitted from run.\n",
+ torture_type);
+ return 0;
+ }
+ atomic_set(&barrier_cbs_count, 0);
+ atomic_set(&barrier_cbs_invoked, 0);
+ barrier_cbs_tasks =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ GFP_KERNEL);
+ barrier_cbs_wq =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+ GFP_KERNEL);
+ if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
+ return -ENOMEM;
+ for (i = 0; i < n_barrier_cbs; i++) {
+ init_waitqueue_head(&barrier_cbs_wq[i]);
+ ret = torture_create_kthread(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ barrier_cbs_tasks[i]);
+ if (ret)
+ return ret;
+ }
+ return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
+}
+
+/* Clean up after RCU barrier testing. */
+static void rcu_torture_barrier_cleanup(void)
+{
+ int i;
+
+ torture_stop_kthread(rcu_torture_barrier, barrier_task);
+ if (barrier_cbs_tasks != NULL) {
+ for (i = 0; i < n_barrier_cbs; i++)
+ torture_stop_kthread(rcu_torture_barrier_cbs,
+ barrier_cbs_tasks[i]);
+ kfree(barrier_cbs_tasks);
+ barrier_cbs_tasks = NULL;
+ }
+ if (barrier_cbs_wq != NULL) {
+ kfree(barrier_cbs_wq);
+ barrier_cbs_wq = NULL;
+ }
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
+static void
+rcu_torture_cleanup(void)
+{
+ int i;
+
+ rcutorture_record_test_transition();
+ if (torture_cleanup()) {
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+ return;
+ }
+
+ rcu_torture_barrier_cleanup();
+ torture_stop_kthread(rcu_torture_stall, stall_task);
+ torture_stop_kthread(rcu_torture_writer, writer_task);
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+ rcu_torture_current = NULL;
+
+ if (fakewriter_tasks) {
+ for (i = 0; i < nfakewriters; i++) {
+ torture_stop_kthread(rcu_torture_fakewriter,
+ fakewriter_tasks[i]);
+ }
+ kfree(fakewriter_tasks);
+ fakewriter_tasks = NULL;
+ }
+
+ torture_stop_kthread(rcu_torture_stats, stats_task);
+ torture_stop_kthread(rcu_torture_fqs, fqs_task);
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
+
+ /* Wait for all RCU callbacks to fire. */
+
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+
+ rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ rcu_torture_print_module_parms(cur_ops,
+ "End of test: RCU_HOTPLUG");
+ else
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
+}
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static void rcu_torture_leak_cb(struct rcu_head *rhp)
+{
+}
+
+static void rcu_torture_err_cb(struct rcu_head *rhp)
+{
+ /*
+ * This -might- happen due to race conditions, but is unlikely.
+ * The scenario that leads to this happening is that the
+ * first of the pair of duplicate callbacks is queued,
+ * someone else starts a grace period that includes that
+ * callback, then the second of the pair must wait for the
+ * next grace period. Unlikely, but can happen. If it
+ * does happen, the debug-objects subsystem won't have splatted.
+ */
+ pr_alert("rcutorture: duplicated callback was invoked.\n");
+}
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+/*
+ * Verify that double-free causes debug-objects to complain, but only
+ * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
+ * cannot be carried out.
+ */
+static void rcu_test_debug_objects(void)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ struct rcu_head rh1;
+ struct rcu_head rh2;
+
+ init_rcu_head_on_stack(&rh1);
+ init_rcu_head_on_stack(&rh2);
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+
+ /* Try to queue the rh2 pair of callbacks for the same grace period. */
+ preempt_disable(); /* Prevent preemption from interrupting test. */
+ rcu_read_lock(); /* Make it impossible to finish a grace period. */
+ call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ local_irq_disable(); /* Make it harder to start a new grace period. */
+ call_rcu(&rh2, rcu_torture_leak_cb);
+ call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ local_irq_enable();
+ rcu_read_unlock();
+ preempt_enable();
+
+ /* Wait for them all to get done so we can safely return. */
+ rcu_barrier();
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ destroy_rcu_head_on_stack(&rh1);
+ destroy_rcu_head_on_stack(&rh2);
+#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+ pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+}
+
+static int __init
+rcu_torture_init(void)
+{
+ int i;
+ int cpu;
+ int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] = {
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
+ };
+
+ if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("rcu-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cur_ops->fqs == NULL && fqs_duration != 0) {
+ pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
+ fqs_duration = 0;
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nreaders >= 0) {
+ nrealreaders = nreaders;
+ } else {
+ nrealreaders = num_online_cpus() - 1;
+ if (nrealreaders <= 0)
+ nrealreaders = 1;
+ }
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
+
+ /* Set up the freelist. */
+
+ INIT_LIST_HEAD(&rcu_torture_freelist);
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
+ rcu_tortures[i].rtort_mbtest = 0;
+ list_add_tail(&rcu_tortures[i].rtort_free,
+ &rcu_torture_freelist);
+ }
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ rcu_torture_current = NULL;
+ rcu_torture_current_version = 0;
+ atomic_set(&n_rcu_torture_alloc, 0);
+ atomic_set(&n_rcu_torture_alloc_fail, 0);
+ atomic_set(&n_rcu_torture_free, 0);
+ atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_barrier_error = 0;
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ atomic_set(&rcu_torture_wcount[i], 0);
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ per_cpu(rcu_torture_count, cpu)[i] = 0;
+ per_cpu(rcu_torture_batch, cpu)[i] = 0;
+ }
+ }
+
+ /* Start up the kthreads. */
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (firsterr)
+ goto unwind;
+ fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nfakewriters; i++) {
+ firsterr = torture_create_kthread(rcu_torture_fakewriter,
+ NULL, fakewriter_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealreaders; i++) {
+ firsterr = torture_create_kthread(rcu_torture_reader, NULL,
+ reader_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(rcu_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ if (test_no_idle_hz) {
+ firsterr = torture_shuffle_init(shuffle_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter < 0)
+ stutter = 0;
+ if (stutter) {
+ firsterr = torture_stutter_init(stutter * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (fqs_duration < 0)
+ fqs_duration = 0;
+ if (fqs_duration) {
+ /* Create the fqs thread */
+ firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
+ fqs_task);
+ if (firsterr)
+ goto unwind;
+ }
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ firsterr = rcutorture_booster_init(i);
+ if (firsterr)
+ goto unwind;
+ }
+ }
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_stall_init();
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_barrier_init();
+ if (firsterr)
+ goto unwind;
+ if (object_debug)
+ rcu_test_debug_objects();
+ rcutorture_record_test_transition();
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ rcu_torture_cleanup();
+ return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
new file mode 100644
index 00000000000..c639556f3fa
--- /dev/null
+++ b/kernel/rcu/srcu.c
@@ -0,0 +1,693 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/srcu.h>
+
+#include "rcu.h"
+
+/*
+ * Initialize an rcu_batch structure to empty.
+ */
+static inline void rcu_batch_init(struct rcu_batch *b)
+{
+ b->head = NULL;
+ b->tail = &b->head;
+}
+
+/*
+ * Enqueue a callback onto the tail of the specified rcu_batch structure.
+ */
+static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
+{
+ *b->tail = head;
+ b->tail = &head->next;
+}
+
+/*
+ * Is the specified rcu_batch structure empty?
+ */
+static inline bool rcu_batch_empty(struct rcu_batch *b)
+{
+ return b->tail == &b->head;
+}
+
+/*
+ * Remove the callback at the head of the specified rcu_batch structure
+ * and return a pointer to it, or return NULL if the structure is empty.
+ */
+static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
+{
+ struct rcu_head *head;
+
+ if (rcu_batch_empty(b))
+ return NULL;
+
+ head = b->head;
+ b->head = head->next;
+ if (b->tail == &head->next)
+ rcu_batch_init(b);
+
+ return head;
+}
+
+/*
+ * Move all callbacks from the rcu_batch structure specified by "from" to
+ * the structure specified by "to".
+ */
+static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
+{
+ if (!rcu_batch_empty(from)) {
+ *to->tail = from->head;
+ to->tail = from->tail;
+ rcu_batch_init(from);
+ }
+}
+
+static int init_srcu_struct_fields(struct srcu_struct *sp)
+{
+ sp->completed = 0;
+ spin_lock_init(&sp->queue_lock);
+ sp->running = false;
+ rcu_batch_init(&sp->batch_queue);
+ rcu_batch_init(&sp->batch_check0);
+ rcu_batch_init(&sp->batch_check1);
+ rcu_batch_init(&sp->batch_done);
+ INIT_DELAYED_WORK(&sp->work, process_srcu);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ return sp->per_cpu_ref ? 0 : -ENOMEM;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+ lockdep_init_map(&sp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
+
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
+ * Returns approximate number of readers active on the specified rank
+ * of the per-CPU ->c[] counters.
+ */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
+
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero. An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement. This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+ unsigned long seq;
+
+ seq = srcu_readers_seq_idx(sp, idx);
+
+ /*
+ * The following smp_mb() A pairs with the smp_mb() B located in
+ * __srcu_read_lock(). This pairing ensures that if an
+ * __srcu_read_lock() increments its counter after the summation
+ * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+ * critical section will see any changes made prior to the start
+ * of the current SRCU grace period.
+ *
+ * Also, if the above call to srcu_readers_seq_idx() saw the
+ * increment of ->seq[], then the call to srcu_readers_active_idx()
+ * must see the increment of ->c[].
+ */
+ smp_mb(); /* A */
+
+ /*
+ * Note that srcu_readers_active_idx() can incorrectly return
+ * zero even though there is a pre-existing reader throughout.
+ * To see this, suppose that task A is in a very long SRCU
+ * read-side critical section that started on CPU 0, and that
+ * no other reader exists, so that the sum of the counters
+ * is equal to one. Then suppose that task B starts executing
+ * srcu_readers_active_idx(), summing up to CPU 1, and then that
+ * task C starts reading on CPU 0, so that its increment is not
+ * summed, but finishes reading on CPU 2, so that its decrement
+ * -is- summed. Then when task B completes its sum, it will
+ * incorrectly get zero, despite the fact that task A has been
+ * in its SRCU read-side critical section the whole time.
+ *
+ * We therefore do a validation step should srcu_readers_active_idx()
+ * return zero.
+ */
+ if (srcu_readers_active_idx(sp, idx) != 0)
+ return false;
+
+ /*
+ * The remainder of this function is the validation step.
+ * The following smp_mb() D pairs with the smp_mb() C in
+ * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
+ * by srcu_readers_active_idx() above, then any destructive
+ * operation performed after the grace period will happen after
+ * the corresponding SRCU read-side critical section.
+ *
+ * Note that there can be at most NR_CPUS worth of readers using
+ * the old index, which is not enough to overflow even a 32-bit
+ * integer. (Yes, this does mean that systems having more than
+ * a billion or so CPUs need to be 64-bit systems.) Therefore,
+ * the sum of the ->seq[] counters cannot possibly overflow.
+ * Therefore, the only way that the return values of the two
+ * calls to srcu_readers_seq_idx() can be equal is if there were
+ * no increments of the corresponding rank of ->seq[] counts
+ * in the interim. But the missed-increment scenario laid out
+ * above includes an increment of the ->seq[] counter by
+ * the corresponding __srcu_read_lock(). Therefore, if this
+ * scenario occurs, the return values from the two calls to
+ * srcu_readers_seq_idx() will differ, and thus the validation
+ * step below suffices.
+ */
+ smp_mb(); /* D */
+
+ return srcu_readers_seq_idx(sp, idx) == seq;
+}
+
+/**
+ * srcu_readers_active - returns approximate number of readers.
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static int srcu_readers_active(struct srcu_struct *sp)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ }
+ return sum;
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ if (WARN_ON(srcu_readers_active(sp)))
+ return; /* Leakage unless caller handles error. */
+ free_percpu(sp->per_cpu_ref);
+ sp->per_cpu_ref = NULL;
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Must be called from process context.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = ACCESS_ONCE(sp->completed) & 0x1;
+ preempt_disable();
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
+ preempt_enable();
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ * Must be called from process context.
+ */
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ this_cpu_dec(sp->per_cpu_ref->c[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections. If there are still some readers after 10 microseconds,
+ * we repeatedly block for 1-millisecond time periods. This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SRCU_RETRY_CHECK_DELAY 5
+#define SYNCHRONIZE_SRCU_TRYCOUNT 2
+#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
+
+/*
+ * @@@ Wait until all pre-existing readers complete. Such readers
+ * will have used the index specified by "idx".
+ * the caller should ensures the ->completed is not changed while checking
+ * and idx = (->completed & 1) ^ 1
+ */
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+{
+ for (;;) {
+ if (srcu_readers_active_idx_check(sp, idx))
+ return true;
+ if (--trycount <= 0)
+ return false;
+ udelay(SRCU_RETRY_CHECK_DELAY);
+ }
+}
+
+/*
+ * Increment the ->completed counter so that future SRCU readers will
+ * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+ sp->completed++;
+}
+
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical section. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_rcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_rcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_rcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ unsigned long flags;
+
+ head->next = NULL;
+ head->func = func;
+ spin_lock_irqsave(&sp->queue_lock, flags);
+ rcu_batch_queue(&sp->batch_queue, head);
+ if (!sp->running) {
+ sp->running = true;
+ queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
+ }
+ spin_unlock_irqrestore(&sp->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+struct rcu_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
+
+/*
+ * Awaken the corresponding synchronize_srcu() instance now that a
+ * grace period has elapsed.
+ */
+static void wakeme_after_rcu(struct rcu_head *head)
+{
+ struct rcu_synchronize *rcu;
+
+ rcu = container_of(head, struct rcu_synchronize, head);
+ complete(&rcu->completion);
+}
+
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
+static void srcu_reschedule(struct srcu_struct *sp);
+
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
+{
+ struct rcu_synchronize rcu;
+ struct rcu_head *head = &rcu.head;
+ bool done = false;
+
+ rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
+ !lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+
+ might_sleep();
+ init_completion(&rcu.completion);
+
+ head->next = NULL;
+ head->func = wakeme_after_rcu;
+ spin_lock_irq(&sp->queue_lock);
+ if (!sp->running) {
+ /* steal the processing owner */
+ sp->running = true;
+ rcu_batch_queue(&sp->batch_check0, head);
+ spin_unlock_irq(&sp->queue_lock);
+
+ srcu_advance_batches(sp, trycount);
+ if (!rcu_batch_empty(&sp->batch_done)) {
+ BUG_ON(sp->batch_done.head != head);
+ rcu_batch_dequeue(&sp->batch_done);
+ done = true;
+ }
+ /* give the processing owner to work_struct */
+ srcu_reschedule(sp);
+ } else {
+ rcu_batch_queue(&sp->batch_queue, head);
+ spin_unlock_irq(&sp->queue_lock);
+ }
+
+ if (!done)
+ wait_for_completion(&rcu.completion);
+}
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for the count to drain to zero of both indexes. To avoid the
+ * possible starvation of synchronize_srcu(), it waits for the count of
+ * the index=((->completed & 1) ^ 1) to drain to zero at first,
+ * and then flip the completed and wait for the count of the other index.
+ *
+ * Can block; must be called from process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section,
+ * as long as the resulting graph of srcu_structs is acyclic.
+ *
+ * There are memory-ordering constraints implied by synchronize_srcu().
+ * On systems with more than one CPU, when synchronize_srcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last corresponding SRCU-sched read-side critical section
+ * whose beginning preceded the call to synchronize_srcu(). In addition,
+ * each CPU having an SRCU read-side critical section that extends beyond
+ * the return from synchronize_srcu() is guaranteed to have executed a
+ * full memory barrier after the beginning of synchronize_srcu() and before
+ * the beginning of that SRCU read-side critical section. Note that these
+ * guarantees include CPUs that are offline, idle, or executing in user mode,
+ * as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_srcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
+ * are the same CPU, but again only if the system has more than one CPU.
+ *
+ * Of course, these memory-ordering guarantees apply only when
+ * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
+ * passed the same srcu_struct structure.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ __synchronize_srcu(sp, rcu_expedited
+ ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
+ : SYNCHRONIZE_SRCU_TRYCOUNT);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/**
+ * synchronize_srcu_expedited - Brute-force SRCU grace period
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
+ *
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+ __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ * @sp: srcu_struct on which to wait for in-flight callbacks.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+ synchronize_srcu(sp);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return sp->completed;
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+#define SRCU_CALLBACK_BATCH 10
+#define SRCU_INTERVAL 1
+
+/*
+ * Move any new SRCU callbacks to the first stage of the SRCU grace
+ * period pipeline.
+ */
+static void srcu_collect_new(struct srcu_struct *sp)
+{
+ if (!rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
+ spin_unlock_irq(&sp->queue_lock);
+ }
+}
+
+/*
+ * Core SRCU state machine. Advance callbacks from ->batch_check0 to
+ * ->batch_check1 and then to ->batch_done as readers drain.
+ */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
+{
+ int idx = 1 ^ (sp->completed & 1);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->completed for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ */
+
+ if (rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_check1))
+ return; /* no callbacks need to be advanced */
+
+ if (!try_check_zero(sp, idx, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have already done with their
+ * first zero check and flip back when they were enqueued on
+ * ->batch_check0 in a previous invocation of srcu_advance_batches().
+ * (Presumably try_check_zero() returned false during that
+ * invocation, leaving the callbacks stranded on ->batch_check1.)
+ * They are therefore ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+
+ if (rcu_batch_empty(&sp->batch_check0))
+ return; /* no callbacks need to be advanced */
+ srcu_flip(sp);
+
+ /*
+ * The callbacks in ->batch_check0 just finished their
+ * first check zero and flip, so move them to ->batch_check1
+ * for future checking on the other idx.
+ */
+ rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
+
+ /*
+ * SRCU read-side critical sections are normally short, so check
+ * at least twice in quick succession after a flip.
+ */
+ trycount = trycount < 2 ? 2 : trycount;
+ if (!try_check_zero(sp, idx^1, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have now waited for all
+ * pre-existing readers using both idx values. They are therefore
+ * ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue.
+ */
+static void srcu_invoke_callbacks(struct srcu_struct *sp)
+{
+ int i;
+ struct rcu_head *head;
+
+ for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
+ head = rcu_batch_dequeue(&sp->batch_done);
+ if (!head)
+ break;
+ local_bh_disable();
+ head->func(head);
+ local_bh_enable();
+ }
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp)
+{
+ bool pending = true;
+
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ sp->running = false;
+ pending = false;
+ }
+ spin_unlock_irq(&sp->queue_lock);
+ }
+
+ if (pending)
+ queue_delayed_work(system_power_efficient_wq,
+ &sp->work, SRCU_INTERVAL);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+void process_srcu(struct work_struct *work)
+{
+ struct srcu_struct *sp;
+
+ sp = container_of(work, struct srcu_struct, work.work);
+
+ srcu_collect_new(sp);
+ srcu_advance_batches(sp, 1);
+ srcu_invoke_callbacks(sp);
+ srcu_reschedule(sp);
+}
+EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index 977296dca0a..d9efcc13008 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -35,46 +35,46 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-
-#ifdef CONFIG_RCU_TRACE
-#include <trace/events/rcu.h>
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
+#include <linux/ftrace_event.h>
#include "rcu.h"
-/* Forward declarations for rcutiny_plugin.h. */
+/* Forward declarations for tiny_plugin.h. */
struct rcu_ctrlblk;
-static void invoke_rcu_callbacks(void);
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-#include "rcutiny_plugin.h"
+static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
+#include "tiny_plugin.h"
/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
-static void rcu_idle_enter_common(long long oldval)
+static void rcu_idle_enter_common(long long newval)
{
- if (rcu_dynticks_nesting) {
- RCU_TRACE(trace_rcu_dyntick("--=",
- oldval, rcu_dynticks_nesting));
+ if (newval) {
+ RCU_TRACE(trace_rcu_dyntick(TPS("--="),
+ rcu_dynticks_nesting, newval));
+ rcu_dynticks_nesting = newval;
return;
}
- RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
+ RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
+ rcu_dynticks_nesting, newval));
if (!is_idle_task(current)) {
- struct task_struct *idle = idle_task(smp_processor_id());
+ struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
- RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
- oldval, rcu_dynticks_nesting));
+ RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
+ rcu_dynticks_nesting, newval));
ftrace_dump(DUMP_ALL);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
}
rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+ barrier();
+ rcu_dynticks_nesting = newval;
}
/*
@@ -84,14 +84,19 @@ static void rcu_idle_enter_common(long long oldval)
void rcu_idle_enter(void)
{
unsigned long flags;
- long long oldval;
+ long long newval;
local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- rcu_dynticks_nesting = 0;
- rcu_idle_enter_common(oldval);
+ WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
+ if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
+ DYNTICK_TASK_NEST_VALUE)
+ newval = 0;
+ else
+ newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
+ rcu_idle_enter_common(newval);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
/*
* Exit an interrupt handler towards idle.
@@ -99,29 +104,29 @@ void rcu_idle_enter(void)
void rcu_irq_exit(void)
{
unsigned long flags;
- long long oldval;
+ long long newval;
local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- rcu_dynticks_nesting--;
- WARN_ON_ONCE(rcu_dynticks_nesting < 0);
- rcu_idle_enter_common(oldval);
+ newval = rcu_dynticks_nesting - 1;
+ WARN_ON_ONCE(newval < 0);
+ rcu_idle_enter_common(newval);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_irq_exit);
/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
static void rcu_idle_exit_common(long long oldval)
{
if (oldval) {
- RCU_TRACE(trace_rcu_dyntick("++=",
+ RCU_TRACE(trace_rcu_dyntick(TPS("++="),
oldval, rcu_dynticks_nesting));
return;
}
- RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
+ RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
if (!is_idle_task(current)) {
- struct task_struct *idle = idle_task(smp_processor_id());
+ struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
- RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
+ RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
oldval, rcu_dynticks_nesting));
ftrace_dump(DUMP_ALL);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -140,11 +145,15 @@ void rcu_idle_exit(void)
local_irq_save(flags);
oldval = rcu_dynticks_nesting;
- WARN_ON_ONCE(oldval != 0);
- rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
+ WARN_ON_ONCE(rcu_dynticks_nesting < 0);
+ if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
+ rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
+ else
+ rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_idle_exit_common(oldval);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
/*
* Enter an interrupt handler, moving away from idle.
@@ -161,28 +170,29 @@ void rcu_irq_enter(void)
rcu_idle_exit_common(oldval);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_irq_enter);
-#ifdef CONFIG_PROVE_RCU
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
/*
* Test whether RCU thinks that the current CPU is idle.
*/
-int rcu_is_cpu_idle(void)
+bool notrace __rcu_is_watching(void)
{
- return !rcu_dynticks_nesting;
+ return rcu_dynticks_nesting;
}
-EXPORT_SYMBOL(rcu_is_cpu_idle);
+EXPORT_SYMBOL(__rcu_is_watching);
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
/*
* Test whether the current CPU was interrupted from idle. Nested
* interrupts don't count, we must be running at the first interrupt
* level.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
- return rcu_dynticks_nesting <= 0;
+ return rcu_dynticks_nesting <= 1;
}
/*
@@ -192,6 +202,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
+ RCU_TRACE(reset_cpu_stall_ticks(rcp));
if (rcp->rcucblist != NULL &&
rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
@@ -213,7 +224,7 @@ void rcu_sched_qs(int cpu)
local_irq_save(flags);
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- invoke_rcu_callbacks();
+ raise_softirq(RCU_SOFTIRQ);
local_irq_restore(flags);
}
@@ -226,7 +237,7 @@ void rcu_bh_qs(int cpu)
local_irq_save(flags);
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- invoke_rcu_callbacks();
+ raise_softirq(RCU_SOFTIRQ);
local_irq_restore(flags);
}
@@ -238,11 +249,11 @@ void rcu_bh_qs(int cpu)
*/
void rcu_check_callbacks(int cpu, int user)
{
+ RCU_TRACE(check_cpu_stalls());
if (user || rcu_is_cpu_rrupt_from_idle())
rcu_sched_qs(cpu);
else if (!in_softirq())
rcu_bh_qs(cpu);
- rcu_preempt_check_callbacks();
}
/*
@@ -251,31 +262,30 @@ void rcu_check_callbacks(int cpu, int user)
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
- char *rn = NULL;
+ const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail) {
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
- ACCESS_ONCE(rcp->rcucblist),
+ !!ACCESS_ONCE(rcp->rcucblist),
need_resched(),
is_idle_task(current),
- rcu_is_callbacks_kthread()));
+ false));
return;
}
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
if (rcp->curtail == rcp->donetail)
rcp->curtail = &rcp->rcucblist;
- rcu_preempt_remove_callbacks(rcp);
rcp->donetail = &rcp->rcucblist;
local_irq_restore(flags);
@@ -292,16 +302,16 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
RCU_TRACE(cb_count++);
}
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
- RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
+ RCU_TRACE(trace_rcu_batch_end(rcp->name,
+ cb_count, 0, need_resched(),
is_idle_task(current),
- rcu_is_callbacks_kthread()));
+ false));
}
static void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
- rcu_preempt_process_callbacks();
}
/*
@@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
void synchronize_sched(void)
{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU read-side critical section");
cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -363,3 +377,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
__call_rcu(head, func, &rcu_bh_ctrlblk);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+void rcu_init(void)
+{
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
new file mode 100644
index 00000000000..858c5656912
--- /dev/null
+++ b/kernel/rcu/tiny_plugin.h
@@ -0,0 +1,174 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (c) 2010 Linaro
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+ RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
+ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
+ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
+ RCU_TRACE(const char *name); /* Name of RCU type. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+ RCU_TRACE(.name = "rcu_sched")
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+ RCU_TRACE(.name = "rcu_bh")
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#include <linux/kernel_stat.h>
+
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
+/*
+ * During boot, we forgive RCU lockdep issues. After this function is
+ * invoked, we start taking RCU lockdep issues seriously.
+ */
+void __init rcu_scheduler_starting(void)
+{
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_TRACE
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcp->qlen -= n;
+ local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+ seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+ seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+ return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = show_tiny_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
+ NULL, &show_tiny_stats_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long j;
+ unsigned long js;
+
+ if (rcu_cpu_stall_suppress)
+ return;
+ rcp->ticks_this_gp++;
+ j = jiffies;
+ js = ACCESS_ONCE(rcp->jiffies_stall);
+ if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
+ rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ jiffies - rcp->gp_start, rcp->qlen);
+ dump_stack();
+ }
+ if (*rcp->curtail && ULONG_CMP_GE(j, js))
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ else if (ULONG_CMP_GE(j, js))
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+}
+
+static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
+{
+ rcp->ticks_this_gp = 0;
+ rcp->gp_start = jiffies;
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+}
+
+static void check_cpu_stalls(void)
+{
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
new file mode 100644
index 00000000000..625d0b0cd75
--- /dev/null
+++ b/kernel/rcu/tree.c
@@ -0,0 +1,3744 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ * Manfred Spraul <manfred@colorfullife.com>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+#include <linux/kernel_stat.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
+#include <linux/random.h>
+#include <linux/ftrace_event.h>
+#include <linux/suspend.h>
+
+#include "tree.h"
+#include "rcu.h"
+
+MODULE_ALIAS("rcutree");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcutree."
+
+/* Data structures. */
+
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+
+/*
+ * In order to export the rcu_state name to the tracing tools, it
+ * needs to be added in the __tracepoint_string section.
+ * This requires defining a separate variable tp_<sname>_varname
+ * that points to the string being used, and this will allow
+ * the tracing userspace tools to be able to decipher the string
+ * address to the matching string.
+ */
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+static char sname##_varname[] = #sname; \
+static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
+struct rcu_state sname##_state = { \
+ .level = { &sname##_state.node[0] }, \
+ .call = cr, \
+ .fqs_state = RCU_GP_IDLE, \
+ .gpnum = 0UL - 300UL, \
+ .completed = 0UL - 300UL, \
+ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
+ .orphan_nxttail = &sname##_state.orphan_nxtlist, \
+ .orphan_donetail = &sname##_state.orphan_donelist, \
+ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
+ .name = sname##_varname, \
+ .abbr = sabbr, \
+}; \
+DEFINE_PER_CPU(struct rcu_data, sname##_data)
+
+RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
+RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
+
+static struct rcu_state *rcu_state_p;
+LIST_HEAD(rcu_struct_flavors);
+
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0444);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
+ NUM_RCU_LVL_0,
+ NUM_RCU_LVL_1,
+ NUM_RCU_LVL_2,
+ NUM_RCU_LVL_3,
+ NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
+
+/*
+ * The rcu_scheduler_active variable transitions from zero to one just
+ * before the first task is spawned. So when this variable is zero, RCU
+ * can assume that there is but one task, allowing RCU to (for example)
+ * optimize synchronize_sched() to a simple barrier(). When this variable
+ * is one, RCU must actually do all the hard work required to detect real
+ * grace periods. This variable is also used to suppress boot-time false
+ * positives from lockdep-RCU error checking.
+ */
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
+/*
+ * The rcu_scheduler_fully_active variable transitions from zero to one
+ * during the early_initcall() processing, which is after the scheduler
+ * is capable of creating new tasks. So RCU processing (for example,
+ * creating tasks for RCU priority boosting) must be delayed until after
+ * rcu_scheduler_fully_active transitions from zero to one. We also
+ * currently delay invocation of any RCU callbacks until after this point.
+ *
+ * It might later prove better for people registering RCU callbacks during
+ * early boot to take responsibility for these callbacks, but one step at
+ * a time.
+ */
+static int rcu_scheduler_fully_active __read_mostly;
+
+#ifdef CONFIG_RCU_BOOST
+
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void invoke_rcu_core(void);
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+
+/*
+ * Track the rcutorture test sequence number and the update version
+ * number within a given test. The rcutorture_testseq is incremented
+ * on every rcutorture module load and unload, so has an odd value
+ * when a test is running. The rcutorture_vernum is set to zero
+ * when rcutorture starts and is incremented on each rcutorture update.
+ * These variables enable correlating rcutorture output with the
+ * RCU tracing information.
+ */
+unsigned long rcutorture_testseq;
+unsigned long rcutorture_vernum;
+
+/*
+ * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
+ * permit this function to be invoked without holding the root rcu_node
+ * structure's ->lock, but of course results can be subject to change.
+ */
+static int rcu_gp_in_progress(struct rcu_state *rsp)
+{
+ return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Note a quiescent state. Because we do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period, this just sets a flag.
+ * The caller must have disabled preemption.
+ */
+void rcu_sched_qs(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
+
+ if (rdp->passed_quiesce == 0)
+ trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
+ rdp->passed_quiesce = 1;
+}
+
+void rcu_bh_qs(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+
+ if (rdp->passed_quiesce == 0)
+ trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
+ rdp->passed_quiesce = 1;
+}
+
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+ .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+ .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state. This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp;
+ int resched_mask;
+ struct rcu_state *rsp;
+
+ local_irq_save(flags);
+
+ /*
+ * Yes, we can lose flag-setting operations. This is OK, because
+ * the flag will be set again after some delay.
+ */
+ resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+ raw_cpu_write(rcu_sched_qs_mask, 0);
+
+ /* Find the flavor that needs a quiescent state. */
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (!(resched_mask & rsp->flavor_mask))
+ continue;
+ smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+ if (ACCESS_ONCE(rdp->mynode->completed) !=
+ ACCESS_ONCE(rdp->cond_resched_completed))
+ continue;
+
+ /*
+ * Pretend to be momentarily idle for the quiescent state.
+ * This allows the grace-period kthread to record the
+ * quiescent state, with no need for this CPU to do anything
+ * further.
+ */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ smp_mb__before_atomic(); /* Earlier stuff before QS. */
+ atomic_add(2, &rdtp->dynticks); /* QS. */
+ smp_mb__after_atomic(); /* Later stuff after QS. */
+ break;
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Note a context switch. This is a quiescent state for RCU-sched,
+ * and requires special handling for preemptible RCU.
+ * The caller must have disabled preemption.
+ */
+void rcu_note_context_switch(int cpu)
+{
+ trace_rcu_utilization(TPS("Start context switch"));
+ rcu_sched_qs(cpu);
+ rcu_preempt_note_context_switch(cpu);
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
+ trace_rcu_utilization(TPS("End context switch"));
+}
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+
+static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
+static long qhimark = 10000; /* If this many pending, ignore blimit. */
+static long qlowmark = 100; /* Once only this many pending, use blimit. */
+
+module_param(blimit, long, 0444);
+module_param(qhimark, long, 0444);
+module_param(qlowmark, long, 0444);
+
+static ulong jiffies_till_first_fqs = ULONG_MAX;
+static ulong jiffies_till_next_fqs = ULONG_MAX;
+
+module_param(jiffies_till_first_fqs, ulong, 0644);
+module_param(jiffies_till_next_fqs, ulong, 0644);
+
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
+static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp);
+static void force_qs_rnp(struct rcu_state *rsp,
+ int (*f)(struct rcu_data *rsp, bool *isidle,
+ unsigned long *maxj),
+ bool *isidle, unsigned long *maxj);
+static void force_quiescent_state(struct rcu_state *rsp);
+static int rcu_pending(int cpu);
+
+/*
+ * Return the number of RCU-sched batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_sched(void)
+{
+ return rcu_sched_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+ return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Force a quiescent state.
+ */
+void rcu_force_quiescent_state(void)
+{
+ force_quiescent_state(rcu_state_p);
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+/*
+ * Force a quiescent state for RCU BH.
+ */
+void rcu_bh_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ pr_info("%s: wait state: %d ->state: %#lx\n",
+ rsp->name, rsp->gp_state, rsp->gp_kthread->state);
+ /* sched_show_task(rsp->gp_kthread); */
+ }
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
+ * Record the number of times rcutorture tests have been initiated and
+ * terminated. This information allows the debugfs tracing stats to be
+ * correlated to the rcutorture messages, even when the rcutorture module
+ * is being repeatedly loaded and unloaded. In other words, we cannot
+ * store this state in rcutorture itself.
+ */
+void rcutorture_record_test_transition(void)
+{
+ rcutorture_testseq++;
+ rcutorture_vernum = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
+
+/*
+ * Send along grace-period-related data for rcutorture diagnostics.
+ */
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed)
+{
+ struct rcu_state *rsp = NULL;
+
+ switch (test_type) {
+ case RCU_FLAVOR:
+ rsp = rcu_state_p;
+ break;
+ case RCU_BH_FLAVOR:
+ rsp = &rcu_bh_state;
+ break;
+ case RCU_SCHED_FLAVOR:
+ rsp = &rcu_sched_state;
+ break;
+ default:
+ break;
+ }
+ if (rsp != NULL) {
+ *flags = ACCESS_ONCE(rsp->gp_flags);
+ *gpnum = ACCESS_ONCE(rsp->gpnum);
+ *completed = ACCESS_ONCE(rsp->completed);
+ return;
+ }
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+
+/*
+ * Record the number of writer passes through the current rcutorture test.
+ * This is also used to correlate debugfs tracing stats with the rcutorture
+ * messages.
+ */
+void rcutorture_record_progress(unsigned long vernum)
+{
+ rcutorture_vernum++;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_progress);
+
+/*
+ * Force a quiescent state for RCU-sched.
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+ return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
+ rdp->nxttail[RCU_DONE_TAIL] != NULL;
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+ return &rsp->node[0];
+}
+
+/*
+ * Is there any need for future grace periods?
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_future_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int *fp = &rnp->need_future_gp[idx];
+
+ return ACCESS_ONCE(*fp);
+}
+
+/*
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ int i;
+
+ if (rcu_gp_in_progress(rsp))
+ return 0; /* No, a grace period is already in progress. */
+ if (rcu_future_needs_gp(rsp))
+ return 1; /* Yes, a no-CBs CPU needs one. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL])
+ return 0; /* No, this is a no-CBs (or offline) CPU. */
+ if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ return 1; /* Yes, this CPU has newly registered callbacks. */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+ ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ rdp->nxtcompleted[i]))
+ return 1; /* Yes, CBs for future grace period. */
+ return 0; /* No grace period needed. */
+}
+
+/*
+ * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
+ *
+ * If the new value of the ->dynticks_nesting counter now is zero,
+ * we really have entered idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
+ bool user)
+{
+ struct rcu_state *rsp;
+ struct rcu_data *rdp;
+
+ trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
+ if (!user && !is_idle_task(current)) {
+ struct task_struct *idle __maybe_unused =
+ idle_task(smp_processor_id());
+
+ trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
+ ftrace_dump(DUMP_ORIG);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ do_nocb_deferred_wakeup(rdp);
+ }
+ rcu_prepare_for_idle(smp_processor_id());
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic(); /* Force ordering with next sojourn. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+
+ /*
+ * It is illegal to enter an extended quiescent state while
+ * in an RCU read-side critical section.
+ */
+ rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
+ "Illegal idle entry in RCU read-side critical section.");
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
+ "Illegal idle entry in RCU-bh read-side critical section.");
+ rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
+ "Illegal idle entry in RCU-sched read-side critical section.");
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ */
+static void rcu_eqs_enter(bool user)
+{
+ long long oldval;
+ struct rcu_dynticks *rdtp;
+
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
+ rdtp->dynticks_nesting = 0;
+ rcu_eqs_enter_common(rdtp, oldval, user);
+ } else {
+ rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
+ }
+}
+
+/**
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * We crowbar the ->dynticks_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+void rcu_idle_enter(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_eqs_enter(false);
+ rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
+
+#ifdef CONFIG_RCU_USER_QS
+/**
+ * rcu_user_enter - inform RCU that we are resuming userspace.
+ *
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_user_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+void rcu_user_enter(void)
+{
+ rcu_eqs_enter(1);
+}
+#endif /* CONFIG_RCU_USER_QS */
+
+/**
+ * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture violates this assumption, RCU will give you what you
+ * deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_exit(void)
+{
+ unsigned long flags;
+ long long oldval;
+ struct rcu_dynticks *rdtp;
+
+ local_irq_save(flags);
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting--;
+ WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ if (rdtp->dynticks_nesting)
+ trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
+ else
+ rcu_eqs_enter_common(rdtp, oldval, true);
+ rcu_sysidle_enter(rdtp, 1);
+ local_irq_restore(flags);
+}
+
+/*
+ * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
+ *
+ * If the new value of the ->dynticks_nesting counter was previously zero,
+ * we really have exited idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
+ int user)
+{
+ smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ rcu_cleanup_after_idle(smp_processor_id());
+ trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
+ if (!user && !is_idle_task(current)) {
+ struct task_struct *idle __maybe_unused =
+ idle_task(smp_processor_id());
+
+ trace_rcu_dyntick(TPS("Error on exit: not idle task"),
+ oldval, rdtp->dynticks_nesting);
+ ftrace_dump(DUMP_ORIG);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ */
+static void rcu_eqs_exit(bool user)
+{
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE(oldval < 0);
+ if (oldval & DYNTICK_TASK_NEST_MASK) {
+ rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
+ } else {
+ rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rcu_eqs_exit_common(rdtp, oldval, user);
+ }
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
+ * allow for the possibility of usermode upcalls messing up our count
+ * of interrupt nesting level during the busy period that is just
+ * now starting.
+ */
+void rcu_idle_exit(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_eqs_exit(false);
+ rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
+#ifdef CONFIG_RCU_USER_QS
+/**
+ * rcu_user_exit - inform RCU that we are exiting userspace.
+ *
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
+ */
+void rcu_user_exit(void)
+{
+ rcu_eqs_exit(1);
+}
+#endif /* CONFIG_RCU_USER_QS */
+
+/**
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to
+ * user mode! This code assumes that the idle loop never does upcalls to
+ * user mode. If your architecture does do upcalls from the idle loop (or
+ * does anything else that results in unbalanced calls to the irq_enter()
+ * and irq_exit() functions), RCU will give you what you deserve, good
+ * and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_enter(void)
+{
+ unsigned long flags;
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ local_irq_save(flags);
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting++;
+ WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ if (oldval)
+ trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
+ else
+ rcu_eqs_exit_common(rdtp, oldval, true);
+ rcu_sysidle_exit(rdtp, 1);
+ local_irq_restore(flags);
+}
+
+/**
+ * rcu_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is active.
+ */
+void rcu_nmi_enter(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ if (rdtp->dynticks_nmi_nesting == 0 &&
+ (atomic_read(&rdtp->dynticks) & 0x1))
+ return;
+ rdtp->dynticks_nmi_nesting++;
+ smp_mb__before_atomic(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+}
+
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is no longer active.
+ */
+void rcu_nmi_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ if (rdtp->dynticks_nmi_nesting == 0 ||
+ --rdtp->dynticks_nmi_nesting != 0)
+ return;
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+}
+
+/**
+ * __rcu_is_watching - are RCU read-side critical sections safe?
+ *
+ * Return true if RCU is watching the running CPU, which means that
+ * this CPU can safely enter RCU read-side critical sections. Unlike
+ * rcu_is_watching(), the caller of __rcu_is_watching() must have at
+ * least disabled preemption.
+ */
+bool notrace __rcu_is_watching(void)
+{
+ return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
+}
+
+/**
+ * rcu_is_watching - see if RCU thinks that the current CPU is idle
+ *
+ * If the current CPU is in its idle loop and is neither in an interrupt
+ * or NMI handler, return true.
+ */
+bool notrace rcu_is_watching(void)
+{
+ int ret;
+
+ preempt_disable();
+ ret = __rcu_is_watching();
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_is_watching);
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online? Disable preemption to avoid false positives
+ * that could otherwise happen due to the current CPU number being sampled,
+ * this task being preempted, its old CPU being taken offline, resuming
+ * on some other CPU, then determining that its old CPU is now offline.
+ * It is OK to use RCU on an offline processor during initial boot, hence
+ * the check for rcu_scheduler_fully_active. Note also that it is OK
+ * for a CPU coming online to use RCU for one jiffy prior to marking itself
+ * online in the cpu_online_mask. Similarly, it is OK for a CPU going
+ * offline to continue to use RCU for one jiffy after marking itself
+ * offline in the cpu_online_mask. This leniency is necessary given the
+ * non-atomic nature of the online and offline processing, for example,
+ * the fact that a CPU enters the scheduler after completing the CPU_DYING
+ * notifiers.
+ *
+ * This is also why RCU internally marks CPUs online during the
+ * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ *
+ * Disable checking if in an NMI handler because we cannot safely report
+ * errors from NMI handlers anyway.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ bool ret;
+
+ if (in_nmi())
+ return true;
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_sched_data);
+ rnp = rdp->mynode;
+ ret = (rdp->grpmask & rnp->qsmaskinit) ||
+ !rcu_scheduler_fully_active;
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+/**
+ * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
+ *
+ * If the current CPU is idle or running at a first-level (not nested)
+ * interrupt from idle, return true. The caller must have at least
+ * disabled preemption.
+ */
+static int rcu_is_cpu_rrupt_from_idle(void)
+{
+ return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state. Return 1 if this CPU
+ * is in dynticks idle mode, which is an extended quiescent state.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp,
+ bool *isidle, unsigned long *maxj)
+{
+ rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ rcu_sysidle_check_cpu(rdp, isidle, maxj);
+ if ((rdp->dynticks_snap & 0x1) == 0) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through an dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU, or by virtue of having been offline.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
+ bool *isidle, unsigned long *maxj)
+{
+ unsigned int curr;
+ int *rcrmp;
+ unsigned int snap;
+
+ curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned int)rdp->dynticks_snap;
+
+ /*
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq/NMI handlers, then we can safely pretend that the CPU
+ * already acknowledged the request to pass through a quiescent
+ * state. Either way, that CPU cannot possibly be in an RCU
+ * read-side critical section that started before the beginning
+ * of the current RCU grace period.
+ */
+ if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ rdp->dynticks_fqs++;
+ return 1;
+ }
+
+ /*
+ * Check for the CPU being offline, but only if the grace period
+ * is old enough. We don't need to worry about the CPU changing
+ * state: If we see it offline even once, it has been through a
+ * quiescent state.
+ *
+ * The reason for insisting that the grace period be at least
+ * one jiffy old is that CPUs that are not quite online and that
+ * have just gone offline can still execute RCU read-side critical
+ * sections.
+ */
+ if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
+ return 0; /* Grace period is not old enough. */
+ barrier();
+ if (cpu_is_offline(rdp->cpu)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
+ rdp->offline_fqs++;
+ return 1;
+ }
+
+ /*
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+ * even context-switching back and forth between a pair of
+ * in-kernel CPU-bound tasks cannot advance grace periods.
+ * So if the grace period is old enough, make the CPU pay attention.
+ * Note that the unsynchronized assignments to the per-CPU
+ * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * bits can be lost, but they will be set again on the next
+ * force-quiescent-state pass. So lost bit sets do not result
+ * in incorrect behavior, merely in a grace period lasting
+ * a few jiffies longer than it might otherwise. Because
+ * there are at most four threads involved, and because the
+ * updates are only once every few jiffies, the probability of
+ * lossage (and thus of slight grace-period extension) is
+ * quite low.
+ *
+ * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+ * is set too high, we override with half of the RCU CPU stall
+ * warning delay.
+ */
+ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + jiffies_till_sched_qs) ||
+ ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ ACCESS_ONCE(rdp->cond_resched_completed) =
+ ACCESS_ONCE(rdp->mynode->completed);
+ smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+ ACCESS_ONCE(*rcrmp) =
+ ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ /* Time to beat on that CPU again! */
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ }
+ }
+
+ return 0;
+}
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+ unsigned long j = jiffies;
+ unsigned long j1;
+
+ rsp->gp_start = j;
+ smp_wmb(); /* Record start time before stall time. */
+ j1 = rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+ rsp->jiffies_resched = j + j1 / 2;
+}
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs. This is a fallback
+ * for architectures that do not implement trigger_all_cpu_backtrace().
+ * The NMI-triggered stack traces are more accurate because they are
+ * printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (rnp->qsmask != 0) {
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu))
+ dump_cpu_task(rnp->grplo + cpu);
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+ int cpu;
+ long delta;
+ unsigned long flags;
+ int ndetected = 0;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ long totqlen = 0;
+
+ /* Only let one CPU complain about others per time interval. */
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s detected stalls on CPUs/tasks:",
+ rsp->name);
+ print_cpu_stall_info_begin();
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ ndetected += rcu_print_task_stall(rnp);
+ if (rnp->qsmask != 0) {
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu)) {
+ print_cpu_stall_info(rsp,
+ rnp->grplo + cpu);
+ ndetected++;
+ }
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+
+ /*
+ * Now rat on any tasks that got kicked up to the root rcu_node
+ * due to CPU offlining.
+ */
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ ndetected += rcu_print_task_stall(rnp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ print_cpu_stall_info_end();
+ for_each_possible_cpu(cpu)
+ totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
+ smp_processor_id(), (long)(jiffies - rsp->gp_start),
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
+ if (ndetected == 0)
+ pr_err("INFO: Stall ended before state dump start\n");
+ else if (!trigger_all_cpu_backtrace())
+ rcu_dump_cpu_stacks(rsp);
+
+ /* Complain about tasks blocking the grace period. */
+
+ rcu_print_detail_task_stall(rsp);
+
+ force_quiescent_state(rsp); /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ long totqlen = 0;
+
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s self-detected stall on CPU", rsp->name);
+ print_cpu_stall_info_begin();
+ print_cpu_stall_info(rsp, smp_processor_id());
+ print_cpu_stall_info_end();
+ for_each_possible_cpu(cpu)
+ totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
+ jiffies - rsp->gp_start,
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
+ if (!trigger_all_cpu_backtrace())
+ dump_stack();
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ /*
+ * Attempt to revive the RCU machinery by forcing a context switch.
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ resched_cpu(smp_processor_id());
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long completed;
+ unsigned long gpnum;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
+ return;
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
+ * then rsp->gp_start, and finally rsp->completed. These values
+ * are updated in the opposite order with memory barriers (or
+ * equivalent) during grace-period initialization and cleanup.
+ * Now, a false positive can occur if we get an new value of
+ * rsp->gp_start and a old value of rsp->jiffies_stall. But given
+ * the memory barriers, the only way that this can happen is if one
+ * grace period ends and another starts between these two fetches.
+ * Detect this by comparing rsp->completed with the previous fetch
+ * from rsp->gpnum.
+ *
+ * Given this check, comparisons of jiffies, rsp->jiffies_stall,
+ * and rsp->gp_start suffice to forestall false positives.
+ */
+ gpnum = ACCESS_ONCE(rsp->gpnum);
+ smp_rmb(); /* Pick up ->gpnum first... */
+ js = ACCESS_ONCE(rsp->jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = ACCESS_ONCE(rsp->gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->completed. */
+ completed = ACCESS_ONCE(rsp->completed);
+ if (ULONG_CMP_GE(completed, gpnum) ||
+ ULONG_CMP_LT(j, js) ||
+ ULONG_CMP_GE(gps, js))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ if (rcu_gp_in_progress(rsp) &&
+ (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(rsp);
+
+ } else if (rcu_gp_in_progress(rsp) &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
+
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(rsp);
+ }
+}
+
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+}
+
+/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ int i;
+
+ if (init_nocb_callback_list(rdp))
+ return;
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period. This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ /*
+ * If RCU is idle, we just wait for the next grace period.
+ * But we can only be sure that RCU is idle if we are looking
+ * at the root rcu_node structure -- otherwise, a new grace
+ * period might have started, but just not yet gotten around
+ * to initializing the current non-root rcu_node structure.
+ */
+ if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
+ return rnp->completed + 1;
+
+ /*
+ * Otherwise, wait for a possible partial grace period and
+ * then the subsequent full grace period.
+ */
+ return rnp->completed + 2;
+}
+
+/*
+ * Trace-event helper function for rcu_start_future_gp() and
+ * rcu_nocb_wait_gp().
+ */
+static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, const char *s)
+{
+ trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+ rnp->completed, c, rnp->level,
+ rnp->grplo, rnp->grphi, s);
+}
+
+/*
+ * Start some future grace period, as needed to handle newly arrived
+ * callbacks. The required future grace periods are recorded in each
+ * rcu_node structure's ->need_future_gp field. Returns true if there
+ * is reason to awaken the grace-period kthread.
+ *
+ * The caller must hold the specified rcu_node structure's ->lock.
+ */
+static bool __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long *c_out)
+{
+ unsigned long c;
+ int i;
+ bool ret = false;
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ /*
+ * Pick up grace-period number for new callbacks. If this
+ * grace period is already marked as needed, return to the caller.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
+ if (rnp->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
+ goto out;
+ }
+
+ /*
+ * If either this rcu_node structure or the root rcu_node structure
+ * believe that a grace period is in progress, then we must wait
+ * for the one following, which is in "c". Because our request
+ * will be noticed at the end of the current grace period, we don't
+ * need to explicitly start one.
+ */
+ if (rnp->gpnum != rnp->completed ||
+ ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ rnp->need_future_gp[c & 0x1]++;
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
+ goto out;
+ }
+
+ /*
+ * There might be no grace period in progress. If we don't already
+ * hold it, acquire the root rcu_node structure's lock in order to
+ * start one (if needed).
+ */
+ if (rnp != rnp_root) {
+ raw_spin_lock(&rnp_root->lock);
+ smp_mb__after_unlock_lock();
+ }
+
+ /*
+ * Get a new grace-period number. If there really is no grace
+ * period in progress, it will be smaller than the one we obtained
+ * earlier. Adjust callbacks as needed. Note that even no-CBs
+ * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp_root);
+ for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+ rdp->nxtcompleted[i] = c;
+
+ /*
+ * If the needed for the required grace period is already
+ * recorded, trace and leave.
+ */
+ if (rnp_root->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+ goto unlock_out;
+ }
+
+ /* Record the need for the future grace period. */
+ rnp_root->need_future_gp[c & 0x1]++;
+
+ /* If a grace period is not already in progress, start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
+ } else {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
+ ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ }
+unlock_out:
+ if (rnp != rnp_root)
+ raw_spin_unlock(&rnp_root->lock);
+out:
+ if (c_out != NULL)
+ *c_out = c;
+ return ret;
+}
+
+/*
+ * Clean up any old requests for the just-ended grace period. Also return
+ * whether any additional grace periods have been requested. Also invoke
+ * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
+ * waiting for this grace period to complete.
+ */
+static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ int c = rnp->completed;
+ int needmore;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+
+ rcu_nocb_gp_cleanup(rsp, rnp);
+ rnp->need_future_gp[c & 0x1] = 0;
+ needmore = rnp->need_future_gp[(c + 1) & 0x1];
+ trace_rcu_future_gp(rnp, rdp, c,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+ return needmore;
+}
+
+/*
+ * Awaken the grace-period kthread for the specified flavor of RCU.
+ * Don't do a self-awaken, and don't bother awakening when there is
+ * nothing for the grace-period kthread to do (as in several CPUs
+ * raced to awaken, and we lost), and finally don't try to awaken
+ * a kthread that has not yet been created.
+ */
+static void rcu_gp_kthread_wake(struct rcu_state *rsp)
+{
+ if (current == rsp->gp_kthread ||
+ !ACCESS_ONCE(rsp->gp_flags) ||
+ !rsp->gp_kthread)
+ return;
+ wake_up(&rsp->gp_wq);
+}
+
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned. Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure. This function is idempotent, so it does
+ * not hurt to call it repeatedly. Returns an flag saying that we should
+ * awaken the RCU grace-period kthread.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+ bool ret;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return false;
+
+ /*
+ * Starting from the sublist containing the callbacks most
+ * recently assigned a ->completed number and working down, find the
+ * first sublist that is not assignable to an upcoming grace period.
+ * Such a sublist has something in it (first two tests) and has
+ * a ->completed number assigned that will complete sooner than
+ * the ->completed number for newly arrived callbacks (last test).
+ *
+ * The key point is that any later sublist can be assigned the
+ * same ->completed number as the newly arrived callbacks, which
+ * means that the callbacks in any of these later sublist can be
+ * grouped into a single sublist, whether or not they have already
+ * been assigned a ->completed number.
+ */
+ c = rcu_cbs_completed(rsp, rnp);
+ for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+ !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+ break;
+
+ /*
+ * If there are no sublist for unassigned callbacks, leave.
+ * At the same time, advance "i" one sublist, so that "i" will
+ * index into the sublist where all the remaining callbacks should
+ * be grouped into.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return false;
+
+ /*
+ * Assign all subsequent callbacks' ->completed number to the next
+ * full grace period and group them all in the sublist initially
+ * indexed by "i".
+ */
+ for (; i <= RCU_NEXT_TAIL; i++) {
+ rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxtcompleted[i] = c;
+ }
+ /* Record any needed additional grace periods. */
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
+
+ /* Trace depending on how much we were able to accelerate. */
+ if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
+ else
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
+ return ret;
+}
+
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ int i, j;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return false;
+
+ /*
+ * Find all callbacks whose ->completed numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+ break;
+ rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+ }
+ /* Clean up any sublist tail pointers that were misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+
+ /* Copy down callbacks to fill in empty sublists. */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+ break;
+ rdp->nxttail[j] = rdp->nxttail[i];
+ rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+ }
+
+ /* Classify any remaining callbacks. */
+ return rcu_accelerate_cbs(rsp, rnp, rdp);
+}
+
+/*
+ * Update CPU-local rcu_data state to record the beginnings and ends of
+ * grace periods. The caller must hold the ->lock of the leaf rcu_node
+ * structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
+ */
+static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ bool ret;
+
+ /* Handle the ends of any preceding grace periods first. */
+ if (rdp->completed == rnp->completed) {
+
+ /* No grace period end, so just accelerate recent callbacks. */
+ ret = rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ } else {
+
+ /* Advance callbacks. */
+ ret = rcu_advance_cbs(rsp, rnp, rdp);
+
+ /* Remember that we saw this grace-period completion. */
+ rdp->completed = rnp->completed;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
+ }
+
+ if (rdp->gpnum != rnp->gpnum) {
+ /*
+ * If the current grace period is waiting for this CPU,
+ * set up to detect a quiescent state, otherwise don't
+ * go looking for one.
+ */
+ rdp->gpnum = rnp->gpnum;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
+ rdp->passed_quiesce = 0;
+ rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
+ zero_cpu_stall_ticks(rdp);
+ }
+ return ret;
+}
+
+static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ bool needwake;
+ struct rcu_node *rnp;
+
+ local_irq_save(flags);
+ rnp = rdp->mynode;
+ if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
+ rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+ local_irq_restore(flags);
+ return;
+ }
+ smp_mb__after_unlock_lock();
+ needwake = __note_gp_changes(rsp, rnp, rdp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+}
+
+/*
+ * Initialize a new grace period. Return 0 if no grace period required.
+ */
+static int rcu_gp_init(struct rcu_state *rsp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ rcu_bind_gp_kthread();
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ if (!ACCESS_ONCE(rsp->gp_flags)) {
+ /* Spurious wakeup, tell caller to go back to sleep. */
+ raw_spin_unlock_irq(&rnp->lock);
+ return 0;
+ }
+ ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+
+ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
+ /*
+ * Grace period already in progress, don't start another.
+ * Not supposed to be able to happen.
+ */
+ raw_spin_unlock_irq(&rnp->lock);
+ return 0;
+ }
+
+ /* Advance to a new grace period and initialize state. */
+ record_gp_stall_check_time(rsp);
+ /* Record GP times before starting GP, hence smp_store_release(). */
+ smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
+ trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
+ raw_spin_unlock_irq(&rnp->lock);
+
+ /* Exclude any concurrent CPU-hotplug operations. */
+ mutex_lock(&rsp->onoff_mutex);
+ smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
+
+ /*
+ * Set the quiescent-state-needed bits in all the rcu_node
+ * structures for all currently online CPUs in breadth-first order,
+ * starting from the root rcu_node structure, relying on the layout
+ * of the tree within the rsp->node[] array. Note that other CPUs
+ * will access only the leaves of the hierarchy, thus seeing that no
+ * grace period is in progress, at least until the corresponding
+ * leaf node has been initialized. In addition, we have excluded
+ * CPU-hotplug operations.
+ *
+ * The grace period cannot complete until the initialization
+ * process finishes, because this kthread handles both.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ rdp = this_cpu_ptr(rsp->rda);
+ rcu_preempt_check_blocked_tasks(rnp);
+ rnp->qsmask = rnp->qsmaskinit;
+ ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+ WARN_ON_ONCE(rnp->completed != rsp->completed);
+ ACCESS_ONCE(rnp->completed) = rsp->completed;
+ if (rnp == rdp->mynode)
+ (void)__note_gp_changes(rsp, rnp, rdp);
+ rcu_preempt_boost_start_gp(rnp);
+ trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+ rnp->level, rnp->grplo,
+ rnp->grphi, rnp->qsmask);
+ raw_spin_unlock_irq(&rnp->lock);
+#ifdef CONFIG_PROVE_RCU_DELAY
+ if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
+ system_state == SYSTEM_RUNNING)
+ udelay(200);
+#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
+ cond_resched();
+ }
+
+ mutex_unlock(&rsp->onoff_mutex);
+ return 1;
+}
+
+/*
+ * Do one round of quiescent-state forcing.
+ */
+static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+{
+ int fqs_state = fqs_state_in;
+ bool isidle = false;
+ unsigned long maxj;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ rsp->n_force_qs++;
+ if (fqs_state == RCU_SAVE_DYNTICK) {
+ /* Collect dyntick-idle snapshots. */
+ if (is_sysidle_rcu_state(rsp)) {
+ isidle = 1;
+ maxj = jiffies - ULONG_MAX / 4;
+ }
+ force_qs_rnp(rsp, dyntick_save_progress_counter,
+ &isidle, &maxj);
+ rcu_sysidle_report_gp(rsp, isidle, maxj);
+ fqs_state = RCU_FORCE_QS;
+ } else {
+ /* Handle dyntick-idle and offline CPUs. */
+ isidle = 0;
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+ }
+ /* Clear flag to prevent immediate re-entry. */
+ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
+ raw_spin_unlock_irq(&rnp->lock);
+ }
+ return fqs_state;
+}
+
+/*
+ * Clean up after the old grace period.
+ */
+static void rcu_gp_cleanup(struct rcu_state *rsp)
+{
+ unsigned long gp_duration;
+ bool needgp = false;
+ int nocb = 0;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ gp_duration = jiffies - rsp->gp_start;
+ if (gp_duration > rsp->gp_max)
+ rsp->gp_max = gp_duration;
+
+ /*
+ * We know the grace period is complete, but to everyone else
+ * it appears to still be ongoing. But it is also the case
+ * that to everyone else it looks like there is nothing that
+ * they can do to advance the grace period. It is therefore
+ * safe for us to drop the lock in order to mark the grace
+ * period as completed in all of the rcu_node structures.
+ */
+ raw_spin_unlock_irq(&rnp->lock);
+
+ /*
+ * Propagate new ->completed value to rcu_node structures so
+ * that other CPUs don't have to wait until the start of the next
+ * grace period to process their callbacks. This also avoids
+ * some nasty RCU grace-period initialization races by forcing
+ * the end of the current grace period to be completely recorded in
+ * all of the rcu_node structures before the beginning of the next
+ * grace period is recorded in any of the rcu_node structures.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ rdp = this_cpu_ptr(rsp->rda);
+ if (rnp == rdp->mynode)
+ needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
+ /* smp_mb() provided by prior unlock-lock pair. */
+ nocb += rcu_future_gp_cleanup(rsp, rnp);
+ raw_spin_unlock_irq(&rnp->lock);
+ cond_resched();
+ }
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+ rcu_nocb_gp_set(rnp, nocb);
+
+ /* Declare grace period done. */
+ ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+ trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
+ rsp->fqs_state = RCU_GP_IDLE;
+ rdp = this_cpu_ptr(rsp->rda);
+ /* Advance CBs to reduce false positives below. */
+ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
+ if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("newreq"));
+ }
+ raw_spin_unlock_irq(&rnp->lock);
+}
+
+/*
+ * Body of kthread that handles grace periods.
+ */
+static int __noreturn rcu_gp_kthread(void *arg)
+{
+ int fqs_state;
+ int gf;
+ unsigned long j;
+ int ret;
+ struct rcu_state *rsp = arg;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ for (;;) {
+
+ /* Handle grace-period start. */
+ for (;;) {
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("reqwait"));
+ rsp->gp_state = RCU_GP_WAIT_GPS;
+ wait_event_interruptible(rsp->gp_wq,
+ ACCESS_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
+ /* Locking provides needed memory barrier. */
+ if (rcu_gp_init(rsp))
+ break;
+ cond_resched();
+ flush_signals(current);
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("reqwaitsig"));
+ }
+
+ /* Handle quiescent-state forcing. */
+ fqs_state = RCU_SAVE_DYNTICK;
+ j = jiffies_till_first_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_first_fqs = HZ;
+ }
+ ret = 0;
+ for (;;) {
+ if (!ret)
+ rsp->jiffies_force_qs = jiffies + j;
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqswait"));
+ rsp->gp_state = RCU_GP_WAIT_FQS;
+ ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ ((gf = ACCESS_ONCE(rsp->gp_flags)) &
+ RCU_GP_FLAG_FQS) ||
+ (!ACCESS_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp)),
+ j);
+ /* Locking provides needed memory barriers. */
+ /* If grace period done, leave loop. */
+ if (!ACCESS_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp))
+ break;
+ /* If time for quiescent-state forcing, do it. */
+ if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
+ (gf & RCU_GP_FLAG_FQS)) {
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqsstart"));
+ fqs_state = rcu_gp_fqs(rsp, fqs_state);
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqsend"));
+ cond_resched();
+ } else {
+ /* Deal with stray signal. */
+ cond_resched();
+ flush_signals(current);
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqswaitsig"));
+ }
+ j = jiffies_till_next_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_next_fqs = HZ;
+ } else if (j < 1) {
+ j = 1;
+ jiffies_till_next_fqs = 1;
+ }
+ }
+
+ /* Handle grace-period end. */
+ rcu_gp_cleanup(rsp);
+ }
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period. The caller must hold
+ * the root node's ->lock and hard irqs must be disabled.
+ *
+ * Note that it is legal for a dying CPU (which is marked as offline) to
+ * invoke this function. This can happen when the dying CPU reports its
+ * quiescent state.
+ *
+ * Returns true if the grace-period kthread must be awakened.
+ */
+static bool
+rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
+ /*
+ * Either we have not yet spawned the grace-period
+ * task, this CPU does not need another grace period,
+ * or a grace period is already in progress.
+ * Either way, don't start a new grace period.
+ */
+ return false;
+ }
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+ TPS("newreq"));
+
+ /*
+ * We can't do wakeups while holding the rnp->lock, as that
+ * could cause possible deadlocks with the rq->lock. Defer
+ * the wakeup to our caller.
+ */
+ return true;
+}
+
+/*
+ * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
+ * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
+ * is invoked indirectly from rcu_advance_cbs(), which would result in
+ * endless recursion -- or would do so if it wasn't for the self-deadlock
+ * that is encountered beforehand.
+ *
+ * Returns true if the grace-period kthread needs to be awakened.
+ */
+static bool rcu_start_gp(struct rcu_state *rsp)
+{
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ bool ret = false;
+
+ /*
+ * If there is no grace period in progress right now, any
+ * callbacks we have up to this point will be satisfied by the
+ * next grace period. Also, advancing the callbacks reduces the
+ * probability of false positives from cpu_needs_another_gp()
+ * resulting in pointless grace periods. So, advance callbacks
+ * then start the grace period!
+ */
+ ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
+ ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
+ return ret;
+}
+
+/*
+ * Report a full set of quiescent states to the specified rcu_state
+ * data structure. This involves cleaning up after the prior grace
+ * period and letting rcu_start_gp() start up the next grace period
+ * if one is needed. Note that the caller must hold rnp->lock, which
+ * is released before return.
+ */
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
+ __releases(rcu_get_root(rsp)->lock)
+{
+ WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
+}
+
+/*
+ * Similar to rcu_report_qs_rdp(), for which it is a helper function.
+ * Allows quiescent states for a group of CPUs to be reported at one go
+ * to the specified rcu_node structure, though all the CPUs in the group
+ * must be represented by the same rcu_node structure (which need not be
+ * a leaf rcu_node structure, though it often will be). That structure's
+ * lock must be held upon entry, and it is released before return.
+ */
+static void
+rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ struct rcu_node *rnp_c;
+
+ /* Walk up the rcu_node hierarchy. */
+ for (;;) {
+ if (!(rnp->qsmask & mask)) {
+
+ /* Our bit has already been cleared, so done. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ rnp->qsmask &= ~mask;
+ trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+ mask, rnp->qsmask, rnp->level,
+ rnp->grplo, rnp->grphi,
+ !!rnp->gp_tasks);
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+
+ /* Other bits still set at this level, so done. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ mask = rnp->grpmask;
+ if (rnp->parent == NULL) {
+
+ /* No more levels. Exit loop holding root lock. */
+
+ break;
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rnp_c = rnp;
+ rnp = rnp->parent;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rnp_c->qsmask);
+ }
+
+ /*
+ * Get here if we are the last CPU to pass through a quiescent
+ * state for this grace period. Invoke rcu_report_qs_rsp()
+ * to clean up and start the next grace period if one is needed.
+ */
+ rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU to that CPU's rcu_data
+ * structure. This must be either called from the specified CPU, or
+ * called when the specified CPU is known to be offline (and when it is
+ * also known that no other CPU is concurrently trying to help the offline
+ * CPU). The lastcomp argument is used to make sure we are still in the
+ * grace period of interest. We don't want to end the current grace period
+ * based on quiescent states detected in an earlier grace period!
+ */
+static void
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ bool needwake;
+ struct rcu_node *rnp;
+
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
+ rnp->completed == rnp->gpnum) {
+
+ /*
+ * The grace period in which this quiescent state was
+ * recorded has ended, so don't report it upwards.
+ * We will instead need a new quiescent state that lies
+ * within the current grace period.
+ */
+ rdp->passed_quiesce = 0; /* need qs for new gp. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ mask = rdp->grpmask;
+ if ((rnp->qsmask & mask) == 0) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else {
+ rdp->qs_pending = 0;
+
+ /*
+ * This GP can't end until cpu checks in, so all of our
+ * callbacks can be processed during the next GP.
+ */
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ /* Check for grace-period ends and beginnings. */
+ note_gp_changes(rsp, rdp);
+
+ /*
+ * Does this CPU still need to do its part for current grace period?
+ * If no, return and let the other CPUs do their part as well.
+ */
+ if (!rdp->qs_pending)
+ return;
+
+ /*
+ * Was there a quiescent state since the beginning of the grace
+ * period? If no, then exit and wait for the next call.
+ */
+ if (!rdp->passed_quiesce)
+ return;
+
+ /*
+ * Tell RCU we are done (but rcu_report_qs_rdp() will be the
+ * judge of that).
+ */
+ rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Send the specified CPU's RCU callbacks to the orphanage. The
+ * specified CPU must be offline, and the caller must hold the
+ * ->orphan_lock.
+ */
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+ struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ /* No-CBs CPUs do not have orphanable callbacks. */
+ if (rcu_is_nocb_cpu(rdp->cpu))
+ return;
+
+ /*
+ * Orphan the callbacks. First adjust the counts. This is safe
+ * because _rcu_barrier() excludes CPU-hotplug operations, so it
+ * cannot be running now. Thus no memory barrier is required.
+ */
+ if (rdp->nxtlist != NULL) {
+ rsp->qlen_lazy += rdp->qlen_lazy;
+ rsp->qlen += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+ rdp->qlen_lazy = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
+ }
+
+ /*
+ * Next, move those callbacks still needing a grace period to
+ * the orphanage, where some other CPU will pick them up.
+ * Some of the callbacks might have gone partway through a grace
+ * period, but that is too bad. They get to start over because we
+ * cannot assume that grace periods are synchronized across CPUs.
+ * We don't bother updating the ->nxttail[] array yet, instead
+ * we just reset the whole thing later on.
+ */
+ if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+ *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+ rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
+ }
+
+ /*
+ * Then move the ready-to-invoke callbacks to the orphanage,
+ * where some other CPU will pick them up. These will not be
+ * required to pass though another grace period: They are done.
+ */
+ if (rdp->nxtlist != NULL) {
+ *rsp->orphan_donetail = rdp->nxtlist;
+ rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
+ }
+
+ /* Finally, initialize the rcu_data structure's list to empty. */
+ init_callback_list(rdp);
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage. The caller must hold the ->orphan_lock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
+{
+ int i;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+
+ /* No-CBs CPUs are handled specially. */
+ if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+ return;
+
+ /* Do the accounting first. */
+ rdp->qlen_lazy += rsp->qlen_lazy;
+ rdp->qlen += rsp->qlen;
+ rdp->n_cbs_adopted += rsp->qlen;
+ if (rsp->qlen_lazy != rsp->qlen)
+ rcu_idle_count_callbacks_posted();
+ rsp->qlen_lazy = 0;
+ rsp->qlen = 0;
+
+ /*
+ * We do not need a memory barrier here because the only way we
+ * can get here if there is an rcu_barrier() in flight is if
+ * we are the task doing the rcu_barrier().
+ */
+
+ /* First adopt the ready-to-invoke callbacks. */
+ if (rsp->orphan_donelist != NULL) {
+ *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+ for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = rsp->orphan_donetail;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+
+ /* And then adopt the callbacks that still need a grace period. */
+ if (rsp->orphan_nxtlist != NULL) {
+ *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+ rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+ RCU_TRACE(unsigned long mask);
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+ RCU_TRACE(mask = rdp->grpmask);
+ trace_rcu_grace_period(rsp->name,
+ rnp->gpnum + 1 - !!(rnp->qsmask & mask),
+ TPS("cpuofl"));
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them. There can only be one CPU hotplug operation at a time,
+ * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ */
+static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ int need_report = 0;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ /* Adjust any no-longer-needed kthreads. */
+ rcu_boost_kthread_setaffinity(rnp, -1);
+
+ /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
+
+ /* Exclude any attempts to start a new grace period. */
+ mutex_lock(&rsp->onoff_mutex);
+ raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
+
+ /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+ rcu_adopt_orphan_cbs(rsp, flags);
+
+ /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+ mask = rdp->grpmask; /* rnp->grplo is constant. */
+ do {
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rnp->qsmaskinit &= ~mask;
+ if (rnp->qsmaskinit != 0) {
+ if (rnp != rdp->mynode)
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ break;
+ }
+ if (rnp == rdp->mynode)
+ need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+ else
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ } while (rnp != NULL);
+
+ /*
+ * We still hold the leaf rcu_node structure lock here, and
+ * irqs are still disabled. The reason for this subterfuge is
+ * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
+ * held leads to deadlock.
+ */
+ raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
+ rnp = rdp->mynode;
+ if (need_report & RCU_OFL_TASKS_NORM_GP)
+ rcu_report_unblock_qs_rnp(rnp, flags);
+ else
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (need_report & RCU_OFL_TASKS_EXP_GP)
+ rcu_report_exp_rnp(rsp, rnp, true);
+ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+ cpu, rdp->qlen, rdp->nxtlist);
+ init_callback_list(rdp);
+ /* Disallow further callbacks on this CPU. */
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ mutex_unlock(&rsp->onoff_mutex);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+}
+
+static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period. Thottle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_head *next, *list, **tail;
+ long bl, count, count_lazy;
+ int i;
+
+ /* If no callbacks are ready, just return. */
+ if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
+ trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
+ trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ need_resched(), is_idle_task(current),
+ rcu_is_callbacks_kthread());
+ return;
+ }
+
+ /*
+ * Extract the list of ready callbacks, disabling to prevent
+ * races with call_rcu() from interrupt handlers.
+ */
+ local_irq_save(flags);
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+ bl = rdp->blimit;
+ trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
+ list = rdp->nxtlist;
+ rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
+ tail = rdp->nxttail[RCU_DONE_TAIL];
+ for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = &rdp->nxtlist;
+ local_irq_restore(flags);
+
+ /* Invoke callbacks. */
+ count = count_lazy = 0;
+ while (list) {
+ next = list->next;
+ prefetch(next);
+ debug_rcu_head_unqueue(list);
+ if (__rcu_reclaim(rsp->name, list))
+ count_lazy++;
+ list = next;
+ /* Stop only if limit reached and CPU has something to do. */
+ if (++count >= bl &&
+ (need_resched() ||
+ (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
+ break;
+ }
+
+ local_irq_save(flags);
+ trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
+ is_idle_task(current),
+ rcu_is_callbacks_kthread());
+
+ /* Update count, and requeue any remaining callbacks. */
+ if (list != NULL) {
+ *tail = rdp->nxtlist;
+ rdp->nxtlist = list;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ if (&rdp->nxtlist == rdp->nxttail[i])
+ rdp->nxttail[i] = tail;
+ else
+ break;
+ }
+ smp_mb(); /* List handling before counting for rcu_barrier(). */
+ rdp->qlen_lazy -= count_lazy;
+ ACCESS_ONCE(rdp->qlen) -= count;
+ rdp->n_cbs_invoked += count;
+
+ /* Reinstate batch limit if we have worked down the excess. */
+ if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+ rdp->blimit = blimit;
+
+ /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+ if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
+
+ local_irq_restore(flags);
+
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_core();
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule RCU core processing.
+ *
+ * This function must be called from hardirq context. It is normally
+ * invoked from the scheduling-clock interrupt. If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+ trace_rcu_utilization(TPS("Start scheduler-tick"));
+ increment_cpu_stall_ticks();
+ if (user || rcu_is_cpu_rrupt_from_idle()) {
+
+ /*
+ * Get here if this CPU took its interrupt from user
+ * mode or from the idle loop, and if this is not a
+ * nested interrupt. In this case, the CPU is in
+ * a quiescent state, so note it.
+ *
+ * No memory barrier is required here because both
+ * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
+ * variables that other CPUs neither access nor modify,
+ * at least not while the corresponding CPU is online.
+ */
+
+ rcu_sched_qs(cpu);
+ rcu_bh_qs(cpu);
+
+ } else if (!in_softirq()) {
+
+ /*
+ * Get here if this CPU did not take its interrupt from
+ * softirq, in other words, if it is not interrupting
+ * a rcu_bh read-side critical section. This is an _bh
+ * critical section, so note it.
+ */
+
+ rcu_bh_qs(cpu);
+ }
+ rcu_preempt_check_callbacks(cpu);
+ if (rcu_pending(cpu))
+ invoke_rcu_core();
+ trace_rcu_utilization(TPS("End scheduler-tick"));
+}
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Also initiate boosting for any threads blocked on the root rcu_node.
+ *
+ * The caller must have suppressed start of new grace periods.
+ */
+static void force_qs_rnp(struct rcu_state *rsp,
+ int (*f)(struct rcu_data *rsp, bool *isidle,
+ unsigned long *maxj),
+ bool *isidle, unsigned long *maxj)
+{
+ unsigned long bit;
+ int cpu;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rsp, rnp) {
+ cond_resched();
+ mask = 0;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!rcu_gp_in_progress(rsp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rnp->qsmask == 0) {
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
+ continue;
+ }
+ cpu = rnp->grplo;
+ bit = 1;
+ for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+ if ((rnp->qsmask & bit) != 0) {
+ if ((rnp->qsmaskinit & bit) != 0)
+ *isidle = 0;
+ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+ mask |= bit;
+ }
+ }
+ if (mask != 0) {
+
+ /* rcu_report_qs_rnp() releases rnp->lock. */
+ rcu_report_qs_rnp(mask, rsp, rnp, flags);
+ continue;
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+ rnp = rcu_get_root(rsp);
+ if (rnp->qsmask == 0) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+ }
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ bool ret;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_old = NULL;
+
+ /* Funnel through hierarchy to reduce memory contention. */
+ rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ for (; rnp != NULL; rnp = rnp->parent) {
+ ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+ !raw_spin_trylock(&rnp->fqslock);
+ if (rnp_old != NULL)
+ raw_spin_unlock(&rnp_old->fqslock);
+ if (ret) {
+ ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ return;
+ }
+ rnp_old = rnp;
+ }
+ /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
+
+ /* Reached the root of the rcu_node tree, acquire lock. */
+ raw_spin_lock_irqsave(&rnp_old->lock, flags);
+ smp_mb__after_unlock_lock();
+ raw_spin_unlock(&rnp_old->fqslock);
+ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ return; /* Someone beat us to it. */
+ }
+ ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
+ raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
+}
+
+/*
+ * This does the RCU core processing work for the specified rcu_state
+ * and rcu_data structures. This may be called only from the CPU to
+ * whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ bool needwake;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+
+ WARN_ON_ONCE(rdp->beenonline == 0);
+
+ /* Update RCU state based on any recent quiescent states. */
+ rcu_check_quiescent_state(rsp, rdp);
+
+ /* Does this CPU require a not-yet-started grace period? */
+ local_irq_save(flags);
+ if (cpu_needs_another_gp(rsp, rdp)) {
+ raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
+ needwake = rcu_start_gp(rsp);
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ } else {
+ local_irq_restore(flags);
+ }
+
+ /* If there are callbacks ready, invoke them. */
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_callbacks(rsp, rdp);
+
+ /* Do any needed deferred wakeups of rcuo kthreads. */
+ do_nocb_deferred_wakeup(rdp);
+}
+
+/*
+ * Do RCU core processing for the current CPU.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+ struct rcu_state *rsp;
+
+ if (cpu_is_offline(smp_processor_id()))
+ return;
+ trace_rcu_utilization(TPS("Start RCU core"));
+ for_each_rcu_flavor(rsp)
+ __rcu_process_callbacks(rsp);
+ trace_rcu_utilization(TPS("End RCU core"));
+}
+
+/*
+ * Schedule RCU callback invocation. If the specified type of RCU
+ * does not support RCU priority boosting, just do a direct call,
+ * otherwise wake up the per-CPU kernel kthread. Note that because we
+ * are running on the current CPU with interrupts disabled, the
+ * rcu_cpu_kthread_task cannot disappear out from under us.
+ */
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+ return;
+ if (likely(!rsp->boost)) {
+ rcu_do_batch(rsp, rdp);
+ return;
+ }
+ invoke_rcu_callbacks_kthread();
+}
+
+static void invoke_rcu_core(void)
+{
+ if (cpu_online(smp_processor_id()))
+ raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+ struct rcu_head *head, unsigned long flags)
+{
+ bool needwake;
+
+ /*
+ * If called from an extended quiescent state, invoke the RCU
+ * core in order to force a re-evaluation of RCU's idleness.
+ */
+ if (!rcu_is_watching() && cpu_online(smp_processor_id()))
+ invoke_rcu_core();
+
+ /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+ if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+ return;
+
+ /*
+ * Force the grace period if too many callbacks or too long waiting.
+ * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * if some other CPU has recently done so. Also, don't bother
+ * invoking force_quiescent_state() if the newly enqueued callback
+ * is the only one waiting for a grace period to complete.
+ */
+ if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+
+ /* Are we ignoring a completed grace period? */
+ note_gp_changes(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock(&rnp_root->lock);
+ smp_mb__after_unlock_lock();
+ needwake = rcu_start_gp(rsp);
+ raw_spin_unlock(&rnp_root->lock);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
+ }
+}
+
+/*
+ * RCU callback function to leak a callback.
+ */
+static void rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
+ * Helper function for call_rcu() and friends. The cpu argument will
+ * normally be -1, indicating "currently running CPU". It may specify
+ * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
+ * is expected to specify a CPU.
+ */
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+ struct rcu_state *rsp, int cpu, bool lazy)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
+ if (debug_rcu_head_queue(head)) {
+ /* Probable double call_rcu(), so leak the callback. */
+ ACCESS_ONCE(head->func) = rcu_leak_callback;
+ WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
+ return;
+ }
+ head->func = func;
+ head->next = NULL;
+
+ /*
+ * Opportunistically note grace-period endings and beginnings.
+ * Note that we might see a beginning right after we see an
+ * end, but never vice versa, since this CPU has to pass through
+ * a quiescent state betweentimes.
+ */
+ local_irq_save(flags);
+ rdp = this_cpu_ptr(rsp->rda);
+
+ /* Add the callback to our list. */
+ if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
+ int offline;
+
+ if (cpu != -1)
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ offline = !__call_rcu_nocb(rdp, head, lazy, flags);
+ WARN_ON_ONCE(offline);
+ /* _call_rcu() is illegal on offline CPU; leak the callback. */
+ local_irq_restore(flags);
+ return;
+ }
+ ACCESS_ONCE(rdp->qlen)++;
+ if (lazy)
+ rdp->qlen_lazy++;
+ else
+ rcu_idle_count_callbacks_posted();
+ smp_mb(); /* Count before adding callback for rcu_barrier(). */
+ *rdp->nxttail[RCU_NEXT_TAIL] = head;
+ rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+ if (__is_kfree_rcu_offset((unsigned long)func))
+ trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
+ rdp->qlen_lazy, rdp->qlen);
+ else
+ trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
+
+ /* Go handle any RCU core processing required. */
+ __call_rcu_core(rsp, rdp, head, flags);
+ local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU-sched callback for invocation after a grace period.
+ */
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_sched_state, -1, 0);
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_bh_state, -1, 0);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Queue an RCU callback for lazy invocation after a grace period.
+ * This will likely be later named something like "call_rcu_lazy()",
+ * but this change will require some way of tagging the lazy RCU
+ * callbacks in the list of pending callbacks. Until then, this
+ * function may only be called from __kfree_rcu().
+ */
+void kfree_call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, rcu_state_p, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+ int ret;
+
+ might_sleep(); /* Check for RCU read-side critical section. */
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-sched
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-sched read-side critical sections have completed. These read-side
+ * critical sections are delimited by rcu_read_lock_sched() and
+ * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
+ * local_irq_disable(), and so on may be used in place of
+ * rcu_read_lock_sched().
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * non-threaded hardware-interrupt handlers, in progress on entry will
+ * have completed before this primitive returns. However, this does not
+ * guarantee that softirq handlers will have completed, since in some
+ * kernels, these handlers can run in process context, and can block.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_sched() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since the
+ * end of its last RCU-sched read-side critical section whose beginning
+ * preceded the call to synchronize_sched(). In addition, each CPU having
+ * an RCU read-side critical section that extends beyond the return from
+ * synchronize_sched() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_sched() and before the beginning of
+ * that RCU read-side critical section. Note that these guarantees include
+ * CPUs that are offline, idle, or executing in user mode, as well as CPUs
+ * that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_sched(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * This primitive provides the guarantees made by the (now removed)
+ * synchronize_kernel() API. In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ * In "classic RCU", these two guarantees happen to be one and
+ * the same, but can differ in realtime RCU implementations.
+ */
+void synchronize_sched(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU-sched read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_expedited)
+ synchronize_sched_expedited();
+ else
+ wait_rcu_gp(call_rcu_sched);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ *
+ * See the description of synchronize_sched() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu_bh(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_expedited)
+ synchronize_rcu_bh_expedited();
+ else
+ wait_rcu_gp(call_rcu_bh);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
+/**
+ * get_state_synchronize_rcu - Snapshot current RCU state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_rcu()
+ * and cond_synchronize_rcu().
+ */
+ return smp_load_acquire(&rcu_state_p->gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call to
+ * get_state_synchronize_rcu(), just return. Otherwise, invoke
+ * synchronize_rcu() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_rcu(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
+ */
+ newstate = smp_load_acquire(&rcu_state_p->completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
+}
+
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code. In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * Note that it is illegal to call this function while holding any lock
+ * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
+ * to call this function from a CPU-hotplug notifier. Failing to observe
+ * these restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+ long firstsnap, s, snap;
+ int trycount = 0;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ /*
+ * If we are in danger of counter wrap, just do synchronize_sched().
+ * By allowing sync_sched_expedited_started to advance no more than
+ * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
+ * that more than 3.5 billion CPUs would be required to force a
+ * counter wrap on a 32-bit system. Quite a few more CPUs would of
+ * course be required on a 64-bit system.
+ */
+ if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
+ (ulong)atomic_long_read(&rsp->expedited_done) +
+ ULONG_MAX / 8)) {
+ synchronize_sched();
+ atomic_long_inc(&rsp->expedited_wrap);
+ return;
+ }
+
+ /*
+ * Take a ticket. Note that atomic_inc_return() implies a
+ * full memory barrier.
+ */
+ snap = atomic_long_inc_return(&rsp->expedited_start);
+ firstsnap = snap;
+ get_online_cpus();
+ WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+
+ /*
+ * Each pass through the following loop attempts to force a
+ * context switch on each CPU.
+ */
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
+ put_online_cpus();
+ atomic_long_inc(&rsp->expedited_tryfail);
+
+ /* Check to see if someone else did our work for us. */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ return;
+ }
+
+ /* No joy, try again later. Or just synchronize_sched(). */
+ if (trycount++ < 10) {
+ udelay(trycount * num_online_cpus());
+ } else {
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ return;
+ }
+
+ /* Recheck to see if someone else did our work for us. */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone2);
+ return;
+ }
+
+ /*
+ * Refetching sync_sched_expedited_started allows later
+ * callers to piggyback on our grace period. We retry
+ * after they started, so our grace period works for them,
+ * and they started after our first try, so their grace
+ * period works for us.
+ */
+ get_online_cpus();
+ snap = atomic_long_read(&rsp->expedited_start);
+ smp_mb(); /* ensure read is before try_stop_cpus(). */
+ }
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
+
+ /*
+ * Everyone up to our most recent fetch is covered by our grace
+ * period. Update the counter, but only if our work is still
+ * relevant -- which it won't be if someone who started later
+ * than we did already did their update.
+ */
+ do {
+ atomic_long_inc(&rsp->expedited_done_tries);
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_done_lost);
+ break;
+ }
+ } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
+ atomic_long_inc(&rsp->expedited_done_exit);
+
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first. However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ rdp->n_rcu_pending++;
+
+ /* Check for CPU stalls, if enabled. */
+ check_cpu_stall(rsp, rdp);
+
+ /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
+ if (rcu_nohz_full_cpu(rsp))
+ return 0;
+
+ /* Is the RCU core waiting for a quiescent state from this CPU? */
+ if (rcu_scheduler_fully_active &&
+ rdp->qs_pending && !rdp->passed_quiesce) {
+ rdp->n_rp_qs_pending++;
+ } else if (rdp->qs_pending && rdp->passed_quiesce) {
+ rdp->n_rp_report_qs++;
+ return 1;
+ }
+
+ /* Does this CPU have callbacks ready to invoke? */
+ if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+ rdp->n_rp_cb_ready++;
+ return 1;
+ }
+
+ /* Has RCU gone idle with this CPU needing another grace period? */
+ if (cpu_needs_another_gp(rsp, rdp)) {
+ rdp->n_rp_cpu_needs_gp++;
+ return 1;
+ }
+
+ /* Has another RCU grace period completed? */
+ if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+ rdp->n_rp_gp_completed++;
+ return 1;
+ }
+
+ /* Has a new RCU grace period started? */
+ if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+ rdp->n_rp_gp_started++;
+ return 1;
+ }
+
+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp)) {
+ rdp->n_rp_nocb_defer_wakeup++;
+ return 1;
+ }
+
+ /* nothing to do */
+ rdp->n_rp_need_nothing++;
+ return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so. This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+static int rcu_pending(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ return 1;
+ return 0;
+}
+
+/*
+ * Return true if the specified CPU has any callback. If all_lazy is
+ * non-NULL, store an indication of whether all callbacks are lazy.
+ * (If there are no callbacks, all of them are deemed to be lazy.)
+ */
+static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+{
+ bool al = true;
+ bool hc = false;
+ struct rcu_data *rdp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (!rdp->nxtlist)
+ continue;
+ hc = true;
+ if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
+ al = false;
+ break;
+ }
+ }
+ if (all_lazy)
+ *all_lazy = al;
+ return hc;
+}
+
+/*
+ * Helper function for _rcu_barrier() tracing. If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
+ int cpu, unsigned long done)
+{
+ trace_rcu_barrier(rsp->name, s, cpu,
+ atomic_read(&rsp->barrier_cpu_count), done);
+}
+
+/*
+ * RCU callback function for _rcu_barrier(). If we are last, wake
+ * up the task executing _rcu_barrier().
+ */
+static void rcu_barrier_callback(struct rcu_head *rhp)
+{
+ struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
+ struct rcu_state *rsp = rdp->rsp;
+
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ complete(&rsp->barrier_completion);
+ } else {
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ }
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_func(void *type)
+{
+ struct rcu_state *rsp = type;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+}
+
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
+ */
+static void _rcu_barrier(struct rcu_state *rsp)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+ unsigned long snap_done;
+
+ _rcu_barrier_trace(rsp, "Begin", -1, snap);
+
+ /* Take mutex to serialize concurrent rcu_barrier() requests. */
+ mutex_lock(&rsp->barrier_mutex);
+
+ /*
+ * Ensure that all prior references, including to ->n_barrier_done,
+ * are ordered before the _rcu_barrier() machinery.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /*
+ * Recheck ->n_barrier_done to see if others did our work for us.
+ * This means checking ->n_barrier_done for an even-to-odd-to-even
+ * transition. The "if" expression below therefore rounds the old
+ * value up to the next even number and adds two before comparing.
+ */
+ snap_done = rsp->n_barrier_done;
+ _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+
+ /*
+ * If the value in snap is odd, we needed to wait for the current
+ * rcu_barrier() to complete, then wait for the next one, in other
+ * words, we need the value of snap_done to be three larger than
+ * the value of snap. On the other hand, if the value in snap is
+ * even, we only had to wait for the next rcu_barrier() to complete,
+ * in other words, we need the value of snap_done to be only two
+ * greater than the value of snap. The "(snap + 3) & ~0x1" computes
+ * this for us (thank you, Linus!).
+ */
+ if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ smp_mb(); /* caller's subsequent code after above check. */
+ mutex_unlock(&rsp->barrier_mutex);
+ return;
+ }
+
+ /*
+ * Increment ->n_barrier_done to avoid duplicate work. Use
+ * ACCESS_ONCE() to prevent the compiler from speculating
+ * the increment to precede the early-exit check.
+ */
+ ACCESS_ONCE(rsp->n_barrier_done)++;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+ smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+
+ /*
+ * Initialize the count to one rather than to zero in order to
+ * avoid a too-soon return to zero in case of a short grace period
+ * (or preemption of this task). Exclude CPU-hotplug operations
+ * to ensure that no offline CPU has callbacks queued.
+ */
+ init_completion(&rsp->barrier_completion);
+ atomic_set(&rsp->barrier_cpu_count, 1);
+ get_online_cpus();
+
+ /*
+ * Force each CPU with callbacks to register a new callback.
+ * When that callback is invoked, we will know that all of the
+ * corresponding CPU's preceding callbacks have been invoked.
+ */
+ for_each_possible_cpu(cpu) {
+ if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
+ continue;
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rcu_is_nocb_cpu(cpu)) {
+ _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
+ rsp, cpu, 0);
+ } else if (ACCESS_ONCE(rdp->qlen)) {
+ _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ rsp->n_barrier_done);
+ smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
+ } else {
+ _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ rsp->n_barrier_done);
+ }
+ }
+ put_online_cpus();
+
+ /*
+ * Now that we have an rcu_barrier_callback() callback on each
+ * CPU, and thus each counted, remove the initial count.
+ */
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count))
+ complete(&rsp->barrier_completion);
+
+ /* Increment ->n_barrier_done to prevent duplicate work. */
+ smp_mb(); /* Keep increment after above mechanism. */
+ ACCESS_ONCE(rsp->n_barrier_done)++;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+ smp_mb(); /* Keep increment before caller's subsequent code. */
+
+ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
+ wait_for_completion(&rsp->barrier_completion);
+
+ /* Other rcu_barrier() invocations can now safely proceed. */
+ mutex_unlock(&rsp->barrier_mutex);
+}
+
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+ _rcu_barrier(&rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+ _rcu_barrier(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
+/*
+ * Do boot-time initialization of a CPU's per-CPU RCU data.
+ */
+static void __init
+rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Set up local state, ensuring consistent view of global state. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+ init_callback_list(rdp);
+ rdp->qlen_lazy = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
+ rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+ WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
+ WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+ rdp->cpu = cpu;
+ rdp->rsp = rsp;
+ rcu_boot_init_nocb_percpu_data(rdp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data. Note that only one online or
+ * offline event can be happening at a given time. Note also that we
+ * can accept some slop in the rsp->completed access due to the fact
+ * that this CPU cannot possibly have any RCU callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Exclude new grace periods. */
+ mutex_lock(&rsp->onoff_mutex);
+
+ /* Set up local state, ensuring consistent view of global state. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rdp->beenonline = 1; /* We have now been online. */
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->blimit = blimit;
+ init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rcu_sysidle_init_percpu_data(rdp->dynticks);
+ atomic_set(&rdp->dynticks->dynticks,
+ (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
+ /* Add CPU to rcu_node bitmasks. */
+ rnp = rdp->mynode;
+ mask = rdp->grpmask;
+ do {
+ /* Exclude any attempts to start a new GP on small systems. */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ rnp->qsmaskinit |= mask;
+ mask = rnp->grpmask;
+ if (rnp == rdp->mynode) {
+ /*
+ * If there is a grace period in progress, we will
+ * set up to wait for it next time we run the
+ * RCU core code.
+ */
+ rdp->gpnum = rnp->completed;
+ rdp->completed = rnp->completed;
+ rdp->passed_quiesce = 0;
+ rdp->qs_pending = 0;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
+ rnp = rnp->parent;
+ } while (rnp != NULL && !(rnp->qsmaskinit & mask));
+ local_irq_restore(flags);
+
+ mutex_unlock(&rsp->onoff_mutex);
+}
+
+static void rcu_prepare_cpu(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_init_percpu_data(cpu, rsp);
+}
+
+/*
+ * Handle CPU online/offline notification events.
+ */
+static int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_state *rsp;
+
+ trace_rcu_utilization(TPS("Start CPU hotplug"));
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ rcu_prepare_cpu(cpu);
+ rcu_prepare_kthreads(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ rcu_boost_kthread_setaffinity(rnp, -1);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcu_boost_kthread_setaffinity(rnp, cpu);
+ break;
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dead_cpu(cpu, rsp);
+ break;
+ default:
+ break;
+ }
+ trace_rcu_utilization(TPS("End CPU hotplug"));
+ return NOTIFY_OK;
+}
+
+static int rcu_pm_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_expedited = 1;
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ rcu_expedited = 0;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Spawn the kthread that handles this RCU flavor's grace periods.
+ */
+static int __init rcu_spawn_gp_kthread(void)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+ struct task_struct *t;
+
+ for_each_rcu_flavor(rsp) {
+ t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+ BUG_ON(IS_ERR(t));
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rsp->gp_kthread = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_spawn_nocb_kthreads(rsp);
+ }
+ return 0;
+}
+early_initcall(rcu_spawn_gp_kthread);
+
+/*
+ * This function is invoked towards the end of the scheduler's initialization
+ * process. Before this is called, the idle task might contain
+ * RCU read-side critical sections (during which time, this idle
+ * task is booting the system). After this function is called, the
+ * idle tasks are prohibited from containing RCU read-side critical
+ * sections. This function also enables RCU lockdep checking.
+ */
+void rcu_scheduler_starting(void)
+{
+ WARN_ON(num_online_cpus() != 1);
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+ int i;
+
+ rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+ int ccur;
+ int cprv;
+ int i;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = rsp->levelcnt[i];
+ rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp,
+ struct rcu_data __percpu *rda)
+{
+ static char *buf[] = { "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static char *fqs[] = { "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static u8 fl_mask = 0x1;
+ int cpustride = 1;
+ int i;
+ int j;
+ struct rcu_node *rnp;
+
+ BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+
+ /* Silence gcc 4.8 warning about array index out of range. */
+ if (rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls overflow");
+
+ /* Initialize the level-tracking arrays. */
+
+ for (i = 0; i < rcu_num_lvls; i++)
+ rsp->levelcnt[i] = num_rcu_lvl[i];
+ for (i = 1; i < rcu_num_lvls; i++)
+ rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+ rcu_init_levelspread(rsp);
+ rsp->flavor_mask = fl_mask;
+ fl_mask <<= 1;
+
+ /* Initialize the elements themselves, starting from the leaves. */
+
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ cpustride *= rsp->levelspread[i];
+ rnp = rsp->level[i];
+ for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+ raw_spin_lock_init(&rnp->lock);
+ lockdep_set_class_and_name(&rnp->lock,
+ &rcu_node_class[i], buf[i]);
+ raw_spin_lock_init(&rnp->fqslock);
+ lockdep_set_class_and_name(&rnp->fqslock,
+ &rcu_fqs_class[i], fqs[i]);
+ rnp->gpnum = rsp->gpnum;
+ rnp->completed = rsp->completed;
+ rnp->qsmask = 0;
+ rnp->qsmaskinit = 0;
+ rnp->grplo = j * cpustride;
+ rnp->grphi = (j + 1) * cpustride - 1;
+ if (rnp->grphi >= nr_cpu_ids)
+ rnp->grphi = nr_cpu_ids - 1;
+ if (i == 0) {
+ rnp->grpnum = 0;
+ rnp->grpmask = 0;
+ rnp->parent = NULL;
+ } else {
+ rnp->grpnum = j % rsp->levelspread[i - 1];
+ rnp->grpmask = 1UL << rnp->grpnum;
+ rnp->parent = rsp->level[i - 1] +
+ j / rsp->levelspread[i - 1];
+ }
+ rnp->level = i;
+ INIT_LIST_HEAD(&rnp->blkd_tasks);
+ rcu_init_one_nocb(rnp);
+ }
+ }
+
+ rsp->rda = rda;
+ init_waitqueue_head(&rsp->gp_wq);
+ rnp = rsp->level[rcu_num_lvls - 1];
+ for_each_possible_cpu(i) {
+ while (i > rnp->grphi)
+ rnp++;
+ per_cpu_ptr(rsp->rda, i)->mynode = rnp;
+ rcu_boot_init_percpu_data(i, rsp);
+ }
+ list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Compute the rcu_node tree geometry from kernel parameters. This cannot
+ * replace the definitions in tree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+ ulong d;
+ int i;
+ int j;
+ int n = nr_cpu_ids;
+ int rcu_capacity[MAX_RCU_LVLS + 1];
+
+ /*
+ * Initialize any unspecified boot parameters.
+ * The default values of jiffies_till_first_fqs and
+ * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
+ * value, which is a function of HZ, then adding one for each
+ * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
+ */
+ d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
+ if (jiffies_till_first_fqs == ULONG_MAX)
+ jiffies_till_first_fqs = d;
+ if (jiffies_till_next_fqs == ULONG_MAX)
+ jiffies_till_next_fqs = d;
+
+ /* If the compile-time values are accurate, just leave. */
+ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ nr_cpu_ids == NR_CPUS)
+ return;
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ rcu_fanout_leaf, nr_cpu_ids);
+
+ /*
+ * Compute number of nodes that can be handled an rcu_node tree
+ * with the given number of levels. Setting rcu_capacity[0] makes
+ * some of the arithmetic easier.
+ */
+ rcu_capacity[0] = 1;
+ rcu_capacity[1] = rcu_fanout_leaf;
+ for (i = 2; i <= MAX_RCU_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+ /*
+ * The boot-time rcu_fanout_leaf parameter is only permitted
+ * to increase the leaf-level fanout, not decrease it. Of course,
+ * the leaf-level fanout cannot exceed the number of bits in
+ * the rcu_node masks. Finally, the tree must be able to accommodate
+ * the configured number of CPUs. Complain and fall back to the
+ * compile-time values if these limits are exceeded.
+ */
+ if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+ n > rcu_capacity[MAX_RCU_LVLS]) {
+ WARN_ON(1);
+ return;
+ }
+
+ /* Calculate the number of rcu_nodes at each level of the tree. */
+ for (i = 1; i <= MAX_RCU_LVLS; i++)
+ if (n <= rcu_capacity[i]) {
+ for (j = 0; j <= i; j++)
+ num_rcu_lvl[j] =
+ DIV_ROUND_UP(n, rcu_capacity[i - j]);
+ rcu_num_lvls = i;
+ for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+ num_rcu_lvl[j] = 0;
+ break;
+ }
+
+ /* Calculate the total number of rcu_node structures. */
+ rcu_num_nodes = 0;
+ for (i = 0; i <= MAX_RCU_LVLS; i++)
+ rcu_num_nodes += num_rcu_lvl[i];
+ rcu_num_nodes -= n;
+}
+
+void __init rcu_init(void)
+{
+ int cpu;
+
+ rcu_bootup_announce();
+ rcu_init_geometry();
+ rcu_init_one(&rcu_bh_state, &rcu_bh_data);
+ rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ __rcu_init_preempt();
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+ /*
+ * We don't need protection against CPU-hotplug here because
+ * this is called early in boot, before either interrupts
+ * or the scheduler are operational.
+ */
+ cpu_notifier(rcu_cpu_notify, 0);
+ pm_notifier(rcu_pm_notify, 0);
+ for_each_online_cpu(cpu)
+ rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+}
+
+#include "tree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index fddff92d667..0f69a79c5b7 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -13,8 +13,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -27,47 +27,44 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/irq_work.h>
/*
- * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
+ * CONFIG_RCU_FANOUT_LEAF.
* In theory, it should be possible to add more levels straightforwardly.
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#if CONFIG_RCU_FANOUT > 16
-#define RCU_FANOUT_LEAF 16
-#else /* #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
-#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
-# define NUM_RCU_LVLS 1
+# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_2
-# define NUM_RCU_LVLS 2
+# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_3
-# define NUM_RCU_LVLS 3
+# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_4
-# define NUM_RCU_LVLS 4
+# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -80,6 +77,9 @@
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+extern int rcu_num_lvls;
+extern int rcu_num_nodes;
+
/*
* Dynticks per-CPU state.
*/
@@ -88,6 +88,26 @@ struct rcu_dynticks {
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ long long dynticks_idle_nesting;
+ /* irq/process nesting level from idle. */
+ atomic_t dynticks_idle; /* Even value for idle, else odd. */
+ /* "Idle" excludes userspace execution. */
+ unsigned long dynticks_idle_jiffies;
+ /* End of last non-NMI non-idle period. */
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ bool all_lazy; /* Are all CPU's CBs lazy? */
+ unsigned long nonlazy_posted;
+ /* # times non-lazy CBs posted to CPU. */
+ unsigned long nonlazy_posted_snap;
+ /* idle-period nonlazy_posted snapshot. */
+ unsigned long last_accelerate;
+ /* Last jiffy CBs were accelerated. */
+ unsigned long last_advance_all;
+ /* Last jiffy CBs were all advanced. */
+ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
/* RCU's kthread states for tracing. */
@@ -120,9 +140,6 @@ struct rcu_node {
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
- atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
- /* Since this has meaning only for leaf */
- /* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -182,12 +199,13 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
- struct task_struct *node_kthread_task;
- /* kthread that takes care of this rcu_node */
- /* structure, for example, awakening the */
- /* per-CPU kthreads as needed. */
- unsigned int node_kthread_status;
- /* State of node_kthread_task for tracing. */
+#ifdef CONFIG_RCU_NOCB_CPU
+ wait_queue_head_t nocb_gp_wq[2];
+ /* Place for rcu_nocb_kthread() to wait GP. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ int need_future_gp[2];
+ /* Counts of upcoming no-CB GP requests. */
+ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -196,7 +214,7 @@ struct rcu_node {
*/
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -205,7 +223,7 @@ struct rcu_node {
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+ (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -214,8 +232,8 @@ struct rcu_node {
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -231,14 +249,17 @@ struct rcu_data {
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
- unsigned long passed_quiesce_gpnum;
- /* gpnum at time of quiescent state. */
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
- bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+ unsigned long ticks_this_gp; /* The number of scheduling-clock */
+ /* ticks this CPU has handled */
+ /* during and after the last grace */
+ /* period it is aware of. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
/* 2) batch handling */
/*
@@ -265,10 +286,14 @@ struct rcu_data {
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
- long qlen; /* # of queued callbacks */
+ unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ /* grace periods for sublists. */
+ long qlen_lazy; /* # of lazy queued callbacks */
+ long qlen; /* # of queued callbacks, incl lazy */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
+ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
@@ -282,7 +307,9 @@ struct rcu_data {
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
unsigned long offline_fqs; /* Kicked due to being offline. */
- unsigned long resched_ipi; /* Sent a resched IPI. */
+ unsigned long cond_resched_completed;
+ /* Grace period that needs help */
+ /* from cond_resched(). */
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -292,9 +319,33 @@ struct rcu_data {
unsigned long n_rp_cpu_needs_gp;
unsigned long n_rp_gp_completed;
unsigned long n_rp_gp_started;
- unsigned long n_rp_need_fqs;
+ unsigned long n_rp_nocb_defer_wakeup;
unsigned long n_rp_need_nothing;
+ /* 6) _rcu_barrier() and OOM callbacks. */
+ struct rcu_head barrier_head;
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ struct rcu_head oom_head;
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+ /* 7) Callback offloading. */
+#ifdef CONFIG_RCU_NOCB_CPU
+ struct rcu_head *nocb_head; /* CBs waiting for kthread. */
+ struct rcu_head **nocb_tail;
+ atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
+ atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ int nocb_p_count; /* # CBs being invoked by kthread */
+ int nocb_p_count_lazy; /* (approximate). */
+ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
+ struct task_struct *nocb_kthread;
+ bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+ /* 8) RCU CPU stall data. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+ unsigned int softirq_snap; /* Snapshot of softirq activity. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
int cpu;
struct rcu_state *rsp;
};
@@ -306,23 +357,17 @@ struct rcu_data {
#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
-#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
+#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
+ /* For jiffies_till_first_fqs and */
+ /* and jiffies_till_next_fqs. */
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
-#endif
-
-#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
- RCU_STALL_DELAY_DELTA)
- /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
- /* for rsp->jiffies_stall */
-#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
- /* to take at least one */
- /* scheduling clock irq */
- /* before ratting on them. */
+#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
+ /* delay between bouts of */
+ /* quiescent-state forcing. */
+
+#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
+ /* at least one scheduling clock */
+ /* irq before ratting on them. */
#define rcu_wait(cond) \
do { \
@@ -347,32 +392,61 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
+ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
+ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
+ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
+ void (*func)(struct rcu_head *head));
/* The following fields are guarded by the root rcu_node's lock. */
u8 fqs_state ____cacheline_internodealigned_in_smp;
/* Force QS state. */
- u8 fqs_active; /* force_quiescent_state() */
- /* is running. */
- u8 fqs_need_gp; /* A CPU was prevented from */
- /* starting a new grace */
- /* period because */
- /* force_quiescent_state() */
- /* was running. */
u8 boost; /* Subject to priority boost. */
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
+ struct task_struct *gp_kthread; /* Task for grace periods. */
+ wait_queue_head_t gp_wq; /* Where GP task waits. */
+ short gp_flags; /* Commands for GP task. */
+ short gp_state; /* GP kthread sleep state. */
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. */
- raw_spinlock_t fqslock; /* Only one task forcing */
- /* quiescent states. */
+ raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
+ /* Protect following fields. */
+ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ /* need a grace period. */
+ struct rcu_head **orphan_nxttail; /* Tail of above. */
+ struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ /* are ready to invoke. */
+ struct rcu_head **orphan_donetail; /* Tail of above. */
+ long qlen_lazy; /* Number of lazy callbacks. */
+ long qlen; /* Total number of callbacks. */
+ /* End of fields guarded by orphan_lock. */
+
+ struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
+
+ struct mutex barrier_mutex; /* Guards barrier fields. */
+ atomic_t barrier_cpu_count; /* # CPUs waiting on. */
+ struct completion barrier_completion; /* Wake at barrier end. */
+ unsigned long n_barrier_done; /* ++ at start and end of */
+ /* _rcu_barrier(). */
+ /* End of fields guarded by barrier_mutex. */
+
+ atomic_long_t expedited_start; /* Starting ticket. */
+ atomic_long_t expedited_done; /* Done ticket. */
+ atomic_long_t expedited_wrap; /* # near-wrap incidents. */
+ atomic_long_t expedited_tryfail; /* # acquisition failures. */
+ atomic_long_t expedited_workdone1; /* # done by others #1. */
+ atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_normal; /* # fallbacks to normal. */
+ atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
+ atomic_long_t expedited_done_tries; /* # tries to update _done. */
+ atomic_long_t expedited_done_lost; /* # times beaten to _done. */
+ atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs; /* Number of calls to */
@@ -385,11 +459,30 @@ struct rcu_state {
/* but in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
+ unsigned long jiffies_resched; /* Time at which to resched */
+ /* a reluctant CPU. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
- char *name; /* Name of structure. */
+ const char *name; /* Name of structure. */
+ char abbr; /* Abbreviated name. */
+ struct list_head flavors; /* List of RCU flavors. */
};
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
+#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
+
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_WAIT_INIT 0 /* Initial state. */
+#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
+#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+
+extern struct list_head rcu_struct_flavors;
+
+/* Sequence through rcu_state structures for each RCU flavor. */
+#define for_each_rcu_flavor(rsp) \
+ list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
+
/* Return values for rcu_preempt_offline_tasks(). */
#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -428,29 +521,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
-static void rcu_preempt_stall_reset(void);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp);
-static void rcu_preempt_offline_cpu(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_preempt_check_callbacks(int cpu);
-static void rcu_preempt_process_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
-static int rcu_preempt_pending(int cpu);
-static int rcu_preempt_needs_cpu(int cpu);
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -458,18 +543,58 @@ static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm);
-static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
-static void __cpuinit rcu_prepare_kthreads(int cpu);
-static void rcu_prepare_for_idle_init(int cpu);
+static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
+static void rcu_idle_count_callbacks_posted(void);
+static void print_cpu_stall_info_begin(void);
+static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
+static void print_cpu_stall_info_end(void);
+static void zero_cpu_stall_ticks(struct rcu_data *rdp);
+static void increment_cpu_stall_ticks(void);
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool lazy, unsigned long flags);
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+ struct rcu_data *rdp,
+ unsigned long flags);
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
+static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
+static bool init_nocb_callback_list(struct rcu_data *rdp);
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj);
+static bool is_sysidle_rcu_state(struct rcu_state *rsp);
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj);
+static void rcu_bind_gp_kthread(void);
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
#endif /* #ifndef RCU_TREE_NONCORE */
+
+#ifdef CONFIG_RCU_TRACE
+#ifdef CONFIG_RCU_NOCB_CPU
+/* Sum up queue lengths for tracing. */
+static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
+{
+ *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
+}
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
+{
+ *ql = 0;
+ *qll = 0;
+}
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
new file mode 100644
index 00000000000..02ac0fb186b
--- /dev/null
+++ b/kernel/rcu/tree_plugin.h
@@ -0,0 +1,2854 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include "../time/tick-internal.h"
+
+#define RCU_KTHREAD_PRIO 1
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else
+#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
+#endif
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+static char __initdata nocb_buf[NR_CPUS * 5];
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * Check the RCU kernel configuration parameters and print informative
+ * messages about anything out of the ordinary. If you like #ifdef, you
+ * will love this function.
+ */
+static void __init rcu_bootup_announce_oddness(void)
+{
+#ifdef CONFIG_RCU_TRACE
+ pr_info("\tRCU debugfs-based tracing is enabled.\n");
+#endif
+#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
+ pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+ CONFIG_RCU_FANOUT);
+#endif
+#ifdef CONFIG_RCU_FANOUT_EXACT
+ pr_info("\tHierarchical RCU autobalancing is disabled.\n");
+#endif
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+#endif
+#ifdef CONFIG_PROVE_RCU
+ pr_info("\tRCU lockdep checking is enabled.\n");
+#endif
+#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
+ pr_info("\tRCU torture testing starts during boot.\n");
+#endif
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
+ pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
+#endif
+#if defined(CONFIG_RCU_CPU_STALL_INFO)
+ pr_info("\tAdditional per-CPU info printed with stalls.\n");
+#endif
+#if NUM_RCU_LVL_4 != 0
+ pr_info("\tFour-level hierarchy is enabled.\n");
+#endif
+ if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
+ if (nr_cpu_ids != NR_CPUS)
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+#ifdef CONFIG_RCU_NOCB_CPU
+#ifndef CONFIG_RCU_NOCB_CPU_NONE
+ if (!have_rcu_nocb_mask) {
+ zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
+ have_rcu_nocb_mask = true;
+ }
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+ pr_info("\tOffload RCU callbacks from CPU 0\n");
+ cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+ pr_info("\tOffload RCU callbacks from all CPUs\n");
+ cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
+ if (have_rcu_nocb_mask) {
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
+ pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+ }
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+}
+
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
+RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
+static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+
+static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+
+/*
+ * Tell them what RCU they are running.
+ */
+static void __init rcu_bootup_announce(void)
+{
+ pr_info("Preemptible hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
+}
+
+/*
+ * Return the number of RCU-preempt batches processed thus far
+ * for debug and statistics.
+ */
+long rcu_batches_completed_preempt(void)
+{
+ return rcu_preempt_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_batches_completed_preempt();
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU. Note
+ * that this just means that the task currently running on the CPU is
+ * not in a quiescent state. There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ */
+static void rcu_preempt_qs(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
+
+ if (rdp->passed_quiesce == 0)
+ trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
+ rdp->passed_quiesce = 1;
+ current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from. If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section. Therefore, the current grace period
+ * cannot be permitted to complete until the blkd_tasks list entries
+ * predating the current grace period drain, in other words, until
+ * rnp->gp_tasks becomes NULL.
+ *
+ * Caller must disable preemption.
+ */
+static void rcu_preempt_note_context_switch(int cpu)
+{
+ struct task_struct *t = current;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ if (t->rcu_read_lock_nesting > 0 &&
+ (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+ /* Possibly blocking in an RCU read-side critical section. */
+ rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+ t->rcu_blocked_node = rnp;
+
+ /*
+ * If this CPU has already checked in, then this task
+ * will hold up the next grace period rather than the
+ * current grace period. Queue the task accordingly.
+ * If the task is queued for the current grace period
+ * (i.e., this CPU has not yet passed through a quiescent
+ * state for the current grace period), then as long
+ * as that task remains queued, the current grace period
+ * cannot end. Note that there is some uncertainty as
+ * to exactly when the current grace period started.
+ * We take a conservative approach, which can result
+ * in unnecessarily waiting on tasks that started very
+ * slightly after the current grace period began. C'est
+ * la vie!!!
+ *
+ * But first, note that the current CPU must still be
+ * on line!
+ */
+ WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
+ WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
+ if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
+ list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
+ rnp->gp_tasks = &t->rcu_node_entry;
+#ifdef CONFIG_RCU_BOOST
+ if (rnp->boost_tasks != NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ } else {
+ list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
+ if (rnp->qsmask & rdp->grpmask)
+ rnp->gp_tasks = &t->rcu_node_entry;
+ }
+ trace_rcu_preempt_task(rdp->rsp->name,
+ t->pid,
+ (rnp->qsmask & rdp->grpmask)
+ ? rnp->gpnum
+ : rnp->gpnum + 1);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else if (t->rcu_read_lock_nesting < 0 &&
+ t->rcu_read_unlock_special) {
+
+ /*
+ * Complete exit from RCU read-side critical section on
+ * behalf of preempted instance of __rcu_read_unlock().
+ */
+ rcu_read_unlock_special(t);
+ }
+
+ /*
+ * Either we were not in an RCU read-side critical section to
+ * begin with, or we have now recorded that critical section
+ * globally. Either way, we can now note a quiescent state
+ * for this CPU. Again, if we were in an RCU read-side critical
+ * section, and if that critical section was blocking the current
+ * grace period, then the fact that the task has been enqueued
+ * means that we continue to block the current grace period.
+ */
+ local_irq_save(flags);
+ rcu_preempt_qs(cpu);
+ local_irq_restore(flags);
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period
+ * for the specified rcu_node structure. If the caller needs a reliable
+ * answer, it must hold the rcu_node's ->lock.
+ */
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+{
+ return rnp->gp_tasks != NULL;
+}
+
+/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period. The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long mask;
+ struct rcu_node *rnp_p;
+
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return; /* Still need more quiescent states! */
+ }
+
+ rnp_p = rnp->parent;
+ if (rnp_p == NULL) {
+ /*
+ * Either there is only one rcu_node in the tree,
+ * or tasks were kicked up to root rcu_node due to
+ * CPUs going offline.
+ */
+ rcu_report_qs_rsp(&rcu_preempt_state, flags);
+ return;
+ }
+
+ /* Report up the rest of the hierarchy. */
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
+}
+
+/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t,
+ struct rcu_node *rnp)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rnp->blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+void rcu_read_unlock_special(struct task_struct *t)
+{
+ int empty;
+ int empty_exp;
+ int empty_exp_now;
+ unsigned long flags;
+ struct list_head *np;
+#ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rbmp = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ struct rcu_node *rnp;
+ int special;
+
+ /* NMI handlers cannot block and cannot safely manipulate state. */
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * If RCU core is waiting for this CPU to exit critical section,
+ * let it know that we have done so.
+ */
+ special = t->rcu_read_unlock_special;
+ if (special & RCU_READ_UNLOCK_NEED_QS) {
+ rcu_preempt_qs(smp_processor_id());
+ if (!t->rcu_read_unlock_special) {
+ local_irq_restore(flags);
+ return;
+ }
+ }
+
+ /* Hardware IRQ handlers cannot block, complain if they get here. */
+ if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* Clean up if blocked during RCU read-side critical section. */
+ if (special & RCU_READ_UNLOCK_BLOCKED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+ /*
+ * Remove this task from the list it blocked on. The
+ * task can migrate while we acquire the lock, but at
+ * most one time. So at most two passes through loop.
+ */
+ for (;;) {
+ rnp = t->rcu_blocked_node;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ if (rnp == t->rcu_blocked_node)
+ break;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+ empty = !rcu_preempt_blocked_readers_cgp(rnp);
+ empty_exp = !rcu_preempted_readers_exp(rnp);
+ smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
+ np = rcu_next_node_entry(t, rnp);
+ list_del_init(&t->rcu_node_entry);
+ t->rcu_blocked_node = NULL;
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
+ rnp->gpnum, t->pid);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp->gp_tasks = np;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp->exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+ /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
+ if (t->rcu_boost_mutex) {
+ rbmp = t->rcu_boost_mutex;
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ /*
+ * If this was the last task on the current list, and if
+ * we aren't waiting on any CPUs, report the quiescent state.
+ * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
+ * so we must take a snapshot of the expedited state.
+ */
+ empty_exp_now = !rcu_preempted_readers_exp(rnp);
+ if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+ trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
+ rnp->gpnum,
+ 0, rnp->qsmask,
+ rnp->level,
+ rnp->grplo,
+ rnp->grphi,
+ !!rnp->gp_tasks);
+ rcu_report_unblock_qs_rnp(rnp, flags);
+ } else {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost if we were boosted. */
+ if (rbmp)
+ rt_mutex_unlock(rbmp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ /*
+ * If this was the last task on the expedited lists,
+ * then we need to report up the rcu_node hierarchy.
+ */
+ if (!empty_exp && empty_exp_now)
+ rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ sched_show_task(t);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period.
+ */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ rcu_print_detail_task_stall_rnp(rnp);
+ rcu_for_each_leaf_node(rsp, rnp)
+ rcu_print_detail_task_stall_rnp(rnp);
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
+
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
+
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+
+static void rcu_print_task_stall_begin(struct rcu_node *rnp)
+{
+ pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
+ rnp->level, rnp->grplo, rnp->grphi);
+}
+
+static void rcu_print_task_stall_end(void)
+{
+ pr_cont("\n");
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+static void rcu_print_task_stall_begin(struct rcu_node *rnp)
+{
+}
+
+static void rcu_print_task_stall_end(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return 0;
+ rcu_print_task_stall_begin(rnp);
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ rcu_print_task_stall_end();
+ return ndetected;
+}
+
+/*
+ * Check that the list of blocked tasks for the newly completed grace
+ * period is in fact empty. It is a serious bug to complete a grace
+ * period that still has RCU readers blocked! This function must be
+ * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
+ * must be held by the caller.
+ *
+ * Also, if there are blocked tasks on the list, they automatically
+ * block the newly created grace period, so set up ->gp_tasks accordingly.
+ */
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
+{
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ if (!list_empty(&rnp->blkd_tasks))
+ rnp->gp_tasks = rnp->blkd_tasks.next;
+ WARN_ON_ONCE(rnp->qsmask);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Handle tasklist migration for case in which all CPUs covered by the
+ * specified rcu_node have gone offline. Move them up to the root
+ * rcu_node. The reason for not just moving them to the immediate
+ * parent is to remove the need for rcu_read_unlock_special() to
+ * make more than two attempts to acquire the target rcu_node's lock.
+ * Returns true if there were tasks blocking the current RCU grace
+ * period.
+ *
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
+ * The caller must hold rnp->lock with irqs disabled.
+ */
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ struct list_head *lp;
+ struct list_head *lp_root;
+ int retval = 0;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+ struct task_struct *t;
+
+ if (rnp == rnp_root) {
+ WARN_ONCE(1, "Last CPU thought to be offlined?");
+ return 0; /* Shouldn't happen: at least one CPU online. */
+ }
+
+ /* If we are on an internal node, complain bitterly. */
+ WARN_ON_ONCE(rnp != rdp->mynode);
+
+ /*
+ * Move tasks up to root rcu_node. Don't try to get fancy for
+ * this corner-case operation -- just put this node's tasks
+ * at the head of the root node's list, and update the root node's
+ * ->gp_tasks and ->exp_tasks pointers to those of this node's,
+ * if non-NULL. This might result in waiting for more tasks than
+ * absolutely necessary, but this is a good performance/complexity
+ * tradeoff.
+ */
+ if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
+ retval |= RCU_OFL_TASKS_NORM_GP;
+ if (rcu_preempted_readers_exp(rnp))
+ retval |= RCU_OFL_TASKS_EXP_GP;
+ lp = &rnp->blkd_tasks;
+ lp_root = &rnp_root->blkd_tasks;
+ while (!list_empty(lp)) {
+ t = list_entry(lp->next, typeof(*t), rcu_node_entry);
+ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
+ smp_mb__after_unlock_lock();
+ list_del(&t->rcu_node_entry);
+ t->rcu_blocked_node = rnp_root;
+ list_add(&t->rcu_node_entry, lp_root);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp_root->gp_tasks = rnp->gp_tasks;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp_root->exp_tasks = rnp->exp_tasks;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp_root->boost_tasks = rnp->boost_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
+ }
+
+ rnp->gp_tasks = NULL;
+ rnp->exp_tasks = NULL;
+#ifdef CONFIG_RCU_BOOST
+ rnp->boost_tasks = NULL;
+ /*
+ * In case root is being boosted and leaf was not. Make sure
+ * that we boost the tasks blocking the current grace period
+ * in this case.
+ */
+ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
+ smp_mb__after_unlock_lock();
+ if (rnp_root->boost_tasks != NULL &&
+ rnp_root->boost_tasks != rnp_root->gp_tasks &&
+ rnp_root->boost_tasks != rnp_root->exp_tasks)
+ rnp_root->boost_tasks = rnp_root->gp_tasks;
+ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ return retval;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Check for a quiescent state from the current CPU. When a task blocks,
+ * the task is recorded in the corresponding CPU's rcu_node structure,
+ * which is checked elsewhere.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(int cpu)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting == 0) {
+ rcu_preempt_qs(cpu);
+ return;
+ }
+ if (t->rcu_read_lock_nesting > 0 &&
+ per_cpu(rcu_preempt_data, cpu).qs_pending)
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_preempt_do_callbacks(void)
+{
+ rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Queue a preemptible-RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_preempt_state, -1, 0);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
+ *
+ * See the description of synchronize_sched() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ if (!rcu_scheduler_active)
+ return;
+ if (rcu_expedited)
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(struct rcu_node *rnp)
+{
+ return rnp->exp_tasks != NULL;
+}
+
+/*
+ * return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period. Works only for preemptible
+ * RCU -- other RCU implementation use other means.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+ return !rcu_preempted_readers_exp(rnp) &&
+ ACCESS_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period. This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree. (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Most callers will set the "wake" flag, but the task initiating the
+ * expedited grace period need not wake itself.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake)
+{
+ unsigned long flags;
+ unsigned long mask;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ for (;;) {
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ break;
+ }
+ if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+ wake_up(&sync_rcu_preempt_exp_wq);
+ }
+ break;
+ }
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+ rnp = rnp->parent;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled */
+ smp_mb__after_unlock_lock();
+ rnp->expmask &= ~mask;
+ }
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure. If there are no such
+ * tasks, report it up the rcu_node hierarchy.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
+ * CPU hotplug operations.
+ */
+static void
+sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ unsigned long flags;
+ int must_wait = 0;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (list_empty(&rnp->blkd_tasks)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else {
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
+ must_wait = 1;
+ }
+ if (!must_wait)
+ rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
+}
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU-preempt grace period, but expedite it. The basic
+ * idea is to invoke synchronize_sched_expedited() to push all the tasks to
+ * the ->blkd_tasks lists and wait for this list to drain. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code.
+ * In fact, if you are using synchronize_rcu_expedited() in a loop,
+ * please restructure your code to batch your updates, and then Use a
+ * single synchronize_rcu() instead.
+ *
+ * Note that it is illegal to call this function while holding any lock
+ * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
+ * to call this function from a CPU-hotplug notifier. Failing to observe
+ * these restriction will result in deadlock.
+ */
+void synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = &rcu_preempt_state;
+ unsigned long snap;
+ int trycount = 0;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+ smp_mb(); /* Above access cannot bleed into critical section. */
+
+ /*
+ * Block CPU-hotplug operations. This means that any CPU-hotplug
+ * operation that finds an rcu_node structure with tasks in the
+ * process of being boosted will know that all tasks blocking
+ * this expedited grace period will already be in the process of
+ * being boosted. This simplifies the process of moving tasks
+ * from leaf to root rcu_node structures.
+ */
+ get_online_cpus();
+
+ /*
+ * Acquire lock, falling back to synchronize_rcu() if too many
+ * lock-acquisition failures. Of course, if someone does the
+ * expedited grace period for us, just leave.
+ */
+ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
+ if (ULONG_CMP_LT(snap,
+ ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ put_online_cpus();
+ goto mb_ret; /* Others did our work for us. */
+ }
+ if (trycount++ < 10) {
+ udelay(trycount * num_online_cpus());
+ } else {
+ put_online_cpus();
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+ }
+ if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ put_online_cpus();
+ goto unlock_mb_ret; /* Others did our work for us. */
+ }
+
+ /* force all RCU readers onto ->blkd_tasks lists. */
+ synchronize_sched_expedited();
+
+ /* Initialize ->expmask for all non-leaf rcu_node structures. */
+ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ rnp->expmask = rnp->qsmaskinit;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+
+ /* Snapshot current state of ->blkd_tasks lists. */
+ rcu_for_each_leaf_node(rsp, rnp)
+ sync_rcu_preempt_exp_init(rsp, rnp);
+ if (NUM_RCU_NODES > 1)
+ sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+
+ put_online_cpus();
+
+ /* Wait for snapshotted ->blkd_tasks lists to drain. */
+ rnp = rcu_get_root(rsp);
+ wait_event(sync_rcu_preempt_exp_wq,
+ sync_rcu_preempt_exp_done(rnp));
+
+ /* Clean up and exit. */
+ smp_mb(); /* ensure expedited GP seen before counter increment. */
+ ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
+unlock_mb_ret:
+ mutex_unlock(&sync_rcu_preempt_exp_mutex);
+mb_ret:
+ smp_mb(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ *
+ * Note that this primitive does not necessarily wait for an RCU grace period
+ * to complete. For example, if there are no RCU callbacks queued anywhere
+ * in the system, then rcu_barrier() is within its rights to return
+ * immediately, without waiting for anything, much less an RCU grace period.
+ */
+void rcu_barrier(void)
+{
+ _rcu_barrier(&rcu_preempt_state);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * Initialize preemptible RCU's state structures.
+ */
+static void __init __rcu_init_preempt(void)
+{
+ rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+}
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (likely(list_empty(&current->rcu_node_entry)))
+ return;
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+ __rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+static struct rcu_state *rcu_state_p = &rcu_sched_state;
+
+/*
+ * Tell them what RCU they are running.
+ */
+static void __init rcu_bootup_announce(void)
+{
+ pr_info("Hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
+}
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_batches_completed_sched();
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * CPUs being in quiescent states.
+ */
+static void rcu_preempt_note_context_switch(int cpu)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, there are never any preempted
+ * RCU readers.
+ */
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Because preemptible RCU does not exist, no quieting of tasks. */
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+/*
+ * Because there is no preemptible RCU, there can be no readers blocked,
+ * so there is no need to check for blocked tasks. So check only for
+ * bogus qsmask values.
+ */
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
+{
+ WARN_ON_ONCE(rnp->qsmask);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Because preemptible RCU does not exist, it never needs to migrate
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
+ */
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ return 0;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(int cpu)
+{
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptible RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+ synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Because preemptible RCU does not exist, there is never any need to
+ * report on tasks preempted in RCU read-side critical sections during
+ * expedited RCU grace periods.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake)
+{
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Because preemptible RCU does not exist, rcu_barrier() is just
+ * another name for rcu_barrier_sched().
+ */
+void rcu_barrier(void)
+{
+ rcu_barrier_sched();
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * Because preemptible RCU does not exist, it need not be initialized.
+ */
+static void __init __rcu_init_preempt(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, tasks cannot possibly exit
+ * while in preemptible RCU read-side critical sections.
+ */
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "../locking/rtmutex_common.h"
+
+#ifdef CONFIG_RCU_TRACE
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+ if (list_empty(&rnp->blkd_tasks))
+ rnp->n_balk_blkd_tasks++;
+ else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
+ rnp->n_balk_exp_gp_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
+ rnp->n_balk_boost_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
+ rnp->n_balk_notblocked++;
+ else if (rnp->gp_tasks != NULL &&
+ ULONG_CMP_LT(jiffies, rnp->boost_time))
+ rnp->n_balk_notyet++;
+ else
+ rnp->n_balk_nos++;
+}
+
+#else /* #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+ wake_up_process(t);
+}
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->exp_tasks
+ * or ->boost_tasks, advancing the pointer to the next task in the
+ * ->blkd_tasks list.
+ *
+ * Note that irqs must be enabled: boosting the task can block.
+ * Returns 1 if there are more tasks needing to be boosted.
+ */
+static int rcu_boost(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct task_struct *t;
+ struct list_head *tb;
+
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+ return 0; /* Nothing left to boost. */
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+
+ /*
+ * Recheck under the lock: all tasks in need of boosting
+ * might exit their RCU read-side critical sections on their own.
+ */
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return 0;
+ }
+
+ /*
+ * Preferentially boost tasks blocking expedited grace periods.
+ * This cannot starve the normal grace periods because a second
+ * expedited grace period must boost all blocked tasks, including
+ * those blocking the pre-existing normal grace period.
+ */
+ if (rnp->exp_tasks != NULL) {
+ tb = rnp->exp_tasks;
+ rnp->n_exp_boosts++;
+ } else {
+ tb = rnp->boost_tasks;
+ rnp->n_normal_boosts++;
+ }
+ rnp->n_tasks_boosted++;
+
+ /*
+ * We boost task t by manufacturing an rt_mutex that appears to
+ * be held by task t. We leave a pointer to that rt_mutex where
+ * task t can find it, and task t will release the mutex when it
+ * exits its outermost RCU read-side critical section. Then
+ * simply acquiring this artificial rt_mutex will boost task
+ * t's priority. (Thanks to tglx for suggesting this approach!)
+ *
+ * Note that task t must acquire rnp->lock to remove itself from
+ * the ->blkd_tasks list, which it will do from exit() if from
+ * nowhere else. We therefore are guaranteed that task t will
+ * stay around at least until we drop rnp->lock. Note that
+ * rnp->lock also resolves races between our priority boosting
+ * and task t's exiting its outermost RCU read-side critical
+ * section.
+ */
+ t = container_of(tb, struct task_struct, rcu_node_entry);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
+ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+
+ return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
+ ACCESS_ONCE(rnp->boost_tasks) != NULL;
+}
+
+/*
+ * Priority-boosting kthread. One per leaf rcu_node and one for the
+ * root rcu_node.
+ */
+static int rcu_boost_kthread(void *arg)
+{
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ int spincnt = 0;
+ int more2boost;
+
+ trace_rcu_utilization(TPS("Start boost kthread@init"));
+ for (;;) {
+ rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
+ rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+ trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
+ rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
+ more2boost = rcu_boost(rnp);
+ if (more2boost)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
+ spincnt = 0;
+ }
+ }
+ /* NOTREACHED */
+ trace_rcu_utilization(TPS("End boost kthread@notreached"));
+ return 0;
+}
+
+/*
+ * Check to see if it is time to start boosting RCU readers that are
+ * blocking the current grace period, and, if so, tell the per-rcu_node
+ * kthread to start boosting them. If there is an expedited grace
+ * period in progress, it is always time to boost.
+ *
+ * The caller must hold rnp->lock, which this function releases.
+ * The ->boost_kthread_task is immortal, so we don't need to worry
+ * about it going away.
+ */
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+{
+ struct task_struct *t;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ rnp->n_balk_exp_gp_tasks++;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rnp->exp_tasks != NULL ||
+ (rnp->gp_tasks != NULL &&
+ rnp->boost_tasks == NULL &&
+ rnp->qsmask == 0 &&
+ ULONG_CMP_GE(jiffies, rnp->boost_time))) {
+ if (rnp->exp_tasks == NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ t = rnp->boost_kthread_task;
+ if (t)
+ rcu_wake_cond(t, rnp->boost_kthread_status);
+ } else {
+ rcu_initiate_boost_trace(rnp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+/*
+ * Wake up the per-CPU kthread to invoke RCU callbacks.
+ */
+static void invoke_rcu_callbacks_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
+ current != __this_cpu_read(rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_cpu_kthread_status));
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+ return __this_cpu_read(rcu_cpu_kthread_task) == current;
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+ rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+}
+
+/*
+ * Create an RCU-boost kthread for the specified node if one does not
+ * already exist. We only create this kthread for preemptible RCU.
+ * Returns zero if all is well, a negated errno otherwise.
+ */
+static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ int rnp_index = rnp - &rsp->node[0];
+ unsigned long flags;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (&rcu_preempt_state != rsp)
+ return 0;
+
+ if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ return 0;
+
+ rsp->boost = 1;
+ if (rnp->boost_kthread_task != NULL)
+ return 0;
+ t = kthread_create(rcu_boost_kthread, (void *)rnp,
+ "rcub/%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ rnp->boost_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+ return 0;
+}
+
+static void rcu_kthread_do_work(void)
+{
+ rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
+ rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
+ rcu_preempt_do_callbacks();
+}
+
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
+ struct sched_param sp;
+
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
+ * RCU softirq used in flavors and configurations of RCU that do not
+ * support RCU priority boosting.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+ unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
+ int spincnt;
+
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ this_cpu_inc(rcu_cpu_kthread_loops);
+ local_irq_disable();
+ work = *workp;
+ *workp = 0;
+ local_irq_enable();
+ if (work)
+ rcu_kthread_do_work();
+ local_bh_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
+ }
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ */
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+ struct task_struct *t = rnp->boost_kthread_task;
+ unsigned long mask = rnp->qsmaskinit;
+ cpumask_var_t cm;
+ int cpu;
+
+ if (!t)
+ return;
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+ if ((mask & 0x1) && cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ if (cpumask_weight(cm) == 0) {
+ cpumask_setall(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
+ cpumask_clear_cpu(cpu, cm);
+ WARN_ON_ONCE(cpumask_weight(cm) == 0);
+ }
+ set_cpus_allowed_ptr(t, cm);
+ free_cpumask_var(cm);
+}
+
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn all kthreads -- called as soon as the scheduler is running.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+ struct rcu_node *rnp;
+ int cpu;
+
+ rcu_scheduler_fully_active = 1;
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_cpu_has_work, cpu) = 0;
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+ rnp = rcu_get_root(rcu_state_p);
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
+ if (NUM_RCU_NODES > 1) {
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
+ }
+ return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
+static void rcu_prepare_kthreads(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+ if (rcu_scheduler_fully_active)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+static void invoke_rcu_callbacks_kthread(void)
+{
+ WARN_ON_ONCE(1);
+}
+
+static bool rcu_is_callbacks_kthread(void)
+{
+ return false;
+}
+
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+}
+
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+}
+
+static int __init rcu_scheduler_really_started(void)
+{
+ rcu_scheduler_fully_active = 1;
+ return 0;
+}
+early_initcall(rcu_scheduler_really_started);
+
+static void rcu_prepare_kthreads(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#if !defined(CONFIG_RCU_FAST_NO_HZ)
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ *
+ * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
+ * any flavor of RCU.
+ */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+{
+ *delta_jiffies = ULONG_MAX;
+ return rcu_cpu_has_callbacks(cpu, NULL);
+}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
+ * after it.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+}
+
+/*
+ * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
+ * is nothing.
+ */
+static void rcu_prepare_for_idle(int cpu)
+{
+}
+
+/*
+ * Don't bother keeping a running count of the number of RCU callbacks
+ * posted because CONFIG_RCU_FAST_NO_HZ=n.
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+}
+
+#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
+
+/*
+ * This code is invoked when a CPU goes idle, at which point we want
+ * to have the CPU do everything required for RCU so that it can enter
+ * the energy-efficient dyntick-idle mode. This is handled by a
+ * state machine implemented by rcu_prepare_for_idle() below.
+ *
+ * The following three proprocessor symbols control this state machine:
+ *
+ * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
+ * to sleep in dyntick-idle mode with RCU callbacks pending. This
+ * is sized to be roughly one RCU grace period. Those energy-efficiency
+ * benchmarkers who might otherwise be tempted to set this to a large
+ * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
+ * system. And if you are -that- concerned about energy efficiency,
+ * just power the system down and be done with it!
+ * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
+ * permitted to sleep in dyntick-idle mode with only lazy RCU
+ * callbacks pending. Setting this too high can OOM your system.
+ *
+ * The values below work well in practice. If future workloads require
+ * adjustment, they can be converted into kernel config parameters, though
+ * making the state machine smarter might be a better option.
+ */
+#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
+#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+
+static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
+module_param(rcu_idle_gp_delay, int, 0644);
+static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
+module_param(rcu_idle_lazy_gp_delay, int, 0644);
+
+extern int tick_nohz_active;
+
+/*
+ * Try to advance callbacks for all flavors of RCU on the current CPU, but
+ * only if it has been awhile since the last time we did so. Afterwards,
+ * if there are any callbacks ready for immediate invocation, return true.
+ */
+static bool __maybe_unused rcu_try_advance_all_cbs(void)
+{
+ bool cbs_ready = false;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ /* Exit early if we advanced recently. */
+ if (jiffies == rdtp->last_advance_all)
+ return 0;
+ rdtp->last_advance_all = jiffies;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+
+ /*
+ * Don't bother checking unless a grace period has
+ * completed since we last checked and there are
+ * callbacks not yet ready to invoke.
+ */
+ if (rdp->completed != rnp->completed &&
+ rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+ note_gp_changes(rsp, rdp);
+
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ cbs_ready = true;
+ }
+ return cbs_ready;
+}
+
+/*
+ * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
+ * to invoke. If the CPU has callbacks, try to advance them. Tell the
+ * caller to set the timeout based on whether or not there are non-lazy
+ * callbacks.
+ *
+ * The caller must have disabled interrupts.
+ */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+int rcu_needs_cpu(int cpu, unsigned long *dj)
+{
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Snapshot to detect later posting of non-lazy callback. */
+ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+
+ /* If no callbacks, RCU doesn't need the CPU. */
+ if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ *dj = ULONG_MAX;
+ return 0;
+ }
+
+ /* Attempt to advance callbacks. */
+ if (rcu_try_advance_all_cbs()) {
+ /* Some ready to invoke, so initiate later invocation. */
+ invoke_rcu_core();
+ return 1;
+ }
+ rdtp->last_accelerate = jiffies;
+
+ /* Request timer delay depending on laziness, and round. */
+ if (!rdtp->all_lazy) {
+ *dj = round_up(rcu_idle_gp_delay + jiffies,
+ rcu_idle_gp_delay) - jiffies;
+ } else {
+ *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ }
+ return 0;
+}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Prepare a CPU for idle from an RCU perspective. The first major task
+ * is to sense whether nohz mode has been enabled or disabled via sysfs.
+ * The second major task is to check to see if a non-lazy callback has
+ * arrived at a CPU that previously had only lazy callbacks. The third
+ * major task is to accelerate (that is, assign grace-period numbers to)
+ * any recently arrived callbacks.
+ *
+ * The caller must have disabled interrupts.
+ */
+static void rcu_prepare_for_idle(int cpu)
+{
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ bool needwake;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+ int tne;
+
+ /* Handle nohz enablement switches conservatively. */
+ tne = ACCESS_ONCE(tick_nohz_active);
+ if (tne != rdtp->tick_nohz_enabled_snap) {
+ if (rcu_cpu_has_callbacks(cpu, NULL))
+ invoke_rcu_core(); /* force nohz to see update. */
+ rdtp->tick_nohz_enabled_snap = tne;
+ return;
+ }
+ if (!tne)
+ return;
+
+ /* If this is a no-CBs CPU, no callbacks, just return. */
+ if (rcu_is_nocb_cpu(cpu))
+ return;
+
+ /*
+ * If a non-lazy callback arrived at a CPU having only lazy
+ * callbacks, invoke RCU core for the side-effect of recalculating
+ * idle duration on re-entry to idle.
+ */
+ if (rdtp->all_lazy &&
+ rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+ rdtp->all_lazy = false;
+ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+ invoke_rcu_core();
+ return;
+ }
+
+ /*
+ * If we have not yet accelerated this jiffy, accelerate all
+ * callbacks on this CPU.
+ */
+ if (rdtp->last_accelerate == jiffies)
+ return;
+ rdtp->last_accelerate = jiffies;
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (!*rdp->nxttail[RCU_DONE_TAIL])
+ continue;
+ rnp = rdp->mynode;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+}
+
+/*
+ * Clean up for exit from idle. Attempt to advance callbacks based on
+ * any grace periods that elapsed while the CPU was idle, and if any
+ * callbacks are now ready to invoke, initiate invocation.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ if (rcu_is_nocb_cpu(cpu))
+ return;
+ if (rcu_try_advance_all_cbs())
+ invoke_rcu_core();
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+}
+
+/*
+ * Keep a running count of the number of non-lazy callbacks posted
+ * on this CPU. This running counter (which is never decremented) allows
+ * rcu_prepare_for_idle() to detect when something out of the idle loop
+ * posts a callback, even if an equal number of callbacks are invoked.
+ * Of course, callbacks should only be posted from within a trace event
+ * designed to be called from idle or from within RCU_NONIDLE().
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+ __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
+}
+
+/*
+ * Data for flushing lazy RCU callbacks at OOM time.
+ */
+static atomic_t oom_callback_count;
+static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
+
+/*
+ * RCU OOM callback -- decrement the outstanding count and deliver the
+ * wake-up if we are the last one.
+ */
+static void rcu_oom_callback(struct rcu_head *rhp)
+{
+ if (atomic_dec_and_test(&oom_callback_count))
+ wake_up(&oom_callback_wq);
+}
+
+/*
+ * Post an rcu_oom_notify callback on the current CPU if it has at
+ * least one lazy callback. This will unnecessarily post callbacks
+ * to CPUs that already have a non-lazy callback at the end of their
+ * callback list, but this is an infrequent operation, so accept some
+ * extra overhead to keep things simple.
+ */
+static void rcu_oom_notify_cpu(void *unused)
+{
+ struct rcu_state *rsp;
+ struct rcu_data *rdp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (rdp->qlen_lazy != 0) {
+ atomic_inc(&oom_callback_count);
+ rsp->call(&rdp->oom_head, rcu_oom_callback);
+ }
+ }
+}
+
+/*
+ * If low on memory, ensure that each CPU has a non-lazy callback.
+ * This will wake up CPUs that have only lazy callbacks, in turn
+ * ensuring that they free up the corresponding memory in a timely manner.
+ * Because an uncertain amount of memory will be freed in some uncertain
+ * timeframe, we do not claim to have freed anything.
+ */
+static int rcu_oom_notify(struct notifier_block *self,
+ unsigned long notused, void *nfreed)
+{
+ int cpu;
+
+ /* Wait for callbacks from earlier instance to complete. */
+ wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+ smp_mb(); /* Ensure callback reuse happens after callback invocation. */
+
+ /*
+ * Prevent premature wakeup: ensure that all increments happen
+ * before there is a chance of the counter reaching zero.
+ */
+ atomic_set(&oom_callback_count, 1);
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
+ cond_resched();
+ }
+ put_online_cpus();
+
+ /* Unconditionally decrement: no need to wake ourselves up. */
+ atomic_dec(&oom_callback_count);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_oom_nb = {
+ .notifier_call = rcu_oom_notify
+};
+
+static int __init rcu_register_oom_notifier(void)
+{
+ register_oom_notifier(&rcu_oom_nb);
+ return 0;
+}
+early_initcall(rcu_register_oom_notifier);
+
+#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
+
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+
+#ifdef CONFIG_RCU_FAST_NO_HZ
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
+
+ sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+ rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ulong2long(nlpd),
+ rdtp->all_lazy ? 'L' : '.',
+ rdtp->tick_nohz_enabled_snap ? '.' : 'D');
+}
+
+#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ *cp = '\0';
+}
+
+#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+/* Initiate the stall-info list. */
+static void print_cpu_stall_info_begin(void)
+{
+ pr_cont("\n");
+}
+
+/*
+ * Print out diagnostic information for the specified stalled CPU.
+ *
+ * If the specified CPU is aware of the current RCU grace period
+ * (flavor specified by rsp), then print the number of scheduling
+ * clock interrupts the CPU has taken during the time that it has
+ * been aware. Otherwise, print the number of RCU grace periods
+ * that this CPU is ignorant of, for example, "1" if the CPU was
+ * aware of the previous grace period.
+ *
+ * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ */
+static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
+{
+ char fast_no_hz[72];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = rdp->dynticks;
+ char *ticks_title;
+ unsigned long ticks_value;
+
+ if (rsp->gpnum == rdp->gpnum) {
+ ticks_title = "ticks this GP";
+ ticks_value = rdp->ticks_this_gp;
+ } else {
+ ticks_title = "GPs behind";
+ ticks_value = rsp->gpnum - rdp->gpnum;
+ }
+ print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
+ pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
+ cpu, ticks_value, ticks_title,
+ atomic_read(&rdtp->dynticks) & 0xfff,
+ rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ fast_no_hz);
+}
+
+/* Terminate the stall-info list. */
+static void print_cpu_stall_info_end(void)
+{
+ pr_err("\t");
+}
+
+/* Zero ->ticks_this_gp for all flavors of RCU. */
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+ rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
+}
+
+/* Increment ->ticks_this_gp for all flavors of RCU. */
+static void increment_cpu_stall_ticks(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ raw_cpu_inc(rsp->rda->ticks_this_gp);
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+static void print_cpu_stall_info_begin(void)
+{
+ pr_cont(" {");
+}
+
+static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
+{
+ pr_cont(" %d", cpu);
+}
+
+static void print_cpu_stall_info_end(void)
+{
+ pr_cont("} ");
+}
+
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+}
+
+static void increment_cpu_stall_ticks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For each CPU in the set, there is a
+ * kthread created that pulls the callbacks from the corresponding CPU,
+ * waits for a grace period to elapse, and invokes the callbacks.
+ * The no-CBs CPUs do a wake_up() on their kthread when they insert
+ * a callback into any empty list, unless the rcu_nocb_poll boot parameter
+ * has been specified, in which case each kthread actively polls its
+ * CPU. (Which isn't so great for energy efficiency, but which does
+ * reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callback processing could also in theory be used as
+ * an energy-efficiency measure because CPUs with no RCU callbacks
+ * queued are more aggressive about entering dyntick-idle mode.
+ */
+
+
+/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ have_rcu_nocb_mask = true;
+ cpulist_parse(str, rcu_nocb_mask);
+ return 1;
+}
+__setup("rcu_nocbs=", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = 1;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+}
+
+/*
+ * Set the root rcu_node structure's ->need_future_gp field
+ * based on the sum of those of all rcu_node structures. This does
+ * double-count the root rcu_node structure's requests, but this
+ * is necessary to handle the possibility of a rcu_nocb_kthread()
+ * having awakened during the time that the rcu_node structures
+ * were being updated for the end of the previous grace period.
+ */
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+ rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_waitqueue_head(&rnp->nocb_gp_wq[0]);
+ init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+}
+
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+/* Is the specified CPU a no-CBs CPU? */
+bool rcu_is_nocb_cpu(int cpu)
+{
+ if (have_rcu_nocb_mask)
+ return cpumask_test_cpu(cpu, rcu_nocb_mask);
+ return false;
+}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Enqueue the specified string of rcu_head structures onto the specified
+ * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
+ * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
+ * counts are supplied by rhcount and rhcount_lazy.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
+ struct rcu_head *rhp,
+ struct rcu_head **rhtp,
+ int rhcount, int rhcount_lazy,
+ unsigned long flags)
+{
+ int len;
+ struct rcu_head **old_rhpp;
+ struct task_struct *t;
+
+ /* Enqueue the callback on the nocb list and update counts. */
+ old_rhpp = xchg(&rdp->nocb_tail, rhtp);
+ ACCESS_ONCE(*old_rhpp) = rhp;
+ atomic_long_add(rhcount, &rdp->nocb_q_count);
+ atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
+
+ /* If we are not being polled and there is a kthread, awaken it ... */
+ t = ACCESS_ONCE(rdp->nocb_kthread);
+ if (rcu_nocb_poll || !t) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ len = atomic_long_read(&rdp->nocb_q_count);
+ if (old_rhpp == &rdp->nocb_head) {
+ if (!irqs_disabled_flags(flags)) {
+ wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rdp->nocb_defer_wakeup = true;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ rdp->qlen_last_fqs_check = 0;
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ wake_up_process(t); /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = LONG_MAX / 2;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
+ } else {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
+ }
+ return;
+}
+
+/*
+ * This is a helper for __call_rcu(), which invokes this when the normal
+ * callback queue is inoperable. If this is not a no-CBs CPU, this
+ * function returns failure back to __call_rcu(), which can complain
+ * appropriately.
+ *
+ * Otherwise, this function queues the callback where the corresponding
+ * "rcuo" kthread can find it.
+ */
+static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool lazy, unsigned long flags)
+{
+
+ if (!rcu_is_nocb_cpu(rdp->cpu))
+ return 0;
+ __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
+ if (__is_kfree_rcu_offset((unsigned long)rhp->func))
+ trace_rcu_kfree_callback(rdp->rsp->name, rhp,
+ (unsigned long)rhp->func,
+ -atomic_long_read(&rdp->nocb_q_count_lazy),
+ -atomic_long_read(&rdp->nocb_q_count));
+ else
+ trace_rcu_callback(rdp->rsp->name, rhp,
+ -atomic_long_read(&rdp->nocb_q_count_lazy),
+ -atomic_long_read(&rdp->nocb_q_count));
+ return 1;
+}
+
+/*
+ * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
+ * not a no-CBs CPU.
+ */
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+ struct rcu_data *rdp,
+ unsigned long flags)
+{
+ long ql = rsp->qlen;
+ long qll = rsp->qlen_lazy;
+
+ /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ if (!rcu_is_nocb_cpu(smp_processor_id()))
+ return 0;
+ rsp->qlen = 0;
+ rsp->qlen_lazy = 0;
+
+ /* First, enqueue the donelist, if any. This preserves CB ordering. */
+ if (rsp->orphan_donelist != NULL) {
+ __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
+ rsp->orphan_donetail, ql, qll, flags);
+ ql = qll = 0;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+ if (rsp->orphan_nxtlist != NULL) {
+ __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
+ rsp->orphan_nxttail, ql, qll, flags);
+ ql = qll = 0;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+ return 1;
+}
+
+/*
+ * If necessary, kick off a new grace period, and either way wait
+ * for a subsequent grace period to complete.
+ */
+static void rcu_nocb_wait_gp(struct rcu_data *rdp)
+{
+ unsigned long c;
+ bool d;
+ unsigned long flags;
+ bool needwake;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ needwake = rcu_start_future_gp(rnp, rdp, &c);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rdp->rsp);
+
+ /*
+ * Wait for the grace period. Do so interruptibly to avoid messing
+ * up the load average.
+ */
+ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+ for (;;) {
+ wait_event_interruptible(
+ rnp->nocb_gp_wq[c & 0x1],
+ (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ if (likely(d))
+ break;
+ flush_signals(current);
+ trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+ }
+ trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+ smp_mb(); /* Ensure that CB invocation happens after GP end. */
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
+ * callbacks queued by the corresponding no-CBs CPU.
+ */
+static int rcu_nocb_kthread(void *arg)
+{
+ int c, cl;
+ bool firsttime = 1;
+ struct rcu_head *list;
+ struct rcu_head *next;
+ struct rcu_head **tail;
+ struct rcu_data *rdp = arg;
+
+ /* Each pass through this loop invokes one batch of callbacks */
+ for (;;) {
+ /* If not polling, wait for next batch of callbacks. */
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("Sleep"));
+ wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
+ /* Memory barrier provide by xchg() below. */
+ } else if (firsttime) {
+ firsttime = 0;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("Poll"));
+ }
+ list = ACCESS_ONCE(rdp->nocb_head);
+ if (!list) {
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WokeEmpty"));
+ schedule_timeout_interruptible(1);
+ flush_signals(current);
+ continue;
+ }
+ firsttime = 1;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WokeNonEmpty"));
+
+ /*
+ * Extract queued callbacks, update counts, and wait
+ * for a grace period to elapse.
+ */
+ ACCESS_ONCE(rdp->nocb_head) = NULL;
+ tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+ c = atomic_long_xchg(&rdp->nocb_q_count, 0);
+ cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
+ ACCESS_ONCE(rdp->nocb_p_count) += c;
+ ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
+ rcu_nocb_wait_gp(rdp);
+
+ /* Each pass through the following loop invokes a callback. */
+ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+ c = cl = 0;
+ while (list) {
+ next = list->next;
+ /* Wait for enqueuing to complete, if needed. */
+ while (next == NULL && &list->next != tail) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WaitQueue"));
+ schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WokeQueue"));
+ next = list->next;
+ }
+ debug_rcu_head_unqueue(list);
+ local_bh_disable();
+ if (__rcu_reclaim(rdp->rsp->name, list))
+ cl++;
+ c++;
+ local_bh_enable();
+ list = next;
+ }
+ trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
+ ACCESS_ONCE(rdp->nocb_p_count) -= c;
+ ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
+ rdp->n_nocbs_invoked += c;
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+ return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (!rcu_nocb_need_deferred_wakeup(rdp))
+ return;
+ ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
+ wake_up(&rdp->nocb_wq);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ rdp->nocb_tail = &rdp->nocb_head;
+ init_waitqueue_head(&rdp->nocb_wq);
+}
+
+/* Create a kthread for each RCU flavor for each no-CBs CPU. */
+static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ struct task_struct *t;
+
+ if (rcu_nocb_mask == NULL)
+ return;
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ t = kthread_run(rcu_nocb_kthread, rdp,
+ "rcuo%c/%d", rsp->abbr, cpu);
+ BUG_ON(IS_ERR(t));
+ ACCESS_ONCE(rdp->nocb_kthread) = t;
+ }
+}
+
+/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
+static bool init_nocb_callback_list(struct rcu_data *rdp)
+{
+ if (rcu_nocb_mask == NULL ||
+ !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
+ return false;
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ return true;
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+}
+
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool lazy, unsigned long flags)
+{
+ return 0;
+}
+
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+ struct rcu_data *rdp,
+ unsigned long flags)
+{
+ return 0;
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
+{
+}
+
+static bool init_nocb_callback_list(struct rcu_data *rdp)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * An adaptive-ticks CPU can potentially execute in kernel mode for an
+ * arbitrarily long period of time with the scheduling-clock tick turned
+ * off. RCU will be paying attention to this CPU because it is in the
+ * kernel, but the CPU cannot be guaranteed to be executing the RCU state
+ * machine because the scheduling-clock tick has been disabled. Therefore,
+ * if an adaptive-ticks CPU is failing to respond to the current grace
+ * period and has not be idle from an RCU perspective, kick it.
+ */
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(cpu))
+ smp_send_reschedule(cpu);
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+}
+
+
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+
+/*
+ * Define RCU flavor that holds sysidle state. This needs to be the
+ * most active flavor of RCU.
+ */
+#ifdef CONFIG_PREEMPT_RCU
+static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+static int full_sysidle_state; /* Current system-idle state. */
+#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
+#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
+#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
+#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
+#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
+
+/*
+ * Invoked to note exit from irq or task transition to idle. Note that
+ * usermode execution does -not- count as idle here! After all, we want
+ * to detect full-system idle states, not RCU quiescent states and grace
+ * periods. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+{
+ unsigned long j;
+
+ /* Adjust nesting, check for fully idle. */
+ if (irq) {
+ rdtp->dynticks_idle_nesting--;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+ if (rdtp->dynticks_idle_nesting != 0)
+ return; /* Still not fully idle. */
+ } else {
+ if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
+ DYNTICK_TASK_NEST_VALUE) {
+ rdtp->dynticks_idle_nesting = 0;
+ } else {
+ rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+ return; /* Still not fully idle. */
+ }
+ }
+
+ /* Record start of fully idle period. */
+ j = jiffies;
+ ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+ smp_mb__before_atomic();
+ atomic_inc(&rdtp->dynticks_idle);
+ smp_mb__after_atomic();
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
+}
+
+/*
+ * Unconditionally force exit from full system-idle state. This is
+ * invoked when a normal CPU exits idle, but must be called separately
+ * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
+ * is that the timekeeping CPU is permitted to take scheduling-clock
+ * interrupts while the system is in system-idle state, and of course
+ * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
+ * interrupt from any other type of interrupt.
+ */
+void rcu_sysidle_force_exit(void)
+{
+ int oldstate = ACCESS_ONCE(full_sysidle_state);
+ int newoldstate;
+
+ /*
+ * Each pass through the following loop attempts to exit full
+ * system-idle state. If contention proves to be a problem,
+ * a trylock-based contention tree could be used here.
+ */
+ while (oldstate > RCU_SYSIDLE_SHORT) {
+ newoldstate = cmpxchg(&full_sysidle_state,
+ oldstate, RCU_SYSIDLE_NOT);
+ if (oldstate == newoldstate &&
+ oldstate == RCU_SYSIDLE_FULL_NOTED) {
+ rcu_kick_nohz_cpu(tick_do_timer_cpu);
+ return; /* We cleared it, done! */
+ }
+ oldstate = newoldstate;
+ }
+ smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
+}
+
+/*
+ * Invoked to note entry to irq or task transition from idle. Note that
+ * usermode execution does -not- count as idle here! The caller must
+ * have disabled interrupts.
+ */
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+{
+ /* Adjust nesting, check for already non-idle. */
+ if (irq) {
+ rdtp->dynticks_idle_nesting++;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+ if (rdtp->dynticks_idle_nesting != 1)
+ return; /* Already non-idle. */
+ } else {
+ /*
+ * Allow for irq misnesting. Yes, it really is possible
+ * to enter an irq handler then never leave it, and maybe
+ * also vice versa. Handle both possibilities.
+ */
+ if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
+ rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+ return; /* Already non-idle. */
+ } else {
+ rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
+ }
+ }
+
+ /* Record end of idle period. */
+ smp_mb__before_atomic();
+ atomic_inc(&rdtp->dynticks_idle);
+ smp_mb__after_atomic();
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
+
+ /*
+ * If we are the timekeeping CPU, we are permitted to be non-idle
+ * during a system-idle state. This must be the case, because
+ * the timekeeping CPU has to take scheduling-clock interrupts
+ * during the time that the system is transitioning to full
+ * system-idle state. This means that the timekeeping CPU must
+ * invoke rcu_sysidle_force_exit() directly if it does anything
+ * more than take a scheduling-clock interrupt.
+ */
+ if (smp_processor_id() == tick_do_timer_cpu)
+ return;
+
+ /* Update system-idle state: We are clearly no longer fully idle! */
+ rcu_sysidle_force_exit();
+}
+
+/*
+ * Check to see if the current CPU is idle. Note that usermode execution
+ * does not count as idle. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj)
+{
+ int cur;
+ unsigned long j;
+ struct rcu_dynticks *rdtp = rdp->dynticks;
+
+ /*
+ * If some other CPU has already reported non-idle, if this is
+ * not the flavor of RCU that tracks sysidle state, or if this
+ * is an offline or the timekeeping CPU, nothing to do.
+ */
+ if (!*isidle || rdp->rsp != rcu_sysidle_state ||
+ cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
+ return;
+ if (rcu_gp_in_progress(rdp->rsp))
+ WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+
+ /* Pick up current idle and NMI-nesting counter and check. */
+ cur = atomic_read(&rdtp->dynticks_idle);
+ if (cur & 0x1) {
+ *isidle = false; /* We are not idle! */
+ return;
+ }
+ smp_mb(); /* Read counters before timestamps. */
+
+ /* Pick up timestamps. */
+ j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+ /* If this CPU entered idle more recently, update maxj timestamp. */
+ if (ULONG_CMP_LT(*maxj, j))
+ *maxj = j;
+}
+
+/*
+ * Is this the flavor of RCU that is handling full-system idle?
+ */
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+ return rsp == rcu_sysidle_state;
+}
+
+/*
+ * Return a delay in jiffies based on the number of CPUs, rcu_node
+ * leaf fanout, and jiffies tick rate. The idea is to allow larger
+ * systems more time to transition to full-idle state in order to
+ * avoid the cache thrashing that otherwise occur on the state variable.
+ * Really small systems (less than a couple of tens of CPUs) should
+ * instead use a single global atomically incremented counter, and later
+ * versions of this will automatically reconfigure themselves accordingly.
+ */
+static unsigned long rcu_sysidle_delay(void)
+{
+ if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+ return 0;
+ return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
+}
+
+/*
+ * Advance the full-system-idle state. This is invoked when all of
+ * the non-timekeeping CPUs are idle.
+ */
+static void rcu_sysidle(unsigned long j)
+{
+ /* Check the current state. */
+ switch (ACCESS_ONCE(full_sysidle_state)) {
+ case RCU_SYSIDLE_NOT:
+
+ /* First time all are idle, so note a short idle period. */
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+ break;
+
+ case RCU_SYSIDLE_SHORT:
+
+ /*
+ * Idle for a bit, time to advance to next state?
+ * cmpxchg failure means race with non-idle, let them win.
+ */
+ if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+ (void)cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
+ break;
+
+ case RCU_SYSIDLE_LONG:
+
+ /*
+ * Do an additional check pass before advancing to full.
+ * cmpxchg failure means race with non-idle, let them win.
+ */
+ if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+ (void)cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Found a non-idle non-timekeeping CPU, so kick the system-idle state
+ * back to the beginning.
+ */
+static void rcu_sysidle_cancel(void)
+{
+ smp_mb();
+ if (full_sysidle_state > RCU_SYSIDLE_SHORT)
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+}
+
+/*
+ * Update the sysidle state based on the results of a force-quiescent-state
+ * scan of the CPUs' dyntick-idle state.
+ */
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+ unsigned long maxj, bool gpkt)
+{
+ if (rsp != rcu_sysidle_state)
+ return; /* Wrong flavor, ignore. */
+ if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+ return; /* Running state machine from timekeeping CPU. */
+ if (isidle)
+ rcu_sysidle(maxj); /* More idle! */
+ else
+ rcu_sysidle_cancel(); /* Idle is over. */
+}
+
+/*
+ * Wrapper for rcu_sysidle_report() when called from the grace-period
+ * kthread's context.
+ */
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj)
+{
+ rcu_sysidle_report(rsp, isidle, maxj, true);
+}
+
+/* Callback and function for forcing an RCU grace period. */
+struct rcu_sysidle_head {
+ struct rcu_head rh;
+ int inuse;
+};
+
+static void rcu_sysidle_cb(struct rcu_head *rhp)
+{
+ struct rcu_sysidle_head *rshp;
+
+ /*
+ * The following memory barrier is needed to replace the
+ * memory barriers that would normally be in the memory
+ * allocator.
+ */
+ smp_mb(); /* grace period precedes setting inuse. */
+
+ rshp = container_of(rhp, struct rcu_sysidle_head, rh);
+ ACCESS_ONCE(rshp->inuse) = 0;
+}
+
+/*
+ * Check to see if the system is fully idle, other than the timekeeping CPU.
+ * The caller must have disabled interrupts.
+ */
+bool rcu_sys_is_idle(void)
+{
+ static struct rcu_sysidle_head rsh;
+ int rss = ACCESS_ONCE(full_sysidle_state);
+
+ if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
+ return false;
+
+ /* Handle small-system case by doing a full scan of CPUs. */
+ if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
+ int oldrss = rss - 1;
+
+ /*
+ * One pass to advance to each state up to _FULL.
+ * Give up if any pass fails to advance the state.
+ */
+ while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
+ int cpu;
+ bool isidle = true;
+ unsigned long maxj = jiffies - ULONG_MAX / 4;
+ struct rcu_data *rdp;
+
+ /* Scan all the CPUs looking for nonidle CPUs. */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
+ rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
+ if (!isidle)
+ break;
+ }
+ rcu_sysidle_report(rcu_sysidle_state,
+ isidle, maxj, false);
+ oldrss = rss;
+ rss = ACCESS_ONCE(full_sysidle_state);
+ }
+ }
+
+ /* If this is the first observation of an idle period, record it. */
+ if (rss == RCU_SYSIDLE_FULL) {
+ rss = cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
+ return rss == RCU_SYSIDLE_FULL;
+ }
+
+ smp_mb(); /* ensure rss load happens before later caller actions. */
+
+ /* If already fully idle, tell the caller (in case of races). */
+ if (rss == RCU_SYSIDLE_FULL_NOTED)
+ return true;
+
+ /*
+ * If we aren't there yet, and a grace period is not in flight,
+ * initiate a grace period. Either way, tell the caller that
+ * we are not there yet. We use an xchg() rather than an assignment
+ * to make up for the memory barriers that would otherwise be
+ * provided by the memory allocator.
+ */
+ if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
+ !rcu_gp_in_progress(rcu_sysidle_state) &&
+ !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
+ call_rcu(&rsh.rh, rcu_sysidle_cb);
+ return false;
+}
+
+/*
+ * Initialize dynticks sysidle state for CPUs coming online.
+ */
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+ rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
+}
+
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+{
+}
+
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+{
+}
+
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj)
+{
+}
+
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+ return false;
+}
+
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj)
+{
+}
+
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+/*
+ * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
+ * grace-period kthread will do force_quiescent_state() processing?
+ * The idea is to avoid waking up RCU core processing on such a
+ * CPU unless the grace period has extended for too long.
+ *
+ * This code relies on the fact that all NO_HZ_FULL CPUs are also
+ * CONFIG_RCU_NOCB_CPU CPUs.
+ */
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(smp_processor_id()) &&
+ (!rcu_gp_in_progress(rsp) ||
+ ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
+ return 1;
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+ return 0;
+}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+
+ if (cpu < 0 || cpu >= nr_cpu_ids)
+ return;
+ if (raw_smp_processor_id() != cpu)
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+}
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index 654cfe67f0d..5cdc62e1bee 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -44,7 +44,59 @@
#include <linux/seq_file.h>
#define RCU_TREE_NONCORE
-#include "rcutree.h"
+#include "tree.h"
+
+static int r_open(struct inode *inode, struct file *file,
+ const struct seq_operations *op)
+{
+ int ret = seq_open(file, op);
+ if (!ret) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ m->private = inode->i_private;
+ }
+ return ret;
+}
+
+static void *r_start(struct seq_file *m, loff_t *pos)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ *pos = cpumask_next(*pos - 1, cpu_possible_mask);
+ if ((*pos) < nr_cpu_ids)
+ return per_cpu_ptr(rsp->rda, *pos);
+ return NULL;
+}
+
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return r_start(m, pos);
+}
+
+static void r_stop(struct seq_file *m, void *v)
+{
+}
+
+static int show_rcubarrier(struct seq_file *m, void *v)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ seq_printf(m, "bcc: %d nbd: %lu\n",
+ atomic_read(&rsp->barrier_cpu_count),
+ rsp->n_barrier_done);
+ return 0;
+}
+
+static int rcubarrier_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcubarrier, inode->i_private);
+}
+
+static const struct file_operations rcubarrier_fops = {
+ .owner = THIS_MODULE,
+ .open = rcubarrier_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = single_release,
+};
#ifdef CONFIG_RCU_BOOST
@@ -59,22 +111,26 @@ static char convert_kthread_status(unsigned int kthread_status)
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
+ long ql, qll;
+
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
+ seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->completed, rdp->gpnum,
- rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
- rdp->qs_pending);
+ ulong2long(rdp->completed), ulong2long(rdp->gpnum),
+ rdp->passed_quiesce, rdp->qs_pending);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
- seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, " ql=%ld qs=%c%c%c%c",
- rdp->qlen,
+ seq_printf(m, " of=%lu", rdp->offline_fqs);
+ rcu_nocb_q_lengths(rdp, &ql, &qll);
+ qll += rdp->qlen_lazy;
+ ql += rdp->qlen;
+ seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
+ qll, ql,
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
rdp->nxttail[RCU_NEXT_TAIL]],
".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -83,117 +139,73 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c/%d ktl=%x",
+ seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
- per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);
- seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
- rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
+ seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
+ rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
+ rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
-#define PRINT_RCU_DATA(name, func, m) \
- do { \
- int _p_r_d_i; \
- \
- for_each_possible_cpu(_p_r_d_i) \
- func(m, &per_cpu(name, _p_r_d_i)); \
- } while (0)
-
-static int show_rcudata(struct seq_file *m, void *unused)
+static int show_rcudata(struct seq_file *m, void *v)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "rcu_preempt:\n");
- PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "rcu_sched:\n");
- PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
- seq_puts(m, "rcu_bh:\n");
- PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
+ print_one_rcu_data(m, (struct rcu_data *)v);
return 0;
}
+static const struct seq_operations rcudate_op = {
+ .start = r_start,
+ .next = r_next,
+ .stop = r_stop,
+ .show = show_rcudata,
+};
+
static int rcudata_open(struct inode *inode, struct file *file)
{
- return single_open(file, show_rcudata, NULL);
+ return r_open(inode, file, &rcudate_op);
}
static const struct file_operations rcudata_fops = {
.owner = THIS_MODULE,
.open = rcudata_open,
.read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+ .llseek = no_llseek,
+ .release = seq_release,
};
-static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
+static int show_rcuexp(struct seq_file *m, void *v)
{
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
- rdp->completed, rdp->gpnum,
- rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
- rdp->qs_pending);
- seq_printf(m, ",%d,%llx,%d,%lu",
- atomic_read(&rdp->dynticks->dynticks),
- rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi_nesting,
- rdp->dynticks_fqs);
- seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
- ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
- rdp->nxttail[RCU_NEXT_TAIL]],
- ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
- rdp->nxttail[RCU_NEXT_READY_TAIL]],
- ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
- rdp->nxttail[RCU_WAIT_TAIL]],
- ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
-#ifdef CONFIG_RCU_BOOST
- seq_printf(m, ",%d,\"%c\"",
- per_cpu(rcu_cpu_has_work, rdp->cpu),
- convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
- rdp->cpu)));
-#endif /* #ifdef CONFIG_RCU_BOOST */
- seq_printf(m, ",%ld", rdp->blimit);
- seq_printf(m, ",%lu,%lu,%lu\n",
- rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
-}
-
-static int show_rcudata_csv(struct seq_file *m, void *unused)
-{
- seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
- seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
- seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
-#ifdef CONFIG_RCU_BOOST
- seq_puts(m, "\"kt\",\"ktl\"");
-#endif /* #ifdef CONFIG_RCU_BOOST */
- seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "\"rcu_preempt:\"\n");
- PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "\"rcu_sched:\"\n");
- PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
- seq_puts(m, "\"rcu_bh:\"\n");
- PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+
+ seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
+ atomic_long_read(&rsp->expedited_start),
+ atomic_long_read(&rsp->expedited_done),
+ atomic_long_read(&rsp->expedited_wrap),
+ atomic_long_read(&rsp->expedited_tryfail),
+ atomic_long_read(&rsp->expedited_workdone1),
+ atomic_long_read(&rsp->expedited_workdone2),
+ atomic_long_read(&rsp->expedited_normal),
+ atomic_long_read(&rsp->expedited_stoppedcpus),
+ atomic_long_read(&rsp->expedited_done_tries),
+ atomic_long_read(&rsp->expedited_done_lost),
+ atomic_long_read(&rsp->expedited_done_exit));
return 0;
}
-static int rcudata_csv_open(struct inode *inode, struct file *file)
+static int rcuexp_open(struct inode *inode, struct file *file)
{
- return single_open(file, show_rcudata_csv, NULL);
+ return single_open(file, show_rcuexp, inode->i_private);
}
-static const struct file_operations rcudata_csv_fops = {
+static const struct file_operations rcuexp_fops = {
.owner = THIS_MODULE,
- .open = rcudata_csv_open,
+ .open = rcuexp_open,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = no_llseek,
.release = single_release,
};
@@ -201,8 +213,7 @@ static const struct file_operations rcudata_csv_fops = {
static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
{
- seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
- "j=%04x bt=%04x\n",
+ seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
rnp->grplo, rnp->grphi,
"T."[list_empty(&rnp->blkd_tasks)],
"N."[!rnp->gp_tasks],
@@ -210,11 +221,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
"B."[!rnp->boost_tasks],
convert_kthread_status(rnp->boost_kthread_status),
rnp->n_tasks_boosted, rnp->n_exp_boosts,
- rnp->n_normal_boosts,
+ rnp->n_normal_boosts);
+ seq_printf(m, "j=%04x bt=%04x\n",
(int)(jiffies & 0xffff),
(int)(rnp->boost_time & 0xffff));
- seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
- " balk",
+ seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
rnp->n_balk_blkd_tasks,
rnp->n_balk_exp_gp_tasks,
rnp->n_balk_boost_tasks,
@@ -241,27 +252,11 @@ static const struct file_operations rcu_node_boost_fops = {
.owner = THIS_MODULE,
.open = rcu_node_boost_open,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = no_llseek,
.release = single_release,
};
-/*
- * Create the rcuboost debugfs entry. Standard error return.
- */
-static int rcu_boost_trace_create_file(struct dentry *rcudir)
-{
- return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
- &rcu_node_boost_fops);
-}
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-static int rcu_boost_trace_create_file(struct dentry *rcudir)
-{
- return 0; /* There cannot be an error if we didn't create it! */
-}
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_BOOST */
static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
{
@@ -270,15 +265,16 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
struct rcu_node *rnp;
gpnum = rsp->gpnum;
- seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
- rsp->completed, gpnum, rsp->fqs_state,
+ seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
+ ulong2long(rsp->completed), ulong2long(gpnum),
+ rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
- (int)(jiffies & 0xffff),
+ (int)(jiffies & 0xffff));
+ seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+ ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+ for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
level = rnp->level;
@@ -293,29 +289,23 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_puts(m, "\n");
}
-static int show_rcuhier(struct seq_file *m, void *unused)
+static int show_rcuhier(struct seq_file *m, void *v)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "rcu_preempt:\n");
- print_one_rcu_state(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "rcu_sched:\n");
- print_one_rcu_state(m, &rcu_sched_state);
- seq_puts(m, "rcu_bh:\n");
- print_one_rcu_state(m, &rcu_bh_state);
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ print_one_rcu_state(m, rsp);
return 0;
}
static int rcuhier_open(struct inode *inode, struct file *file)
{
- return single_open(file, show_rcuhier, NULL);
+ return single_open(file, show_rcuhier, inode->i_private);
}
static const struct file_operations rcuhier_fops = {
.owner = THIS_MODULE,
.open = rcuhier_open,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = no_llseek,
.release = single_release,
};
@@ -329,95 +319,82 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
struct rcu_node *rnp = &rsp->node[0];
raw_spin_lock_irqsave(&rnp->lock, flags);
- completed = rsp->completed;
- gpnum = rsp->gpnum;
- if (rsp->completed == rsp->gpnum)
+ completed = ACCESS_ONCE(rsp->completed);
+ gpnum = ACCESS_ONCE(rsp->gpnum);
+ if (completed == gpnum)
gpage = 0;
else
gpage = jiffies - rsp->gp_start;
gpmax = rsp->gp_max;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
- rsp->name, completed, gpnum, gpage, gpmax);
+ seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
+ ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
}
-static int show_rcugp(struct seq_file *m, void *unused)
+static int show_rcugp(struct seq_file *m, void *v)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- show_one_rcugp(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- show_one_rcugp(m, &rcu_sched_state);
- show_one_rcugp(m, &rcu_bh_state);
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ show_one_rcugp(m, rsp);
return 0;
}
static int rcugp_open(struct inode *inode, struct file *file)
{
- return single_open(file, show_rcugp, NULL);
+ return single_open(file, show_rcugp, inode->i_private);
}
static const struct file_operations rcugp_fops = {
.owner = THIS_MODULE,
.open = rcugp_open,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = no_llseek,
.release = single_release,
};
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
{
- seq_printf(m, "%3d%cnp=%ld "
- "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
- "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+ if (!rdp->beenonline)
+ return;
+ seq_printf(m, "%3d%cnp=%ld ",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->n_rcu_pending,
+ rdp->n_rcu_pending);
+ seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
rdp->n_rp_qs_pending,
rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
- rdp->n_rp_cpu_needs_gp,
+ rdp->n_rp_cpu_needs_gp);
+ seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
rdp->n_rp_gp_completed,
rdp->n_rp_gp_started,
- rdp->n_rp_need_fqs,
+ rdp->n_rp_nocb_defer_wakeup,
rdp->n_rp_need_nothing);
}
-static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
-{
- int cpu;
- struct rcu_data *rdp;
-
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
- if (rdp->beenonline)
- print_one_rcu_pending(m, rdp);
- }
-}
-
-static int show_rcu_pending(struct seq_file *m, void *unused)
+static int show_rcu_pending(struct seq_file *m, void *v)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "rcu_preempt:\n");
- print_rcu_pendings(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "rcu_sched:\n");
- print_rcu_pendings(m, &rcu_sched_state);
- seq_puts(m, "rcu_bh:\n");
- print_rcu_pendings(m, &rcu_bh_state);
+ print_one_rcu_pending(m, (struct rcu_data *)v);
return 0;
}
+static const struct seq_operations rcu_pending_op = {
+ .start = r_start,
+ .next = r_next,
+ .stop = r_stop,
+ .show = show_rcu_pending,
+};
+
static int rcu_pending_open(struct inode *inode, struct file *file)
{
- return single_open(file, show_rcu_pending, NULL);
+ return r_open(inode, file, &rcu_pending_op);
}
static const struct file_operations rcu_pending_fops = {
.owner = THIS_MODULE,
.open = rcu_pending_open,
.read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+ .llseek = no_llseek,
+ .release = seq_release,
};
static int show_rcutorture(struct seq_file *m, void *unused)
@@ -447,38 +424,58 @@ static struct dentry *rcudir;
static int __init rcutree_trace_init(void)
{
+ struct rcu_state *rsp;
struct dentry *retval;
+ struct dentry *rspdir;
rcudir = debugfs_create_dir("rcu", NULL);
if (!rcudir)
goto free_out;
- retval = debugfs_create_file("rcudata", 0444, rcudir,
- NULL, &rcudata_fops);
- if (!retval)
- goto free_out;
+ for_each_rcu_flavor(rsp) {
+ rspdir = debugfs_create_dir(rsp->name, rcudir);
+ if (!rspdir)
+ goto free_out;
- retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
- NULL, &rcudata_csv_fops);
- if (!retval)
- goto free_out;
+ retval = debugfs_create_file("rcudata", 0444,
+ rspdir, rsp, &rcudata_fops);
+ if (!retval)
+ goto free_out;
- if (rcu_boost_trace_create_file(rcudir))
- goto free_out;
+ retval = debugfs_create_file("rcuexp", 0444,
+ rspdir, rsp, &rcuexp_fops);
+ if (!retval)
+ goto free_out;
- retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
- if (!retval)
- goto free_out;
+ retval = debugfs_create_file("rcu_pending", 0444,
+ rspdir, rsp, &rcu_pending_fops);
+ if (!retval)
+ goto free_out;
- retval = debugfs_create_file("rcuhier", 0444, rcudir,
- NULL, &rcuhier_fops);
- if (!retval)
- goto free_out;
+ retval = debugfs_create_file("rcubarrier", 0444,
+ rspdir, rsp, &rcubarrier_fops);
+ if (!retval)
+ goto free_out;
- retval = debugfs_create_file("rcu_pending", 0444, rcudir,
- NULL, &rcu_pending_fops);
- if (!retval)
- goto free_out;
+#ifdef CONFIG_RCU_BOOST
+ if (rsp == &rcu_preempt_state) {
+ retval = debugfs_create_file("rcuboost", 0444,
+ rspdir, NULL, &rcu_node_boost_fops);
+ if (!retval)
+ goto free_out;
+ }
+#endif
+
+ retval = debugfs_create_file("rcugp", 0444,
+ rspdir, rsp, &rcugp_fops);
+ if (!retval)
+ goto free_out;
+
+ retval = debugfs_create_file("rcuhier", 0444,
+ rspdir, rsp, &rcuhier_fops);
+ if (!retval)
+ goto free_out;
+ }
retval = debugfs_create_file("rcutorture", 0444, rcudir,
NULL, &rcutorture_fops);
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index 2bc4e135ff2..bc788357053 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2001
*
@@ -45,12 +45,72 @@
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/module.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/rcu.h>
#include "rcu.h"
+MODULE_ALIAS("rcupdate");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcupdate."
+
+module_param(rcu_expedited, int, 0);
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting != 1) {
+ --t->rcu_read_lock_nesting;
+ } else {
+ barrier(); /* critical section before exit code. */
+ t->rcu_read_lock_nesting = INT_MIN;
+#ifdef CONFIG_PROVE_RCU_DELAY
+ udelay(10); /* Make preemption more probable. */
+#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
@@ -66,11 +126,13 @@ static struct lock_class_key rcu_sched_lock_key;
struct lockdep_map rcu_sched_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_callback_key;
+struct lockdep_map rcu_callback_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
+EXPORT_SYMBOL_GPL(rcu_callback_map);
-int debug_lockdep_rcu_enabled(void)
+int notrace debug_lockdep_rcu_enabled(void)
{
return rcu_scheduler_active && debug_locks &&
current->lockdep_recursion == 0;
@@ -88,12 +150,17 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
* section.
*
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
+ *
+ * Note that rcu_read_lock() is disallowed if the CPU is either idle or
+ * offline from an RCU perspective, so check for those as well.
*/
int rcu_read_lock_bh_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
- if (rcu_is_cpu_idle())
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
return 0;
return in_softirq() || irqs_disabled();
}
@@ -132,66 +199,18 @@ void wait_rcu_gp(call_rcu_func_t crf)
}
EXPORT_SYMBOL_GPL(wait_rcu_gp);
-#ifdef CONFIG_PROVE_RCU
-/*
- * wrapper function to avoid #include problems.
- */
-int rcu_my_thread_group_empty(void)
-{
- return thread_group_empty(current);
-}
-EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-static inline void debug_init_rcu_head(struct rcu_head *head)
+void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
-static inline void debug_rcu_head_free(struct rcu_head *head)
+void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
/*
- * fixup_init is called when:
- * - an active object is initialized
- */
-static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
-{
- struct rcu_head *head = addr;
-
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- /*
- * Ensure that queued callbacks are all executed.
- * If we detect that we are nested in a RCU read-side critical
- * section, we should simply fail, otherwise we would deadlock.
- * In !PREEMPT configurations, there is no way to tell if we are
- * in a RCU read-side critical section or not, so we never
- * attempt any fixup and just print a warning.
- */
-#ifndef CONFIG_PREEMPT
- WARN_ON_ONCE(1);
- return 0;
-#endif
- if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
- irqs_disabled()) {
- WARN_ON_ONCE(1);
- return 0;
- }
- rcu_barrier();
- rcu_barrier_sched();
- rcu_barrier_bh();
- debug_object_init(head, &rcuhead_debug_descr);
- return 1;
- default:
- return 0;
- }
-}
-
-/*
* fixup_activate is called when:
* - an active object is activated
* - an unknown object is activated (might be a statically initialized object)
@@ -211,69 +230,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
debug_object_init(head, &rcuhead_debug_descr);
debug_object_activate(head, &rcuhead_debug_descr);
return 0;
-
- case ODEBUG_STATE_ACTIVE:
- /*
- * Ensure that queued callbacks are all executed.
- * If we detect that we are nested in a RCU read-side critical
- * section, we should simply fail, otherwise we would deadlock.
- * In !PREEMPT configurations, there is no way to tell if we are
- * in a RCU read-side critical section or not, so we never
- * attempt any fixup and just print a warning.
- */
-#ifndef CONFIG_PREEMPT
- WARN_ON_ONCE(1);
- return 0;
-#endif
- if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
- irqs_disabled()) {
- WARN_ON_ONCE(1);
- return 0;
- }
- rcu_barrier();
- rcu_barrier_sched();
- rcu_barrier_bh();
- debug_object_activate(head, &rcuhead_debug_descr);
- return 1;
default:
- return 0;
- }
-}
-
-/*
- * fixup_free is called when:
- * - an active object is freed
- */
-static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
-{
- struct rcu_head *head = addr;
-
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- /*
- * Ensure that queued callbacks are all executed.
- * If we detect that we are nested in a RCU read-side critical
- * section, we should simply fail, otherwise we would deadlock.
- * In !PREEMPT configurations, there is no way to tell if we are
- * in a RCU read-side critical section or not, so we never
- * attempt any fixup and just print a warning.
- */
-#ifndef CONFIG_PREEMPT
- WARN_ON_ONCE(1);
- return 0;
-#endif
- if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
- irqs_disabled()) {
- WARN_ON_ONCE(1);
- return 0;
- }
- rcu_barrier();
- rcu_barrier_sched();
- rcu_barrier_bh();
- debug_object_free(head, &rcuhead_debug_descr);
return 1;
- default:
- return 0;
}
}
@@ -312,19 +270,83 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
- .fixup_init = rcuhead_fixup_init,
.fixup_activate = rcuhead_fixup_activate,
- .fixup_free = rcuhead_fixup_free,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
-void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old, unsigned long c)
{
- trace_rcu_torture_read(rcutorturename, rhp);
+ trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+#endif
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
#endif
+
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
+module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
+
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
deleted file mode 100644
index 9cb1ae4aabd..00000000000
--- a/kernel/rcutiny_plugin.h
+++ /dev/null
@@ -1,1073 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
- RCU_TRACE(long qlen); /* Number of pending CBs. */
- RCU_TRACE(char *name); /* Name of RCU type. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_sched")
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_bh")
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-#ifdef CONFIG_TINY_PREEMPT_RCU
-
-#include <linux/delay.h>
-
-/* Global control variables for preemptible RCU. */
-struct rcu_preempt_ctrlblk {
- struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
- struct rcu_head **nexttail;
- /* Tasks blocked in a preemptible RCU */
- /* read-side critical section while an */
- /* preemptible-RCU grace period is in */
- /* progress must wait for a later grace */
- /* period. This pointer points to the */
- /* ->next pointer of the last task that */
- /* must wait for a later grace period, or */
- /* to &->rcb.rcucblist if there is no */
- /* such task. */
- struct list_head blkd_tasks;
- /* Tasks blocked in RCU read-side critical */
- /* section. Tasks are placed at the head */
- /* of this list and age towards the tail. */
- struct list_head *gp_tasks;
- /* Pointer to the first task blocking the */
- /* current grace period, or NULL if there */
- /* is no such task. */
- struct list_head *exp_tasks;
- /* Pointer to first task blocking the */
- /* current expedited grace period, or NULL */
- /* if there is no such task. If there */
- /* is no current expedited grace period, */
- /* then there cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
- struct list_head *boost_tasks;
- /* Pointer to first task that needs to be */
- /* priority-boosted, or NULL if no priority */
- /* boosting is needed. If there is no */
- /* current or expedited grace period, there */
- /* can be no such task. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
- u8 gpnum; /* Current grace period. */
- u8 gpcpu; /* Last grace period blocked by the CPU. */
- u8 completed; /* Last grace period completed. */
- /* If all three are equal, RCU is idle. */
-#ifdef CONFIG_RCU_BOOST
- unsigned long boost_time; /* When to start boosting (jiffies) */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-#ifdef CONFIG_RCU_TRACE
- unsigned long n_grace_periods;
-#ifdef CONFIG_RCU_BOOST
- unsigned long n_tasks_boosted;
- /* Total number of tasks boosted. */
- unsigned long n_exp_boosts;
- /* Number of tasks boosted for expedited GP. */
- unsigned long n_normal_boosts;
- /* Number of tasks boosted for normal GP. */
- unsigned long n_balk_blkd_tasks;
- /* Refused to boost: no blocked tasks. */
- unsigned long n_balk_exp_gp_tasks;
- /* Refused to boost: nothing blocking GP. */
- unsigned long n_balk_boost_tasks;
- /* Refused to boost: already boosting. */
- unsigned long n_balk_notyet;
- /* Refused to boost: not yet time. */
- unsigned long n_balk_nos;
- /* Refused to boost: not sure why, though. */
- /* This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-#endif /* #ifdef CONFIG_RCU_TRACE */
-};
-
-static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
- .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
- .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
- .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
- .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
- RCU_TRACE(.rcb.name = "rcu_preempt")
-};
-
-static int rcu_preempted_readers_exp(void);
-static void rcu_report_exp_done(void);
-
-/*
- * Return true if the CPU has not yet responded to the current grace period.
- */
-static int rcu_cpu_blocking_cur_gp(void)
-{
- return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
-}
-
-/*
- * Check for a running RCU reader. Because there is only one CPU,
- * there can be but one running RCU reader at a time. ;-)
- */
-static int rcu_preempt_running_reader(void)
-{
- return current->rcu_read_lock_nesting;
-}
-
-/*
- * Check for preempted RCU readers blocking any grace period.
- * If the caller needs a reliable answer, it must disable hard irqs.
- */
-static int rcu_preempt_blocked_readers_any(void)
-{
- return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
-}
-
-/*
- * Check for preempted RCU readers blocking the current grace period.
- * If the caller needs a reliable answer, it must disable hard irqs.
- */
-static int rcu_preempt_blocked_readers_cgp(void)
-{
- return rcu_preempt_ctrlblk.gp_tasks != NULL;
-}
-
-/*
- * Return true if another preemptible-RCU grace period is needed.
- */
-static int rcu_preempt_needs_another_gp(void)
-{
- return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
-}
-
-/*
- * Return true if a preemptible-RCU grace period is in progress.
- * The caller must disable hardirqs.
- */
-static int rcu_preempt_gp_in_progress(void)
-{
- return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
-}
-
-/*
- * Advance a ->blkd_tasks-list pointer to the next entry, instead
- * returning NULL if at the end of the list.
- */
-static struct list_head *rcu_next_node_entry(struct task_struct *t)
-{
- struct list_head *np;
-
- np = t->rcu_node_entry.next;
- if (np == &rcu_preempt_ctrlblk.blkd_tasks)
- np = NULL;
- return np;
-}
-
-#ifdef CONFIG_RCU_TRACE
-
-#ifdef CONFIG_RCU_BOOST
-static void rcu_initiate_boost_trace(void);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Dump additional statistice for TINY_PREEMPT_RCU.
- */
-static void show_tiny_preempt_stats(struct seq_file *m)
-{
- seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
- rcu_preempt_ctrlblk.rcb.qlen,
- rcu_preempt_ctrlblk.n_grace_periods,
- rcu_preempt_ctrlblk.gpnum,
- rcu_preempt_ctrlblk.gpcpu,
- rcu_preempt_ctrlblk.completed,
- "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
- "N."[!rcu_preempt_ctrlblk.gp_tasks],
- "E."[!rcu_preempt_ctrlblk.exp_tasks]);
-#ifdef CONFIG_RCU_BOOST
- seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
- " ",
- "B."[!rcu_preempt_ctrlblk.boost_tasks],
- rcu_preempt_ctrlblk.n_tasks_boosted,
- rcu_preempt_ctrlblk.n_exp_boosts,
- rcu_preempt_ctrlblk.n_normal_boosts,
- (int)(jiffies & 0xffff),
- (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
- seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
- " balk",
- rcu_preempt_ctrlblk.n_balk_blkd_tasks,
- rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
- rcu_preempt_ctrlblk.n_balk_boost_tasks,
- rcu_preempt_ctrlblk.n_balk_notyet,
- rcu_preempt_ctrlblk.n_balk_nos);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-#ifdef CONFIG_RCU_BOOST
-
-#include "rtmutex_common.h"
-
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-
-/* Controls for rcu_kthread() kthread. */
-static struct task_struct *rcu_kthread_task;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
-static unsigned long have_rcu_kthread_work;
-
-/*
- * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
- * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
- */
-static int rcu_boost(void)
-{
- unsigned long flags;
- struct rt_mutex mtx;
- struct task_struct *t;
- struct list_head *tb;
-
- if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
- rcu_preempt_ctrlblk.exp_tasks == NULL)
- return 0; /* Nothing to boost. */
-
- raw_local_irq_save(flags);
-
- /*
- * Recheck with irqs disabled: all tasks in need of boosting
- * might exit their RCU read-side critical sections on their own
- * if we are preempted just before disabling irqs.
- */
- if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
- rcu_preempt_ctrlblk.exp_tasks == NULL) {
- raw_local_irq_restore(flags);
- return 0;
- }
-
- /*
- * Preferentially boost tasks blocking expedited grace periods.
- * This cannot starve the normal grace periods because a second
- * expedited grace period must boost all blocked tasks, including
- * those blocking the pre-existing normal grace period.
- */
- if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
- tb = rcu_preempt_ctrlblk.exp_tasks;
- RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
- } else {
- tb = rcu_preempt_ctrlblk.boost_tasks;
- RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
- }
- RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
-
- /*
- * We boost task t by manufacturing an rt_mutex that appears to
- * be held by task t. We leave a pointer to that rt_mutex where
- * task t can find it, and task t will release the mutex when it
- * exits its outermost RCU read-side critical section. Then
- * simply acquiring this artificial rt_mutex will boost task
- * t's priority. (Thanks to tglx for suggesting this approach!)
- */
- t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&mtx, t);
- t->rcu_boost_mutex = &mtx;
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
- raw_local_irq_restore(flags);
- rt_mutex_lock(&mtx);
- rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
-
- return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
- ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
-}
-
-/*
- * Check to see if it is now time to start boosting RCU readers blocking
- * the current grace period, and, if so, tell the rcu_kthread_task to
- * start boosting them. If there is an expedited boost in progress,
- * we wait for it to complete.
- *
- * If there are no blocked readers blocking the current grace period,
- * return 0 to let the caller know, otherwise return 1. Note that this
- * return value is independent of whether or not boosting was done.
- */
-static int rcu_initiate_boost(void)
-{
- if (!rcu_preempt_blocked_readers_cgp() &&
- rcu_preempt_ctrlblk.exp_tasks == NULL) {
- RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
- return 0;
- }
- if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
- (rcu_preempt_ctrlblk.gp_tasks != NULL &&
- rcu_preempt_ctrlblk.boost_tasks == NULL &&
- ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
- if (rcu_preempt_ctrlblk.exp_tasks == NULL)
- rcu_preempt_ctrlblk.boost_tasks =
- rcu_preempt_ctrlblk.gp_tasks;
- invoke_rcu_callbacks();
- } else
- RCU_TRACE(rcu_initiate_boost_trace());
- return 1;
-}
-
-#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
-
-/*
- * Do priority-boost accounting for the start of a new grace period.
- */
-static void rcu_preempt_boost_start_gp(void)
-{
- rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
-}
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * If there is no RCU priority boosting, we don't initiate boosting,
- * but we do indicate whether there are blocked readers blocking the
- * current grace period.
- */
-static int rcu_initiate_boost(void)
-{
- return rcu_preempt_blocked_readers_cgp();
-}
-
-/*
- * If there is no RCU priority boosting, nothing to do at grace-period start.
- */
-static void rcu_preempt_boost_start_gp(void)
-{
-}
-
-#endif /* else #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Record a preemptible-RCU quiescent state for the specified CPU. Note
- * that this just means that the task currently running on the CPU is
- * in a quiescent state. There might be any number of tasks blocked
- * while in an RCU read-side critical section.
- *
- * Unlike the other rcu_*_qs() functions, callers to this function
- * must disable irqs in order to protect the assignment to
- * ->rcu_read_unlock_special.
- *
- * Because this is a single-CPU implementation, the only way a grace
- * period can end is if the CPU is in a quiescent state. The reason is
- * that a blocked preemptible-RCU reader can exit its critical section
- * only if the CPU is running it at the time. Therefore, when the
- * last task blocking the current grace period exits its RCU read-side
- * critical section, neither the CPU nor blocked tasks will be stopping
- * the current grace period. (In contrast, SMP implementations
- * might have CPUs running in RCU read-side critical sections that
- * block later grace periods -- but this is not possible given only
- * one CPU.)
- */
-static void rcu_preempt_cpu_qs(void)
-{
- /* Record both CPU and task as having responded to current GP. */
- rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
- current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
-
- /* If there is no GP then there is nothing more to do. */
- if (!rcu_preempt_gp_in_progress())
- return;
- /*
- * Check up on boosting. If there are readers blocking the
- * current grace period, leave.
- */
- if (rcu_initiate_boost())
- return;
-
- /* Advance callbacks. */
- rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
- rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
- rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
-
- /* If there are no blocked readers, next GP is done instantly. */
- if (!rcu_preempt_blocked_readers_any())
- rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
-
- /* If there are done callbacks, cause them to be invoked. */
- if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
- invoke_rcu_callbacks();
-}
-
-/*
- * Start a new RCU grace period if warranted. Hard irqs must be disabled.
- */
-static void rcu_preempt_start_gp(void)
-{
- if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
-
- /* Official start of GP. */
- rcu_preempt_ctrlblk.gpnum++;
- RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
-
- /* Any blocked RCU readers block new GP. */
- if (rcu_preempt_blocked_readers_any())
- rcu_preempt_ctrlblk.gp_tasks =
- rcu_preempt_ctrlblk.blkd_tasks.next;
-
- /* Set up for RCU priority boosting. */
- rcu_preempt_boost_start_gp();
-
- /* If there is no running reader, CPU is done with GP. */
- if (!rcu_preempt_running_reader())
- rcu_preempt_cpu_qs();
- }
-}
-
-/*
- * We have entered the scheduler, and the current task might soon be
- * context-switched away from. If this task is in an RCU read-side
- * critical section, we will no longer be able to rely on the CPU to
- * record that fact, so we enqueue the task on the blkd_tasks list.
- * If the task started after the current grace period began, as recorded
- * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
- * before the element referenced by ->gp_tasks (or at the tail if
- * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
- * The task will dequeue itself when it exits the outermost enclosing
- * RCU read-side critical section. Therefore, the current grace period
- * cannot be permitted to complete until the ->gp_tasks pointer becomes
- * NULL.
- *
- * Caller must disable preemption.
- */
-void rcu_preempt_note_context_switch(void)
-{
- struct task_struct *t = current;
- unsigned long flags;
-
- local_irq_save(flags); /* must exclude scheduler_tick(). */
- if (rcu_preempt_running_reader() &&
- (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
-
- /* Possibly blocking in an RCU read-side critical section. */
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
-
- /*
- * If this CPU has already checked in, then this task
- * will hold up the next grace period rather than the
- * current grace period. Queue the task accordingly.
- * If the task is queued for the current grace period
- * (i.e., this CPU has not yet passed through a quiescent
- * state for the current grace period), then as long
- * as that task remains queued, the current grace period
- * cannot end.
- */
- list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
- if (rcu_cpu_blocking_cur_gp())
- rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
- }
-
- /*
- * Either we were not in an RCU read-side critical section to
- * begin with, or we have now recorded that critical section
- * globally. Either way, we can now note a quiescent state
- * for this CPU. Again, if we were in an RCU read-side critical
- * section, and if that critical section was blocking the current
- * grace period, then the fact that the task has been enqueued
- * means that current grace period continues to be blocked.
- */
- rcu_preempt_cpu_qs();
- local_irq_restore(flags);
-}
-
-/*
- * Tiny-preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Handle special cases during rcu_read_unlock(), such as needing to
- * notify RCU core processing or task having blocked during the RCU
- * read-side critical section.
- */
-static void rcu_read_unlock_special(struct task_struct *t)
-{
- int empty;
- int empty_exp;
- unsigned long flags;
- struct list_head *np;
- int special;
-
- /*
- * NMI handlers cannot block and cannot safely manipulate state.
- * They therefore cannot possibly be special, so just leave.
- */
- if (in_nmi())
- return;
-
- local_irq_save(flags);
-
- /*
- * If RCU core is waiting for this CPU to exit critical section,
- * let it know that we have done so.
- */
- special = t->rcu_read_unlock_special;
- if (special & RCU_READ_UNLOCK_NEED_QS)
- rcu_preempt_cpu_qs();
-
- /* Hardware IRQ handlers cannot block. */
- if (in_irq()) {
- local_irq_restore(flags);
- return;
- }
-
- /* Clean up if blocked during RCU read-side critical section. */
- if (special & RCU_READ_UNLOCK_BLOCKED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
-
- /*
- * Remove this task from the ->blkd_tasks list and adjust
- * any pointers that might have been referencing it.
- */
- empty = !rcu_preempt_blocked_readers_cgp();
- empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
- np = rcu_next_node_entry(t);
- list_del_init(&t->rcu_node_entry);
- if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
- rcu_preempt_ctrlblk.gp_tasks = np;
- if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
- rcu_preempt_ctrlblk.exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
- rcu_preempt_ctrlblk.boost_tasks = np;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- /*
- * If this was the last task on the current list, and if
- * we aren't waiting on the CPU, report the quiescent state
- * and start a new grace period if needed.
- */
- if (!empty && !rcu_preempt_blocked_readers_cgp()) {
- rcu_preempt_cpu_qs();
- rcu_preempt_start_gp();
- }
-
- /*
- * If this was the last task on the expedited lists,
- * then we need wake up the waiting task.
- */
- if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
- rcu_report_exp_done();
- }
-#ifdef CONFIG_RCU_BOOST
- /* Unboost self if was boosted. */
- if (special & RCU_READ_UNLOCK_BOOSTED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
- rt_mutex_unlock(t->rcu_boost_mutex);
- t->rcu_boost_mutex = NULL;
- }
-#endif /* #ifdef CONFIG_RCU_BOOST */
- local_irq_restore(flags);
-}
-
-/*
- * Tiny-preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
- --t->rcu_read_lock_nesting;
- barrier(); /* decrement before load of ->rcu_read_unlock_special */
- if (t->rcu_read_lock_nesting == 0 &&
- unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
-#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-/*
- * Check for a quiescent state from the current CPU. When a task blocks,
- * the task is recorded in the rcu_preempt_ctrlblk structure, which is
- * checked elsewhere. This is called from the scheduling-clock interrupt.
- *
- * Caller must disable hard irqs.
- */
-static void rcu_preempt_check_callbacks(void)
-{
- struct task_struct *t = current;
-
- if (rcu_preempt_gp_in_progress() &&
- (!rcu_preempt_running_reader() ||
- !rcu_cpu_blocking_cur_gp()))
- rcu_preempt_cpu_qs();
- if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
- rcu_preempt_ctrlblk.rcb.donetail)
- invoke_rcu_callbacks();
- if (rcu_preempt_gp_in_progress() &&
- rcu_cpu_blocking_cur_gp() &&
- rcu_preempt_running_reader())
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
-}
-
-/*
- * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from rcu_process_callbacks() to
- * handle that case. Of course, it is invoked for all flavors of
- * RCU, but RCU callbacks can appear only on one of the lists, and
- * neither ->nexttail nor ->donetail can possibly be NULL, so there
- * is no need for an explicit check.
- */
-static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
-{
- if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
- rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
-}
-
-/*
- * Process callbacks for preemptible RCU.
- */
-static void rcu_preempt_process_callbacks(void)
-{
- __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
-}
-
-/*
- * Queue a preemptible -RCU callback for invocation after a grace period.
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
-
- debug_rcu_head_queue(head);
- head->func = func;
- head->next = NULL;
-
- local_irq_save(flags);
- *rcu_preempt_ctrlblk.nexttail = head;
- rcu_preempt_ctrlblk.nexttail = &head->next;
- RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
- rcu_preempt_start_gp(); /* checks to see if GP needed. */
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/*
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void synchronize_rcu(void)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- if (!rcu_scheduler_active)
- return;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
- WARN_ON_ONCE(rcu_preempt_running_reader());
- if (!rcu_preempt_blocked_readers_any())
- return;
-
- /* Once we get past the fastpath checks, same code as rcu_barrier(). */
- rcu_barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
-static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
-
-/*
- * Return non-zero if there are any tasks in RCU read-side critical
- * sections blocking the current preemptible-RCU expedited grace period.
- * If there is no preemptible-RCU expedited grace period currently in
- * progress, returns zero unconditionally.
- */
-static int rcu_preempted_readers_exp(void)
-{
- return rcu_preempt_ctrlblk.exp_tasks != NULL;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period.
- */
-static void rcu_report_exp_done(void)
-{
- wake_up(&sync_rcu_preempt_exp_wq);
-}
-
-/*
- * Wait for an rcu-preempt grace period, but expedite it. The basic idea
- * is to rely in the fact that there is but one CPU, and that it is
- * illegal for a task to invoke synchronize_rcu_expedited() while in a
- * preemptible-RCU read-side critical section. Therefore, any such
- * critical sections must correspond to blocked tasks, which must therefore
- * be on the ->blkd_tasks list. So just record the current head of the
- * list in the ->exp_tasks pointer, and wait for all tasks including and
- * after the task pointed to by ->exp_tasks to drain.
- */
-void synchronize_rcu_expedited(void)
-{
- unsigned long flags;
- struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
- unsigned long snap;
-
- barrier(); /* ensure prior action seen before grace period. */
-
- WARN_ON_ONCE(rcu_preempt_running_reader());
-
- /*
- * Acquire lock so that there is only one preemptible RCU grace
- * period in flight. Of course, if someone does the expedited
- * grace period for us while we are acquiring the lock, just leave.
- */
- snap = sync_rcu_preempt_exp_count + 1;
- mutex_lock(&sync_rcu_preempt_exp_mutex);
- if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
- goto unlock_mb_ret; /* Others did our work for us. */
-
- local_irq_save(flags);
-
- /*
- * All RCU readers have to already be on blkd_tasks because
- * we cannot legally be executing in an RCU read-side critical
- * section.
- */
-
- /* Snapshot current head of ->blkd_tasks list. */
- rpcp->exp_tasks = rpcp->blkd_tasks.next;
- if (rpcp->exp_tasks == &rpcp->blkd_tasks)
- rpcp->exp_tasks = NULL;
-
- /* Wait for tail of ->blkd_tasks list to drain. */
- if (!rcu_preempted_readers_exp())
- local_irq_restore(flags);
- else {
- rcu_initiate_boost();
- local_irq_restore(flags);
- wait_event(sync_rcu_preempt_exp_wq,
- !rcu_preempted_readers_exp());
- }
-
- /* Clean up and exit. */
- barrier(); /* ensure expedited GP seen before counter increment. */
- sync_rcu_preempt_exp_count++;
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
- barrier(); /* ensure subsequent action seen after grace period. */
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-/*
- * Does preemptible RCU need the CPU to stay out of dynticks mode?
- */
-int rcu_preempt_needs_cpu(void)
-{
- if (!rcu_preempt_running_reader())
- rcu_preempt_cpu_qs();
- return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
-}
-
-/*
- * Check for a task exiting while in a preemptible -RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
-#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
-
-#ifdef CONFIG_RCU_TRACE
-
-/*
- * Because preemptible RCU does not exist, it is not necessary to
- * dump out its statistics.
- */
-static void show_tiny_preempt_stats(struct seq_file *m)
-{
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to check.
- */
-static void rcu_preempt_check_callbacks(void)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to remove.
- */
-static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to process.
- */
-static void rcu_preempt_process_callbacks(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
-
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_callbacks(void)
-{
- have_rcu_kthread_work = 1;
- wake_up(&rcu_kthread_wq);
-}
-
-#ifdef CONFIG_RCU_TRACE
-
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return rcu_kthread_task == current;
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * This kthread invokes RCU callbacks whose grace periods have
- * elapsed. It is awakened as needed, and takes the place of the
- * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
- * This is a kthread, but it is never stopped, at least not until
- * the system goes down.
- */
-static int rcu_kthread(void *arg)
-{
- unsigned long work;
- unsigned long morework;
- unsigned long flags;
-
- for (;;) {
- wait_event_interruptible(rcu_kthread_wq,
- have_rcu_kthread_work != 0);
- morework = rcu_boost();
- local_irq_save(flags);
- work = have_rcu_kthread_work;
- have_rcu_kthread_work = morework;
- local_irq_restore(flags);
- if (work)
- rcu_process_callbacks(NULL);
- schedule_timeout_interruptible(1); /* Leave CPU for others. */
- }
-
- return 0; /* Not reached, but needed to shut gcc up. */
-}
-
-/*
- * Spawn the kthread that invokes RCU callbacks.
- */
-static int __init rcu_spawn_kthreads(void)
-{
- struct sched_param sp;
-
- rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
- sp.sched_priority = RCU_BOOST_PRIO;
- sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
- return 0;
-}
-early_initcall(rcu_spawn_kthreads);
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Start up softirq processing of callbacks.
- */
-void invoke_rcu_callbacks(void)
-{
- raise_softirq(RCU_SOFTIRQ);
-}
-
-#ifdef CONFIG_RCU_TRACE
-
-/*
- * There is no callback kthread, so this thread is never it.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-void rcu_init(void)
-{
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#include <linux/kernel_stat.h>
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
-}
-
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-#ifdef CONFIG_RCU_TRACE
-
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_initiate_boost_trace(void)
-{
- if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
- rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
- else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
- rcu_preempt_ctrlblk.exp_tasks == NULL)
- rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
- else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
- rcu_preempt_ctrlblk.n_balk_boost_tasks++;
- else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
- rcu_preempt_ctrlblk.n_balk_notyet++;
- else
- rcu_preempt_ctrlblk.n_balk_nos++;
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
-{
- unsigned long flags;
-
- raw_local_irq_save(flags);
- rcp->qlen -= n;
- raw_local_irq_restore(flags);
-}
-
-/*
- * Dump statistics for TINY_RCU, such as they are.
- */
-static int show_tiny_stats(struct seq_file *m, void *unused)
-{
- show_tiny_preempt_stats(m);
- seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
- seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
- return 0;
-}
-
-static int show_tiny_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_tiny_stats, NULL);
-}
-
-static const struct file_operations show_tiny_stats_fops = {
- .owner = THIS_MODULE,
- .open = show_tiny_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutiny_trace_init(void)
-{
- struct dentry *retval;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
- retval = debugfs_create_file("rcudata", 0444, rcudir,
- NULL, &show_tiny_stats_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-
-static void __exit rcutiny_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-module_init(rcutiny_trace_init);
-module_exit(rcutiny_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
-MODULE_LICENSE("GPL");
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
deleted file mode 100644
index a58ac285fc6..00000000000
--- a/kernel/rcutorture.c
+++ /dev/null
@@ -1,1833 +0,0 @@
-/*
- * Read-Copy Update module-based torture test facility
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2005, 2006
- *
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
- * Josh Triplett <josh@freedesktop.org>
- *
- * See also: Documentation/RCU/torture.txt
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/err.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/reboot.h>
-#include <linux/freezer.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-#include <linux/stat.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-#include <asm/byteorder.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
- "Josh Triplett <josh@freedesktop.org>");
-
-static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
-static int nfakewriters = 4; /* # fake writer threads */
-static int stat_interval; /* Interval between stats, in seconds. */
- /* Defaults to "only at end of test". */
-static bool verbose; /* Print more debug info. */
-static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
-static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
-static int stutter = 5; /* Start/stop testing interval (in sec) */
-static int irqreader = 1; /* RCU readers from irq (timers). */
-static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
-static int fqs_holdoff; /* Hold time within burst (us). */
-static int fqs_stutter = 3; /* Wait time between bursts (s). */
-static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
-static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
-static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
-static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
-static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
-static char *torture_type = "rcu"; /* What RCU implementation to torture. */
-
-module_param(nreaders, int, 0444);
-MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-module_param(nfakewriters, int, 0444);
-MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-module_param(stat_interval, int, 0644);
-MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-module_param(verbose, bool, 0444);
-MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-module_param(test_no_idle_hz, bool, 0444);
-MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
-module_param(shuffle_interval, int, 0444);
-MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-module_param(stutter, int, 0444);
-MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
-module_param(irqreader, int, 0444);
-MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
-module_param(fqs_duration, int, 0444);
-MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
-module_param(fqs_holdoff, int, 0444);
-MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
-module_param(fqs_stutter, int, 0444);
-MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
-module_param(onoff_interval, int, 0444);
-MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
-module_param(shutdown_secs, int, 0444);
-MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
-module_param(test_boost, int, 0444);
-MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-module_param(test_boost_interval, int, 0444);
-MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
-module_param(test_boost_duration, int, 0444);
-MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
-module_param(torture_type, charp, 0444);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
-
-#define TORTURE_FLAG "-torture:"
-#define PRINTK_STRING(s) \
- do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_STRING(s) \
- do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_ERRSTRING(s) \
- do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
-
-static char printk_buf[4096];
-
-static int nrealreaders;
-static struct task_struct *writer_task;
-static struct task_struct **fakewriter_tasks;
-static struct task_struct **reader_tasks;
-static struct task_struct *stats_task;
-static struct task_struct *shuffler_task;
-static struct task_struct *stutter_task;
-static struct task_struct *fqs_task;
-static struct task_struct *boost_tasks[NR_CPUS];
-static struct task_struct *shutdown_task;
-#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *onoff_task;
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-#define RCU_TORTURE_PIPE_LEN 10
-
-struct rcu_torture {
- struct rcu_head rtort_rcu;
- int rtort_pipe_count;
- struct list_head rtort_free;
- int rtort_mbtest;
-};
-
-static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture __rcu *rcu_torture_current;
-static unsigned long rcu_torture_current_version;
-static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
-static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
- { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
- { 0 };
-static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
-static atomic_t n_rcu_torture_alloc;
-static atomic_t n_rcu_torture_alloc_fail;
-static atomic_t n_rcu_torture_free;
-static atomic_t n_rcu_torture_mberror;
-static atomic_t n_rcu_torture_error;
-static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
-static long n_rcu_torture_boost_failure;
-static long n_rcu_torture_boosts;
-static long n_rcu_torture_timers;
-static long n_offline_attempts;
-static long n_offline_successes;
-static long n_online_attempts;
-static long n_online_successes;
-static struct list_head rcu_torture_removed;
-static cpumask_var_t shuffle_tmp_mask;
-
-static int stutter_pause_test;
-
-#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
-#define RCUTORTURE_RUNNABLE_INIT 1
-#else
-#define RCUTORTURE_RUNNABLE_INIT 0
-#endif
-int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
-module_param(rcutorture_runnable, int, 0444);
-MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
-
-#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
-#define rcu_can_boost() 1
-#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-#define rcu_can_boost() 0
-#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-
-static unsigned long shutdown_time; /* jiffies to system shutdown. */
-static unsigned long boost_starttime; /* jiffies of next boost test start. */
-DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
- /* and boost task create/destroy. */
-
-/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
-
-#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
-#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
-#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
-static int fullstop = FULLSTOP_RMMOD;
-/*
- * Protect fullstop transitions and spawning of kthreads.
- */
-static DEFINE_MUTEX(fullstop_mutex);
-
-/* Forward reference. */
-static void rcu_torture_cleanup(void);
-
-/*
- * Detect and respond to a system shutdown.
- */
-static int
-rcutorture_shutdown_notify(struct notifier_block *unused1,
- unsigned long unused2, void *unused3)
-{
- mutex_lock(&fullstop_mutex);
- if (fullstop == FULLSTOP_DONTSTOP)
- fullstop = FULLSTOP_SHUTDOWN;
- else
- printk(KERN_WARNING /* but going down anyway, so... */
- "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
- mutex_unlock(&fullstop_mutex);
- return NOTIFY_DONE;
-}
-
-/*
- * Absorb kthreads into a kernel function that won't return, so that
- * they won't ever access module text or data again.
- */
-static void rcutorture_shutdown_absorb(char *title)
-{
- if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
- printk(KERN_NOTICE
- "rcutorture thread %s parking due to system shutdown\n",
- title);
- schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
- }
-}
-
-/*
- * Allocate an element from the rcu_tortures pool.
- */
-static struct rcu_torture *
-rcu_torture_alloc(void)
-{
- struct list_head *p;
-
- spin_lock_bh(&rcu_torture_lock);
- if (list_empty(&rcu_torture_freelist)) {
- atomic_inc(&n_rcu_torture_alloc_fail);
- spin_unlock_bh(&rcu_torture_lock);
- return NULL;
- }
- atomic_inc(&n_rcu_torture_alloc);
- p = rcu_torture_freelist.next;
- list_del_init(p);
- spin_unlock_bh(&rcu_torture_lock);
- return container_of(p, struct rcu_torture, rtort_free);
-}
-
-/*
- * Free an element to the rcu_tortures pool.
- */
-static void
-rcu_torture_free(struct rcu_torture *p)
-{
- atomic_inc(&n_rcu_torture_free);
- spin_lock_bh(&rcu_torture_lock);
- list_add_tail(&p->rtort_free, &rcu_torture_freelist);
- spin_unlock_bh(&rcu_torture_lock);
-}
-
-struct rcu_random_state {
- unsigned long rrs_state;
- long rrs_count;
-};
-
-#define RCU_RANDOM_MULT 39916801 /* prime */
-#define RCU_RANDOM_ADD 479001701 /* prime */
-#define RCU_RANDOM_REFRESH 10000
-
-#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
-
-/*
- * Crude but fast random-number generator. Uses a linear congruential
- * generator, with occasional help from cpu_clock().
- */
-static unsigned long
-rcu_random(struct rcu_random_state *rrsp)
-{
- if (--rrsp->rrs_count < 0) {
- rrsp->rrs_state += (unsigned long)local_clock();
- rrsp->rrs_count = RCU_RANDOM_REFRESH;
- }
- rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
- return swahw32(rrsp->rrs_state);
-}
-
-static void
-rcu_stutter_wait(char *title)
-{
- while (stutter_pause_test || !rcutorture_runnable) {
- if (rcutorture_runnable)
- schedule_timeout_interruptible(1);
- else
- schedule_timeout_interruptible(round_jiffies_relative(HZ));
- rcutorture_shutdown_absorb(title);
- }
-}
-
-/*
- * Operations vector for selecting different types of tests.
- */
-
-struct rcu_torture_ops {
- void (*init)(void);
- void (*cleanup)(void);
- int (*readlock)(void);
- void (*read_delay)(struct rcu_random_state *rrsp);
- void (*readunlock)(int idx);
- int (*completed)(void);
- void (*deferred_free)(struct rcu_torture *p);
- void (*sync)(void);
- void (*cb_barrier)(void);
- void (*fqs)(void);
- int (*stats)(char *page);
- int irq_capable;
- int can_boost;
- char *name;
-};
-
-static struct rcu_torture_ops *cur_ops;
-
-/*
- * Definitions for rcu torture testing.
- */
-
-static int rcu_torture_read_lock(void) __acquires(RCU)
-{
- rcu_read_lock();
- return 0;
-}
-
-static void rcu_read_delay(struct rcu_random_state *rrsp)
-{
- const unsigned long shortdelay_us = 200;
- const unsigned long longdelay_ms = 50;
-
- /* We want a short delay sometimes to make a reader delay the grace
- * period, and we want a long delay occasionally to trigger
- * force_quiescent_state. */
-
- if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
- udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
- if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
- preempt_schedule(); /* No QS if preempt_disable() in effect */
-#endif
-}
-
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static int rcu_torture_completed(void)
-{
- return rcu_batches_completed();
-}
-
-static void
-rcu_torture_cb(struct rcu_head *p)
-{
- int i;
- struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
-
- if (fullstop != FULLSTOP_DONTSTOP) {
- /* Test is ending, just drop callbacks on the floor. */
- /* The next initialization will pick up the pieces. */
- return;
- }
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- rcu_torture_free(rp);
- } else
- cur_ops->deferred_free(rp);
-}
-
-static int rcu_no_completed(void)
-{
- return 0;
-}
-
-static void rcu_torture_deferred_free(struct rcu_torture *p)
-{
- call_rcu(&p->rtort_rcu, rcu_torture_cb);
-}
-
-static struct rcu_torture_ops rcu_ops = {
- .init = NULL,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .name = "rcu"
-};
-
-static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
-{
- int i;
- struct rcu_torture *rp;
- struct rcu_torture *rp1;
-
- cur_ops->sync();
- list_add(&p->rtort_free, &rcu_torture_removed);
- list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- list_del(&rp->rtort_free);
- rcu_torture_free(rp);
- }
- }
-}
-
-static void rcu_sync_torture_init(void)
-{
- INIT_LIST_HEAD(&rcu_torture_removed);
-}
-
-static struct rcu_torture_ops rcu_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu,
- .cb_barrier = NULL,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .name = "rcu_sync"
-};
-
-static struct rcu_torture_ops rcu_expedited_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu_expedited,
- .cb_barrier = NULL,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .name = "rcu_expedited"
-};
-
-/*
- * Definitions for rcu_bh torture testing.
- */
-
-static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
-{
- rcu_read_lock_bh();
- return 0;
-}
-
-static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
-{
- rcu_read_unlock_bh();
-}
-
-static int rcu_bh_torture_completed(void)
-{
- return rcu_batches_completed_bh();
-}
-
-static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
-{
- call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
-}
-
-static struct rcu_torture_ops rcu_bh_ops = {
- .init = NULL,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferred_free = rcu_bh_torture_deferred_free,
- .sync = synchronize_rcu_bh,
- .cb_barrier = rcu_barrier_bh,
- .fqs = rcu_bh_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "rcu_bh"
-};
-
-static struct rcu_torture_ops rcu_bh_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu_bh,
- .cb_barrier = NULL,
- .fqs = rcu_bh_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "rcu_bh_sync"
-};
-
-static struct rcu_torture_ops rcu_bh_expedited_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu_bh_expedited,
- .cb_barrier = NULL,
- .fqs = rcu_bh_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "rcu_bh_expedited"
-};
-
-/*
- * Definitions for srcu torture testing.
- */
-
-static struct srcu_struct srcu_ctl;
-
-static void srcu_torture_init(void)
-{
- init_srcu_struct(&srcu_ctl);
- rcu_sync_torture_init();
-}
-
-static void srcu_torture_cleanup(void)
-{
- synchronize_srcu(&srcu_ctl);
- cleanup_srcu_struct(&srcu_ctl);
-}
-
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
-{
- return srcu_read_lock(&srcu_ctl);
-}
-
-static void srcu_read_delay(struct rcu_random_state *rrsp)
-{
- long delay;
- const long uspertick = 1000000 / HZ;
- const long longdelay = 10;
-
- /* We want there to be long-running readers, but not all the time. */
-
- delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
- schedule_timeout_interruptible(longdelay);
- else
- rcu_read_delay(rrsp);
-}
-
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
-{
- srcu_read_unlock(&srcu_ctl, idx);
-}
-
-static int srcu_torture_completed(void)
-{
- return srcu_batches_completed(&srcu_ctl);
-}
-
-static void srcu_torture_synchronize(void)
-{
- synchronize_srcu(&srcu_ctl);
-}
-
-static int srcu_torture_stats(char *page)
-{
- int cnt = 0;
- int cpu;
- int idx = srcu_ctl.completed & 0x1;
-
- cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
- per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
- per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
- }
- cnt += sprintf(&page[cnt], "\n");
- return cnt;
-}
-
-static struct rcu_torture_ops srcu_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
- .readlock = srcu_torture_read_lock,
- .read_delay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock,
- .completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu"
-};
-
-static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
-{
- return srcu_read_lock_raw(&srcu_ctl);
-}
-
-static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
-{
- srcu_read_unlock_raw(&srcu_ctl, idx);
-}
-
-static struct rcu_torture_ops srcu_raw_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
- .readlock = srcu_torture_read_lock_raw,
- .read_delay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock_raw,
- .completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu_raw"
-};
-
-static void srcu_torture_synchronize_expedited(void)
-{
- synchronize_srcu_expedited(&srcu_ctl);
-}
-
-static struct rcu_torture_ops srcu_expedited_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
- .readlock = srcu_torture_read_lock,
- .read_delay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock,
- .completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize_expedited,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu_expedited"
-};
-
-/*
- * Definitions for sched torture testing.
- */
-
-static int sched_torture_read_lock(void)
-{
- preempt_disable();
- return 0;
-}
-
-static void sched_torture_read_unlock(int idx)
-{
- preempt_enable();
-}
-
-static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
-{
- call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
-}
-
-static struct rcu_torture_ops sched_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sched_torture_deferred_free,
- .sync = synchronize_sched,
- .cb_barrier = rcu_barrier_sched,
- .fqs = rcu_sched_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "sched"
-};
-
-static struct rcu_torture_ops sched_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_sched,
- .cb_barrier = NULL,
- .fqs = rcu_sched_force_quiescent_state,
- .stats = NULL,
- .name = "sched_sync"
-};
-
-static struct rcu_torture_ops sched_expedited_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
- .sync = synchronize_sched_expedited,
- .cb_barrier = NULL,
- .fqs = rcu_sched_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "sched_expedited"
-};
-
-/*
- * RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
- */
-
-struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
-};
-
-static void rcu_torture_boost_cb(struct rcu_head *head)
-{
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
-
- smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
- rbip->inflight = 0;
-}
-
-static int rcu_torture_boost(void *arg)
-{
- unsigned long call_rcu_time;
- unsigned long endtime;
- unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
- struct sched_param sp;
-
- VERBOSE_PRINTK_STRING("rcu_torture_boost started");
-
- /* Set real-time priority. */
- sp.sched_priority = 1;
- if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
- n_rcu_torture_boost_rterror++;
- }
-
- init_rcu_head_on_stack(&rbi.rcu);
- /* Each pass through the following loop does one boost-test cycle. */
- do {
- /* Wait for the next test interval. */
- oldstarttime = boost_starttime;
- while (ULONG_CMP_LT(jiffies, oldstarttime)) {
- schedule_timeout_uninterruptible(1);
- rcu_stutter_wait("rcu_torture_boost");
- if (kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP)
- goto checkwait;
- }
-
- /* Do one boost-test interval. */
- endtime = oldstarttime + test_boost_duration * HZ;
- call_rcu_time = jiffies;
- while (ULONG_CMP_LT(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!rbi.inflight) {
- smp_mb(); /* RCU core before ->inflight = 1. */
- rbi.inflight = 1;
- call_rcu(&rbi.rcu, rcu_torture_boost_cb);
- if (jiffies - call_rcu_time >
- test_boost_duration * HZ - HZ / 2) {
- VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
- n_rcu_torture_boost_failure++;
- }
- call_rcu_time = jiffies;
- }
- cond_resched();
- rcu_stutter_wait("rcu_torture_boost");
- if (kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP)
- goto checkwait;
- }
-
- /*
- * Set the start time of the next test interval.
- * Yes, this is vulnerable to long delays, but such
- * delays simply cause a false negative for the next
- * interval. Besides, we are running at RT priority,
- * so delays should be relatively rare.
- */
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
- if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
- n_rcu_torture_boosts++;
- mutex_unlock(&boost_mutex);
- break;
- }
- schedule_timeout_uninterruptible(1);
- }
-
- /* Go do the stutter. */
-checkwait: rcu_stutter_wait("rcu_torture_boost");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
-
- /* Clean up and exit. */
- VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
- rcutorture_shutdown_absorb("rcu_torture_boost");
- while (!kthread_should_stop() || rbi.inflight)
- schedule_timeout_uninterruptible(1);
- smp_mb(); /* order accesses to ->inflight before stack-frame death. */
- destroy_rcu_head_on_stack(&rbi.rcu);
- return 0;
-}
-
-/*
- * RCU torture force-quiescent-state kthread. Repeatedly induces
- * bursts of calls to force_quiescent_state(), increasing the probability
- * of occurrence of some important types of race conditions.
- */
-static int
-rcu_torture_fqs(void *arg)
-{
- unsigned long fqs_resume_time;
- int fqs_burst_remaining;
-
- VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
- do {
- fqs_resume_time = jiffies + fqs_stutter * HZ;
- while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
- !kthread_should_stop()) {
- schedule_timeout_interruptible(1);
- }
- fqs_burst_remaining = fqs_duration;
- while (fqs_burst_remaining > 0 &&
- !kthread_should_stop()) {
- cur_ops->fqs();
- udelay(fqs_holdoff);
- fqs_burst_remaining -= fqs_holdoff;
- }
- rcu_stutter_wait("rcu_torture_fqs");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
- rcutorture_shutdown_absorb("rcu_torture_fqs");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
- return 0;
-}
-
-/*
- * RCU torture writer kthread. Repeatedly substitutes a new structure
- * for that pointed to by rcu_torture_current, freeing the old structure
- * after a series of grace periods (the "pipeline").
- */
-static int
-rcu_torture_writer(void *arg)
-{
- int i;
- long oldbatch = rcu_batches_completed();
- struct rcu_torture *rp;
- struct rcu_torture *old_rp;
- static DEFINE_RCU_RANDOM(rand);
-
- VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
- set_user_nice(current, 19);
-
- do {
- schedule_timeout_uninterruptible(1);
- rp = rcu_torture_alloc();
- if (rp == NULL)
- continue;
- rp->rtort_pipe_count = 0;
- udelay(rcu_random(&rand) & 0x3ff);
- old_rp = rcu_dereference_check(rcu_torture_current,
- current == writer_task);
- rp->rtort_mbtest = 1;
- rcu_assign_pointer(rcu_torture_current, rp);
- smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
- if (old_rp) {
- i = old_rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- old_rp->rtort_pipe_count++;
- cur_ops->deferred_free(old_rp);
- }
- rcutorture_record_progress(++rcu_torture_current_version);
- oldbatch = cur_ops->completed();
- rcu_stutter_wait("rcu_torture_writer");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
- rcutorture_shutdown_absorb("rcu_torture_writer");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
- return 0;
-}
-
-/*
- * RCU torture fake writer kthread. Repeatedly calls sync, with a random
- * delay between calls.
- */
-static int
-rcu_torture_fakewriter(void *arg)
-{
- DEFINE_RCU_RANDOM(rand);
-
- VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
- set_user_nice(current, 19);
-
- do {
- schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
- udelay(rcu_random(&rand) & 0x3ff);
- cur_ops->sync();
- rcu_stutter_wait("rcu_torture_fakewriter");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
-
- VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
- rcutorture_shutdown_absorb("rcu_torture_fakewriter");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
- return 0;
-}
-
-void rcutorture_trace_dump(void)
-{
- static atomic_t beenhere = ATOMIC_INIT(0);
-
- if (atomic_read(&beenhere))
- return;
- if (atomic_xchg(&beenhere, 1) != 0)
- return;
- do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
- ftrace_dump(DUMP_ALL);
-}
-
-/*
- * RCU torture reader from timer handler. Dereferences rcu_torture_current,
- * incrementing the corresponding element of the pipeline array. The
- * counter in the element should never be greater than 1, otherwise, the
- * RCU implementation is broken.
- */
-static void rcu_torture_timer(unsigned long unused)
-{
- int idx;
- int completed;
- static DEFINE_RCU_RANDOM(rand);
- static DEFINE_SPINLOCK(rand_lock);
- struct rcu_torture *p;
- int pipe_count;
-
- idx = cur_ops->readlock();
- completed = cur_ops->completed();
- p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
- if (p == NULL) {
- /* Leave because rcu_torture_writer is not yet underway */
- cur_ops->readunlock(idx);
- return;
- }
- if (p->rtort_mbtest == 0)
- atomic_inc(&n_rcu_torture_mberror);
- spin_lock(&rand_lock);
- cur_ops->read_delay(&rand);
- n_rcu_torture_timers++;
- spin_unlock(&rand_lock);
- preempt_disable();
- pipe_count = p->rtort_pipe_count;
- if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
- pipe_count = RCU_TORTURE_PIPE_LEN;
- }
- if (pipe_count > 1)
- rcutorture_trace_dump();
- __this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
- if (completed > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
- completed = RCU_TORTURE_PIPE_LEN;
- }
- __this_cpu_inc(rcu_torture_batch[completed]);
- preempt_enable();
- cur_ops->readunlock(idx);
-}
-
-/*
- * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
- * incrementing the corresponding element of the pipeline array. The
- * counter in the element should never be greater than 1, otherwise, the
- * RCU implementation is broken.
- */
-static int
-rcu_torture_reader(void *arg)
-{
- int completed;
- int idx;
- DEFINE_RCU_RANDOM(rand);
- struct rcu_torture *p;
- int pipe_count;
- struct timer_list t;
-
- VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
- set_user_nice(current, 19);
- if (irqreader && cur_ops->irq_capable)
- setup_timer_on_stack(&t, rcu_torture_timer, 0);
-
- do {
- if (irqreader && cur_ops->irq_capable) {
- if (!timer_pending(&t))
- mod_timer(&t, jiffies + 1);
- }
- idx = cur_ops->readlock();
- completed = cur_ops->completed();
- p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
- if (p == NULL) {
- /* Wait for rcu_torture_writer to get underway */
- cur_ops->readunlock(idx);
- schedule_timeout_interruptible(HZ);
- continue;
- }
- if (p->rtort_mbtest == 0)
- atomic_inc(&n_rcu_torture_mberror);
- cur_ops->read_delay(&rand);
- preempt_disable();
- pipe_count = p->rtort_pipe_count;
- if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
- pipe_count = RCU_TORTURE_PIPE_LEN;
- }
- if (pipe_count > 1)
- rcutorture_trace_dump();
- __this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
- if (completed > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
- completed = RCU_TORTURE_PIPE_LEN;
- }
- __this_cpu_inc(rcu_torture_batch[completed]);
- preempt_enable();
- cur_ops->readunlock(idx);
- schedule();
- rcu_stutter_wait("rcu_torture_reader");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
- rcutorture_shutdown_absorb("rcu_torture_reader");
- if (irqreader && cur_ops->irq_capable)
- del_timer_sync(&t);
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
- return 0;
-}
-
-/*
- * Create an RCU-torture statistics message in the specified buffer.
- */
-static int
-rcu_torture_printk(char *page)
-{
- int cnt = 0;
- int cpu;
- int i;
- long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
- long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
-
- for_each_possible_cpu(cpu) {
- for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
- batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
- }
- }
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
- if (pipesummary[i] != 0)
- break;
- }
- cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
- cnt += sprintf(&page[cnt],
- "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d rtbke: %ld rtbre: %ld "
- "rtbf: %ld rtb: %ld nt: %ld "
- "onoff: %ld/%ld:%ld/%ld",
- rcu_torture_current,
- rcu_torture_current_version,
- list_empty(&rcu_torture_freelist),
- atomic_read(&n_rcu_torture_alloc),
- atomic_read(&n_rcu_torture_alloc_fail),
- atomic_read(&n_rcu_torture_free),
- atomic_read(&n_rcu_torture_mberror),
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror,
- n_rcu_torture_boost_failure,
- n_rcu_torture_boosts,
- n_rcu_torture_timers,
- n_online_successes,
- n_online_attempts,
- n_offline_successes,
- n_offline_attempts);
- if (atomic_read(&n_rcu_torture_mberror) != 0 ||
- n_rcu_torture_boost_ktrerror != 0 ||
- n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_failure != 0)
- cnt += sprintf(&page[cnt], " !!!");
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- if (i > 1) {
- cnt += sprintf(&page[cnt], "!!! ");
- atomic_inc(&n_rcu_torture_error);
- WARN_ON_ONCE(1);
- }
- cnt += sprintf(&page[cnt], "Reader Pipe: ");
- for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- cnt += sprintf(&page[cnt], "Reader Batch: ");
- for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
- for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- cnt += sprintf(&page[cnt], " %d",
- atomic_read(&rcu_torture_wcount[i]));
- }
- cnt += sprintf(&page[cnt], "\n");
- if (cur_ops->stats)
- cnt += cur_ops->stats(&page[cnt]);
- return cnt;
-}
-
-/*
- * Print torture statistics. Caller must ensure that there is only
- * one call to this function at a given time!!! This is normally
- * accomplished by relying on the module system to only have one copy
- * of the module loaded, and then by giving the rcu_torture_stats
- * kthread full control (or the init/cleanup functions when rcu_torture_stats
- * thread is not running).
- */
-static void
-rcu_torture_stats_print(void)
-{
- int cnt;
-
- cnt = rcu_torture_printk(printk_buf);
- printk(KERN_ALERT "%s", printk_buf);
-}
-
-/*
- * Periodically prints torture statistics, if periodic statistics printing
- * was specified via the stat_interval module parameter.
- *
- * No need to worry about fullstop here, since this one doesn't reference
- * volatile state or register callbacks.
- */
-static int
-rcu_torture_stats(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
- do {
- schedule_timeout_interruptible(stat_interval * HZ);
- rcu_torture_stats_print();
- rcutorture_shutdown_absorb("rcu_torture_stats");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
- return 0;
-}
-
-static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
-
-/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
- * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
- */
-static void rcu_torture_shuffle_tasks(void)
-{
- int i;
-
- cpumask_setall(shuffle_tmp_mask);
- get_online_cpus();
-
- /* No point in shuffling if there is only one online CPU (ex: UP) */
- if (num_online_cpus() == 1) {
- put_online_cpus();
- return;
- }
-
- if (rcu_idle_cpu != -1)
- cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
-
- set_cpus_allowed_ptr(current, shuffle_tmp_mask);
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- if (reader_tasks[i])
- set_cpus_allowed_ptr(reader_tasks[i],
- shuffle_tmp_mask);
- }
-
- if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
- if (fakewriter_tasks[i])
- set_cpus_allowed_ptr(fakewriter_tasks[i],
- shuffle_tmp_mask);
- }
-
- if (writer_task)
- set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
-
- if (stats_task)
- set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
-
- if (rcu_idle_cpu == -1)
- rcu_idle_cpu = num_online_cpus() - 1;
- else
- rcu_idle_cpu--;
-
- put_online_cpus();
-}
-
-/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
- * system to become idle at a time and cut off its timer ticks. This is meant
- * to test the support for such tickless idle CPU in RCU.
- */
-static int
-rcu_torture_shuffle(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
- do {
- schedule_timeout_interruptible(shuffle_interval * HZ);
- rcu_torture_shuffle_tasks();
- rcutorture_shutdown_absorb("rcu_torture_shuffle");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
- return 0;
-}
-
-/* Cause the rcutorture test to "stutter", starting and stopping all
- * threads periodically.
- */
-static int
-rcu_torture_stutter(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
- do {
- schedule_timeout_interruptible(stutter * HZ);
- stutter_pause_test = 1;
- if (!kthread_should_stop())
- schedule_timeout_interruptible(stutter * HZ);
- stutter_pause_test = 0;
- rcutorture_shutdown_absorb("rcu_torture_stutter");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
- return 0;
-}
-
-static inline void
-rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
-{
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "--- %s: nreaders=%d nfakewriters=%d "
- "stat_interval=%d verbose=%d test_no_idle_hz=%d "
- "shuffle_interval=%d stutter=%d irqreader=%d "
- "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
- "test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
- "onoff_interval=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
- stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
- test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
- onoff_interval);
-}
-
-static struct notifier_block rcutorture_shutdown_nb = {
- .notifier_call = rcutorture_shutdown_notify,
-};
-
-static void rcutorture_booster_cleanup(int cpu)
-{
- struct task_struct *t;
-
- if (boost_tasks[cpu] == NULL)
- return;
- mutex_lock(&boost_mutex);
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
- t = boost_tasks[cpu];
- boost_tasks[cpu] = NULL;
- mutex_unlock(&boost_mutex);
-
- /* This must be outside of the mutex, otherwise deadlock! */
- kthread_stop(t);
-}
-
-static int rcutorture_booster_init(int cpu)
-{
- int retval;
-
- if (boost_tasks[cpu] != NULL)
- return 0; /* Already created, nothing more to do. */
-
- /* Don't allow time recalculation while creating a new task. */
- mutex_lock(&boost_mutex);
- VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
- if (IS_ERR(boost_tasks[cpu])) {
- retval = PTR_ERR(boost_tasks[cpu]);
- VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
- n_rcu_torture_boost_ktrerror++;
- boost_tasks[cpu] = NULL;
- mutex_unlock(&boost_mutex);
- return retval;
- }
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
- mutex_unlock(&boost_mutex);
- return 0;
-}
-
-/*
- * Cause the rcutorture test to shutdown the system after the test has
- * run for the time specified by the shutdown_secs module parameter.
- */
-static int
-rcu_torture_shutdown(void *arg)
-{
- long delta;
- unsigned long jiffies_snap;
-
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
- jiffies_snap = ACCESS_ONCE(jiffies);
- while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
- !kthread_should_stop()) {
- delta = shutdown_time - jiffies_snap;
- if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_shutdown task: %lu "
- "jiffies remaining\n",
- torture_type, delta);
- schedule_timeout_interruptible(delta);
- jiffies_snap = ACCESS_ONCE(jiffies);
- }
- if (kthread_should_stop()) {
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
- return 0;
- }
-
- /* OK, shut down the system. */
-
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
- shutdown_task = NULL; /* Avoid self-kill deadlock. */
- rcu_torture_cleanup(); /* Get the success/failure message. */
- kernel_power_off(); /* Shut down the system. */
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Execute random CPU-hotplug operations at the interval specified
- * by the onoff_interval.
- */
-static int __cpuinit
-rcu_torture_onoff(void *arg)
-{
- int cpu;
- int maxcpu = -1;
- DEFINE_RCU_RANDOM(rand);
-
- VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
- for_each_online_cpu(cpu)
- maxcpu = cpu;
- WARN_ON(maxcpu < 0);
- while (!kthread_should_stop()) {
- cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
- if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
- if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlining %d\n",
- torture_type, cpu);
- n_offline_attempts++;
- if (cpu_down(cpu) == 0) {
- if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: "
- "offlined %d\n",
- torture_type, cpu);
- n_offline_successes++;
- }
- } else if (cpu_is_hotpluggable(cpu)) {
- if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlining %d\n",
- torture_type, cpu);
- n_online_attempts++;
- if (cpu_up(cpu) == 0) {
- if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: "
- "onlined %d\n",
- torture_type, cpu);
- n_online_successes++;
- }
- }
- schedule_timeout_interruptible(onoff_interval * HZ);
- }
- VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
- return 0;
-}
-
-static int __cpuinit
-rcu_torture_onoff_init(void)
-{
- if (onoff_interval <= 0)
- return 0;
- onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
- if (IS_ERR(onoff_task)) {
- onoff_task = NULL;
- return PTR_ERR(onoff_task);
- }
- return 0;
-}
-
-static void rcu_torture_onoff_cleanup(void)
-{
- if (onoff_task == NULL)
- return;
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
- kthread_stop(onoff_task);
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void
-rcu_torture_onoff_init(void)
-{
-}
-
-static void rcu_torture_onoff_cleanup(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-static int rcutorture_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- (void)rcutorture_booster_init(cpu);
- break;
- case CPU_DOWN_PREPARE:
- rcutorture_booster_cleanup(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block rcutorture_cpu_nb = {
- .notifier_call = rcutorture_cpu_notify,
-};
-
-static void
-rcu_torture_cleanup(void)
-{
- int i;
-
- mutex_lock(&fullstop_mutex);
- rcutorture_record_test_transition();
- if (fullstop == FULLSTOP_SHUTDOWN) {
- printk(KERN_WARNING /* but going down anyway, so... */
- "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
- mutex_unlock(&fullstop_mutex);
- schedule_timeout_uninterruptible(10);
- if (cur_ops->cb_barrier != NULL)
- cur_ops->cb_barrier();
- return;
- }
- fullstop = FULLSTOP_RMMOD;
- mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_shutdown_nb);
- if (stutter_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
- kthread_stop(stutter_task);
- }
- stutter_task = NULL;
- if (shuffler_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
- kthread_stop(shuffler_task);
- free_cpumask_var(shuffle_tmp_mask);
- }
- shuffler_task = NULL;
-
- if (writer_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
- kthread_stop(writer_task);
- }
- writer_task = NULL;
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++) {
- if (reader_tasks[i]) {
- VERBOSE_PRINTK_STRING(
- "Stopping rcu_torture_reader task");
- kthread_stop(reader_tasks[i]);
- }
- reader_tasks[i] = NULL;
- }
- kfree(reader_tasks);
- reader_tasks = NULL;
- }
- rcu_torture_current = NULL;
-
- if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++) {
- if (fakewriter_tasks[i]) {
- VERBOSE_PRINTK_STRING(
- "Stopping rcu_torture_fakewriter task");
- kthread_stop(fakewriter_tasks[i]);
- }
- fakewriter_tasks[i] = NULL;
- }
- kfree(fakewriter_tasks);
- fakewriter_tasks = NULL;
- }
-
- if (stats_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
- kthread_stop(stats_task);
- }
- stats_task = NULL;
-
- if (fqs_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
- kthread_stop(fqs_task);
- }
- fqs_task = NULL;
- if ((test_boost == 1 && cur_ops->can_boost) ||
- test_boost == 2) {
- unregister_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i)
- rcutorture_booster_cleanup(i);
- }
- if (shutdown_task != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
- kthread_stop(shutdown_task);
- }
- rcu_torture_onoff_cleanup();
-
- /* Wait for all RCU callbacks to fire. */
-
- if (cur_ops->cb_barrier != NULL)
- cur_ops->cb_barrier();
-
- rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
-
- if (cur_ops->cleanup)
- cur_ops->cleanup();
- if (atomic_read(&n_rcu_torture_error))
- rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
- else
- rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
-}
-
-static int __init
-rcu_torture_init(void)
-{
- int i;
- int cpu;
- int firsterr = 0;
- static struct rcu_torture_ops *torture_ops[] =
- { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
- &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
- &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
- &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
-
- mutex_lock(&fullstop_mutex);
-
- /* Process args and tell the world that the torturer is on the job. */
- for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
- cur_ops = torture_ops[i];
- if (strcmp(torture_type, cur_ops->name) == 0)
- break;
- }
- if (i == ARRAY_SIZE(torture_ops)) {
- printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
- torture_type);
- printk(KERN_ALERT "rcu-torture types:");
- for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
- printk(KERN_ALERT " %s", torture_ops[i]->name);
- printk(KERN_ALERT "\n");
- mutex_unlock(&fullstop_mutex);
- return -EINVAL;
- }
- if (cur_ops->fqs == NULL && fqs_duration != 0) {
- printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
- "fqs_duration, fqs disabled.\n");
- fqs_duration = 0;
- }
- if (cur_ops->init)
- cur_ops->init(); /* no "goto unwind" prior to this point!!! */
-
- if (nreaders >= 0)
- nrealreaders = nreaders;
- else
- nrealreaders = 2 * num_online_cpus();
- rcu_torture_print_module_parms(cur_ops, "Start of test");
- fullstop = FULLSTOP_DONTSTOP;
-
- /* Set up the freelist. */
-
- INIT_LIST_HEAD(&rcu_torture_freelist);
- for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
- rcu_tortures[i].rtort_mbtest = 0;
- list_add_tail(&rcu_tortures[i].rtort_free,
- &rcu_torture_freelist);
- }
-
- /* Initialize the statistics so that each run gets its own numbers. */
-
- rcu_torture_current = NULL;
- rcu_torture_current_version = 0;
- atomic_set(&n_rcu_torture_alloc, 0);
- atomic_set(&n_rcu_torture_alloc_fail, 0);
- atomic_set(&n_rcu_torture_free, 0);
- atomic_set(&n_rcu_torture_mberror, 0);
- atomic_set(&n_rcu_torture_error, 0);
- n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
- n_rcu_torture_boost_failure = 0;
- n_rcu_torture_boosts = 0;
- for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- atomic_set(&rcu_torture_wcount[i], 0);
- for_each_possible_cpu(cpu) {
- for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- per_cpu(rcu_torture_count, cpu)[i] = 0;
- per_cpu(rcu_torture_batch, cpu)[i] = 0;
- }
- }
-
- /* Start up the kthreads. */
-
- VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
- writer_task = kthread_run(rcu_torture_writer, NULL,
- "rcu_torture_writer");
- if (IS_ERR(writer_task)) {
- firsterr = PTR_ERR(writer_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
- writer_task = NULL;
- goto unwind;
- }
- fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
- if (fakewriter_tasks == NULL) {
- VERBOSE_PRINTK_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < nfakewriters; i++) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
- fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
- "rcu_torture_fakewriter");
- if (IS_ERR(fakewriter_tasks[i])) {
- firsterr = PTR_ERR(fakewriter_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
- fakewriter_tasks[i] = NULL;
- goto unwind;
- }
- }
- reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
- GFP_KERNEL);
- if (reader_tasks == NULL) {
- VERBOSE_PRINTK_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < nrealreaders; i++) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
- reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
- "rcu_torture_reader");
- if (IS_ERR(reader_tasks[i])) {
- firsterr = PTR_ERR(reader_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
- reader_tasks[i] = NULL;
- goto unwind;
- }
- }
- if (stat_interval > 0) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
- stats_task = kthread_run(rcu_torture_stats, NULL,
- "rcu_torture_stats");
- if (IS_ERR(stats_task)) {
- firsterr = PTR_ERR(stats_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
- stats_task = NULL;
- goto unwind;
- }
- }
- if (test_no_idle_hz) {
- rcu_idle_cpu = num_online_cpus() - 1;
-
- if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
- firsterr = -ENOMEM;
- VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
- goto unwind;
- }
-
- /* Create the shuffler thread */
- shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
- "rcu_torture_shuffle");
- if (IS_ERR(shuffler_task)) {
- free_cpumask_var(shuffle_tmp_mask);
- firsterr = PTR_ERR(shuffler_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
- shuffler_task = NULL;
- goto unwind;
- }
- }
- if (stutter < 0)
- stutter = 0;
- if (stutter) {
- /* Create the stutter thread */
- stutter_task = kthread_run(rcu_torture_stutter, NULL,
- "rcu_torture_stutter");
- if (IS_ERR(stutter_task)) {
- firsterr = PTR_ERR(stutter_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
- stutter_task = NULL;
- goto unwind;
- }
- }
- if (fqs_duration < 0)
- fqs_duration = 0;
- if (fqs_duration) {
- /* Create the stutter thread */
- fqs_task = kthread_run(rcu_torture_fqs, NULL,
- "rcu_torture_fqs");
- if (IS_ERR(fqs_task)) {
- firsterr = PTR_ERR(fqs_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
- fqs_task = NULL;
- goto unwind;
- }
- }
- if (test_boost_interval < 1)
- test_boost_interval = 1;
- if (test_boost_duration < 2)
- test_boost_duration = 2;
- if ((test_boost == 1 && cur_ops->can_boost) ||
- test_boost == 2) {
- int retval;
-
- boost_starttime = jiffies + test_boost_interval * HZ;
- register_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i) {
- if (cpu_is_offline(i))
- continue; /* Heuristic: CPU can go offline. */
- retval = rcutorture_booster_init(i);
- if (retval < 0) {
- firsterr = retval;
- goto unwind;
- }
- }
- }
- if (shutdown_secs > 0) {
- shutdown_time = jiffies + shutdown_secs * HZ;
- shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
- "rcu_torture_shutdown");
- if (IS_ERR(shutdown_task)) {
- firsterr = PTR_ERR(shutdown_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
- shutdown_task = NULL;
- goto unwind;
- }
- }
- rcu_torture_onoff_init();
- register_reboot_notifier(&rcutorture_shutdown_nb);
- rcutorture_record_test_transition();
- mutex_unlock(&fullstop_mutex);
- return 0;
-
-unwind:
- mutex_unlock(&fullstop_mutex);
- rcu_torture_cleanup();
- return firsterr;
-}
-
-module_init(rcu_torture_init);
-module_exit(rcu_torture_cleanup);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
deleted file mode 100644
index 6c4a6722abf..00000000000
--- a/kernel/rcutree.c
+++ /dev/null
@@ -1,2289 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2008
- *
- * Authors: Dipankar Sarma <dipankar@in.ibm.com>
- * Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
- *
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/nmi.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/export.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/time.h>
-#include <linux/kernel_stat.h>
-#include <linux/wait.h>
-#include <linux/kthread.h>
-#include <linux/prefetch.h>
-
-#include "rcutree.h"
-#include <trace/events/rcu.h>
-
-#include "rcu.h"
-
-/* Data structures. */
-
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
-
-#define RCU_STATE_INITIALIZER(structname) { \
- .level = { &structname##_state.node[0] }, \
- .levelcnt = { \
- NUM_RCU_LVL_0, /* root of hierarchy. */ \
- NUM_RCU_LVL_1, \
- NUM_RCU_LVL_2, \
- NUM_RCU_LVL_3, \
- NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
- }, \
- .fqs_state = RCU_GP_IDLE, \
- .gpnum = -300, \
- .completed = -300, \
- .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
- .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
- .n_force_qs = 0, \
- .n_force_qs_ngp = 0, \
- .name = #structname, \
-}
-
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
-DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
-
-static struct rcu_state *rcu_state;
-
-/*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
- * optimized synchronize_sched() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
- */
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * The rcu_scheduler_fully_active variable transitions from zero to one
- * during the early_initcall() processing, which is after the scheduler
- * is capable of creating new tasks. So RCU processing (for example,
- * creating tasks for RCU priority boosting) must be delayed until after
- * rcu_scheduler_fully_active transitions from zero to one. We also
- * currently delay invocation of any RCU callbacks until after this point.
- *
- * It might later prove better for people registering RCU callbacks during
- * early boot to take responsibility for these callbacks, but one step at
- * a time.
- */
-static int rcu_scheduler_fully_active __read_mostly;
-
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads. These
- * handle all flavors of RCU.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
-static void invoke_rcu_core(void);
-static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
-
-/*
- * Track the rcutorture test sequence number and the update version
- * number within a given test. The rcutorture_testseq is incremented
- * on every rcutorture module load and unload, so has an odd value
- * when a test is running. The rcutorture_vernum is set to zero
- * when rcutorture starts and is incremented on each rcutorture update.
- * These variables enable correlating rcutorture output with the
- * RCU tracing information.
- */
-unsigned long rcutorture_testseq;
-unsigned long rcutorture_vernum;
-
-/*
- * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
- * permit this function to be invoked without holding the root rcu_node
- * structure's ->lock, but of course results can be subject to change.
- */
-static int rcu_gp_in_progress(struct rcu_state *rsp)
-{
- return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
-}
-
-/*
- * Note a quiescent state. Because we do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period, this just sets a flag.
- * The caller must have disabled preemption.
- */
-void rcu_sched_qs(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
-
- rdp->passed_quiesce_gpnum = rdp->gpnum;
- barrier();
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
- rdp->passed_quiesce = 1;
-}
-
-void rcu_bh_qs(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-
- rdp->passed_quiesce_gpnum = rdp->gpnum;
- barrier();
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
- rdp->passed_quiesce = 1;
-}
-
-/*
- * Note a context switch. This is a quiescent state for RCU-sched,
- * and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
- */
-void rcu_note_context_switch(int cpu)
-{
- trace_rcu_utilization("Start context switch");
- rcu_sched_qs(cpu);
- rcu_preempt_note_context_switch(cpu);
- trace_rcu_utilization("End context switch");
-}
-EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-
-DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_NESTING,
- .dynticks = ATOMIC_INIT(1),
-};
-
-static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
-static int qhimark = 10000; /* If this many pending, ignore blimit. */
-static int qlowmark = 100; /* Once only this many pending, use blimit. */
-
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
-
-int rcu_cpu_stall_suppress __read_mostly;
-module_param(rcu_cpu_stall_suppress, int, 0644);
-
-static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
-static int rcu_pending(int cpu);
-
-/*
- * Return the number of RCU-sched batches processed thus far for debug & stats.
- */
-long rcu_batches_completed_sched(void)
-{
- return rcu_sched_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
-
-/*
- * Return the number of RCU BH batches processed thus far for debug & stats.
- */
-long rcu_batches_completed_bh(void)
-{
- return rcu_bh_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
-/*
- * Force a quiescent state for RCU BH.
- */
-void rcu_bh_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_bh_state, 0);
-}
-EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
-
-/*
- * Record the number of times rcutorture tests have been initiated and
- * terminated. This information allows the debugfs tracing stats to be
- * correlated to the rcutorture messages, even when the rcutorture module
- * is being repeatedly loaded and unloaded. In other words, we cannot
- * store this state in rcutorture itself.
- */
-void rcutorture_record_test_transition(void)
-{
- rcutorture_testseq++;
- rcutorture_vernum = 0;
-}
-EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
-
-/*
- * Record the number of writer passes through the current rcutorture test.
- * This is also used to correlate debugfs tracing stats with the rcutorture
- * messages.
- */
-void rcutorture_record_progress(unsigned long vernum)
-{
- rcutorture_vernum++;
-}
-EXPORT_SYMBOL_GPL(rcutorture_record_progress);
-
-/*
- * Force a quiescent state for RCU-sched.
- */
-void rcu_sched_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_sched_state, 0);
-}
-EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
-
-/*
- * Does the CPU have callbacks ready to be invoked?
- */
-static int
-cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
-{
- return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
-}
-
-/*
- * Does the current CPU require a yet-as-unscheduled grace period?
- */
-static int
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
-}
-
-/*
- * Return the root node of the specified rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
-{
- return &rsp->node[0];
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * If the specified CPU is offline, tell the caller that it is in
- * a quiescent state. Otherwise, whack it with a reschedule IPI.
- * Grace periods can end up waiting on an offline CPU when that
- * CPU is in the process of coming online -- it will be added to the
- * rcu_node bitmasks before it actually makes it online. The same thing
- * can happen while a CPU is in the process of coming online. Because this
- * race is quite rare, we check for it after detecting that the grace
- * period has been delayed rather than checking each and every CPU
- * each and every time we start a new grace period.
- */
-static int rcu_implicit_offline_qs(struct rcu_data *rdp)
-{
- /*
- * If the CPU is offline, it is in a quiescent state. We can
- * trust its state not to change because interrupts are disabled.
- */
- if (cpu_is_offline(rdp->cpu)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
- rdp->offline_fqs++;
- return 1;
- }
-
- /*
- * The CPU is online, so send it a reschedule IPI. This forces
- * it through the scheduler, and (inefficiently) also handles cases
- * where idle loops fail to inform RCU about the CPU being idle.
- */
- if (rdp->cpu != smp_processor_id())
- smp_send_reschedule(rdp->cpu);
- else
- set_need_resched();
- rdp->resched_ipi++;
- return 0;
-}
-
-#endif /* #ifdef CONFIG_SMP */
-
-/*
- * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
- *
- * If the new value of the ->dynticks_nesting counter now is zero,
- * we really have entered idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
- */
-static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
-{
- trace_rcu_dyntick("Start", oldval, 0);
- if (!is_idle_task(current)) {
- struct task_struct *idle = idle_task(smp_processor_id());
-
- trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
- rcu_prepare_for_idle(smp_processor_id());
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
-}
-
-/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * We crowbar the ->dynticks_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- */
-void rcu_idle_enter(void)
-{
- unsigned long flags;
- long long oldval;
- struct rcu_dynticks *rdtp;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting = 0;
- rcu_idle_enter_common(rdtp, oldval);
- local_irq_restore(flags);
-}
-
-/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur.
- *
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture violates this assumption, RCU will give you what you
- * deserve, good and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- */
-void rcu_irq_exit(void)
-{
- unsigned long flags;
- long long oldval;
- struct rcu_dynticks *rdtp;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
- if (rdtp->dynticks_nesting)
- trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
- else
- rcu_idle_enter_common(rdtp, oldval);
- local_irq_restore(flags);
-}
-
-/*
- * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
- *
- * If the new value of the ->dynticks_nesting counter was previously zero,
- * we really have exited idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
- */
-static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
-{
- smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic_inc(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- rcu_cleanup_after_idle(smp_processor_id());
- trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
- if (!is_idle_task(current)) {
- struct task_struct *idle = idle_task(smp_processor_id());
-
- trace_rcu_dyntick("Error on exit: not idle task",
- oldval, rdtp->dynticks_nesting);
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
-}
-
-/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
- *
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
- *
- * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
- * allow for the possibility of usermode upcalls messing up our count
- * of interrupt nesting level during the busy period that is just
- * now starting.
- */
-void rcu_idle_exit(void)
-{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
- long long oldval;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval != 0);
- rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
- rcu_idle_exit_common(rdtp, oldval);
- local_irq_restore(flags);
-}
-
-/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to
- * user mode! This code assumes that the idle loop never does upcalls to
- * user mode. If your architecture does do upcalls from the idle loop (or
- * does anything else that results in unbalanced calls to the irq_enter()
- * and irq_exit() functions), RCU will give you what you deserve, good
- * and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- */
-void rcu_irq_enter(void)
-{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
- long long oldval;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
- if (oldval)
- trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
- else
- rcu_idle_exit_common(rdtp, oldval);
- local_irq_restore(flags);
-}
-
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
- */
-void rcu_nmi_enter(void)
-{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (rdtp->dynticks_nmi_nesting == 0 &&
- (atomic_read(&rdtp->dynticks) & 0x1))
- return;
- rdtp->dynticks_nmi_nesting++;
- smp_mb__before_atomic_inc(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic_inc(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
-}
-
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
- *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
- */
-void rcu_nmi_exit(void)
-{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (rdtp->dynticks_nmi_nesting == 0 ||
- --rdtp->dynticks_nmi_nesting != 0)
- return;
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force delay to next write. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
-}
-
-#ifdef CONFIG_PROVE_RCU
-
-/**
- * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
- *
- * If the current CPU is in its idle loop and is neither in an interrupt
- * or NMI handler, return true.
- */
-int rcu_is_cpu_idle(void)
-{
- int ret;
-
- preempt_disable();
- ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
- preempt_enable();
- return ret;
-}
-EXPORT_SYMBOL(rcu_is_cpu_idle);
-
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
-/**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
- *
- * If the current CPU is idle or running at a first-level (not nested)
- * interrupt from idle, return true. The caller must have at least
- * disabled preemption.
- */
-int rcu_is_cpu_rrupt_from_idle(void)
-{
- return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * Snapshot the specified CPU's dynticks counter so that we can later
- * credit them with an implicit quiescent state. Return 1 if this CPU
- * is in dynticks idle mode, which is an extended quiescent state.
- */
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
-{
- rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
- return (rdp->dynticks_snap & 0x1) == 0;
-}
-
-/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU.
- */
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
-{
- unsigned int curr;
- unsigned int snap;
-
- curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
- snap = (unsigned int)rdp->dynticks_snap;
-
- /*
- * If the CPU passed through or entered a dynticks idle phase with
- * no active irq/NMI handlers, then we can safely pretend that the CPU
- * already acknowledged the request to pass through a quiescent
- * state. Either way, that CPU cannot possibly be in an RCU
- * read-side critical section that started before the beginning
- * of the current RCU grace period.
- */
- if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
- rdp->dynticks_fqs++;
- return 1;
- }
-
- /* Go check for the CPU being offline. */
- return rcu_implicit_offline_qs(rdp);
-}
-
-#endif /* #ifdef CONFIG_SMP */
-
-static void record_gp_stall_check_time(struct rcu_state *rsp)
-{
- rsp->gp_start = jiffies;
- rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
-}
-
-static void print_other_cpu_stall(struct rcu_state *rsp)
-{
- int cpu;
- long delta;
- unsigned long flags;
- int ndetected;
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- /* Only let one CPU complain about others per time interval. */
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - rsp->jiffies_stall;
- if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
-
- /*
- * Now rat on any tasks that got kicked up to the root rcu_node
- * due to CPU offlining.
- */
- ndetected = rcu_print_task_stall(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
- /*
- * OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
- rsp->name);
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected += rcu_print_task_stall(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (rnp->qsmask == 0)
- continue;
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu)) {
- printk(" %d", rnp->grplo + cpu);
- ndetected++;
- }
- }
- printk("} (detected by %d, t=%ld jiffies)\n",
- smp_processor_id(), (long)(jiffies - rsp->gp_start));
- if (ndetected == 0)
- printk(KERN_ERR "INFO: Stall ended before state dump start\n");
- else if (!trigger_all_cpu_backtrace())
- dump_stack();
-
- /* If so configured, complain about tasks blocking the grace period. */
-
- rcu_print_detail_task_stall(rsp);
-
- force_quiescent_state(rsp, 0); /* Kick them all. */
-}
-
-static void print_cpu_stall(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- /*
- * OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
- rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
- if (!trigger_all_cpu_backtrace())
- dump_stack();
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
- rsp->jiffies_stall =
- jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
- set_need_resched(); /* kick ourselves to get things going. */
-}
-
-static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- unsigned long j;
- unsigned long js;
- struct rcu_node *rnp;
-
- if (rcu_cpu_stall_suppress)
- return;
- j = ACCESS_ONCE(jiffies);
- js = ACCESS_ONCE(rsp->jiffies_stall);
- rnp = rdp->mynode;
- if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
-
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(rsp);
-
- } else if (rcu_gp_in_progress(rsp) &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
-
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(rsp);
- }
-}
-
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
-/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
- *
- * The caller must disable hard irqs.
- */
-void rcu_cpu_stall_reset(void)
-{
- rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_preempt_stall_reset();
-}
-
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static void __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
-}
-
-/*
- * Update CPU-local rcu_data state to record the newly noticed grace period.
- * This is used both when we started the grace period and when we notice
- * that someone else started the grace period. The caller must hold the
- * ->lock of the leaf rcu_node structure corresponding to the current CPU,
- * and must have irqs disabled.
- */
-static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
-{
- if (rdp->gpnum != rnp->gpnum) {
- /*
- * If the current grace period is waiting for this CPU,
- * set up to detect a quiescent state, otherwise don't
- * go looking for one.
- */
- rdp->gpnum = rnp->gpnum;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
- if (rnp->qsmask & rdp->grpmask) {
- rdp->qs_pending = 1;
- rdp->passed_quiesce = 0;
- } else
- rdp->qs_pending = 0;
- }
-}
-
-static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_node *rnp;
-
- local_irq_save(flags);
- rnp = rdp->mynode;
- if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
- !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
- local_irq_restore(flags);
- return;
- }
- __note_new_gpnum(rsp, rnp, rdp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
- * Did someone else start a new RCU grace period start since we last
- * checked? Update local state appropriately if so. Must be called
- * on the CPU corresponding to rdp.
- */
-static int
-check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- unsigned long flags;
- int ret = 0;
-
- local_irq_save(flags);
- if (rdp->gpnum != rsp->gpnum) {
- note_new_gpnum(rsp, rdp);
- ret = 1;
- }
- local_irq_restore(flags);
- return ret;
-}
-
-/*
- * Advance this CPU's callbacks, but only if the current grace period
- * has ended. This may be called only from the CPU to whom the rdp
- * belongs. In addition, the corresponding leaf rcu_node structure's
- * ->lock must be held by the caller, with irqs disabled.
- */
-static void
-__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
-{
- /* Did another grace period end? */
- if (rdp->completed != rnp->completed) {
-
- /* Advance callbacks. No harm if list empty. */
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
- /* Remember that we saw this grace-period completion. */
- rdp->completed = rnp->completed;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
-
- /*
- * If we were in an extended quiescent state, we may have
- * missed some grace periods that others CPUs handled on
- * our behalf. Catch up with this state to avoid noting
- * spurious new grace periods. If another grace period
- * has started, then rnp->gpnum will have advanced, so
- * we will detect this later on.
- */
- if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
- rdp->gpnum = rdp->completed;
-
- /*
- * If RCU does not need a quiescent state from this CPU,
- * then make sure that this CPU doesn't go looking for one.
- */
- if ((rnp->qsmask & rdp->grpmask) == 0)
- rdp->qs_pending = 0;
- }
-}
-
-/*
- * Advance this CPU's callbacks, but only if the current grace period
- * has ended. This may be called only from the CPU to whom the rdp
- * belongs.
- */
-static void
-rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_node *rnp;
-
- local_irq_save(flags);
- rnp = rdp->mynode;
- if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
- !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
- local_irq_restore(flags);
- return;
- }
- __rcu_process_gp_end(rsp, rnp, rdp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
- * Do per-CPU grace-period initialization for running CPU. The caller
- * must hold the lock of the leaf rcu_node structure corresponding to
- * this CPU.
- */
-static void
-rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
-{
- /* Prior grace period ended, so advance callbacks for current CPU. */
- __rcu_process_gp_end(rsp, rnp, rdp);
-
- /*
- * Because this CPU just now started the new grace period, we know
- * that all of its callbacks will be covered by this upcoming grace
- * period, even the ones that were registered arbitrarily recently.
- * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
- *
- * Other CPUs cannot be sure exactly when the grace period started.
- * Therefore, their recently registered callbacks must pass through
- * an additional RCU_NEXT_READY stage, so that they will be handled
- * by the next RCU grace period.
- */
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
- /* Set state so that this CPU will detect the next quiescent state. */
- __note_new_gpnum(rsp, rnp, rdp);
-}
-
-/*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock, which is released before return. Hard irqs must
- * be disabled.
- */
-static void
-rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
-{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- if (!rcu_scheduler_fully_active ||
- !cpu_needs_another_gp(rsp, rdp)) {
- /*
- * Either the scheduler hasn't yet spawned the first
- * non-idle task or this CPU does not need another
- * grace period. Either way, don't start a new grace
- * period.
- */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
-
- if (rsp->fqs_active) {
- /*
- * This CPU needs a grace period, but force_quiescent_state()
- * is running. Tell it to start one on this CPU's behalf.
- */
- rsp->fqs_need_gp = 1;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
-
- /* Advance to a new grace period and initialize state. */
- rsp->gpnum++;
- trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
- WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
- rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
- rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
- record_gp_stall_check_time(rsp);
-
- /* Special-case the common single-level case. */
- if (NUM_RCU_NODES == 1) {
- rcu_preempt_check_blocked_tasks(rnp);
- rnp->qsmask = rnp->qsmaskinit;
- rnp->gpnum = rsp->gpnum;
- rnp->completed = rsp->completed;
- rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
- rcu_start_gp_per_cpu(rsp, rnp, rdp);
- rcu_preempt_boost_start_gp(rnp);
- trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
- rnp->level, rnp->grplo,
- rnp->grphi, rnp->qsmask);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
-
- raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
-
-
- /* Exclude any concurrent CPU-hotplug operations. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
-
- /*
- * Set the quiescent-state-needed bits in all the rcu_node
- * structures for all currently online CPUs in breadth-first
- * order, starting from the root rcu_node structure. This
- * operation relies on the layout of the hierarchy within the
- * rsp->node[] array. Note that other CPUs will access only
- * the leaves of the hierarchy, which still indicate that no
- * grace period is in progress, at least until the corresponding
- * leaf node has been initialized. In addition, we have excluded
- * CPU-hotplug operations.
- *
- * Note that the grace period cannot complete until we finish
- * the initialization process, as there will be at least one
- * qsmask bit set in the root node until that time, namely the
- * one corresponding to this CPU, due to the fact that we have
- * irqs disabled.
- */
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rcu_preempt_check_blocked_tasks(rnp);
- rnp->qsmask = rnp->qsmaskinit;
- rnp->gpnum = rsp->gpnum;
- rnp->completed = rsp->completed;
- if (rnp == rdp->mynode)
- rcu_start_gp_per_cpu(rsp, rnp, rdp);
- rcu_preempt_boost_start_gp(rnp);
- trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
- rnp->level, rnp->grplo,
- rnp->grphi, rnp->qsmask);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
-
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-}
-
-/*
- * Report a full set of quiescent states to the specified rcu_state
- * data structure. This involves cleaning up after the prior grace
- * period and letting rcu_start_gp() start up the next grace period
- * if one is needed. Note that the caller must hold rnp->lock, as
- * required by rcu_start_gp(), which will release it.
- */
-static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
-{
- unsigned long gp_duration;
- struct rcu_node *rnp = rcu_get_root(rsp);
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-
- WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
-
- /*
- * Ensure that all grace-period and pre-grace-period activity
- * is seen before the assignment to rsp->completed.
- */
- smp_mb(); /* See above block comment. */
- gp_duration = jiffies - rsp->gp_start;
- if (gp_duration > rsp->gp_max)
- rsp->gp_max = gp_duration;
-
- /*
- * We know the grace period is complete, but to everyone else
- * it appears to still be ongoing. But it is also the case
- * that to everyone else it looks like there is nothing that
- * they can do to advance the grace period. It is therefore
- * safe for us to drop the lock in order to mark the grace
- * period as completed in all of the rcu_node structures.
- *
- * But if this CPU needs another grace period, it will take
- * care of this while initializing the next grace period.
- * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
- * because the callbacks have not yet been advanced: Those
- * callbacks are waiting on the grace period that just now
- * completed.
- */
- if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-
- /*
- * Propagate new ->completed value to rcu_node structures
- * so that other CPUs don't have to wait until the start
- * of the next grace period to process their callbacks.
- */
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->completed = rsp->gpnum;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- }
-
- rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
- trace_rcu_grace_period(rsp->name, rsp->completed, "end");
- rsp->fqs_state = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
-}
-
-/*
- * Similar to rcu_report_qs_rdp(), for which it is a helper function.
- * Allows quiescent states for a group of CPUs to be reported at one go
- * to the specified rcu_node structure, though all the CPUs in the group
- * must be represented by the same rcu_node structure (which need not be
- * a leaf rcu_node structure, though it often will be). That structure's
- * lock must be held upon entry, and it is released before return.
- */
-static void
-rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
- struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
-{
- struct rcu_node *rnp_c;
-
- /* Walk up the rcu_node hierarchy. */
- for (;;) {
- if (!(rnp->qsmask & mask)) {
-
- /* Our bit has already been cleared, so done. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- rnp->qsmask &= ~mask;
- trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
- mask, rnp->qsmask, rnp->level,
- rnp->grplo, rnp->grphi,
- !!rnp->gp_tasks);
- if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
-
- /* Other bits still set at this level, so done. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- mask = rnp->grpmask;
- if (rnp->parent == NULL) {
-
- /* No more levels. Exit loop holding root lock. */
-
- break;
- }
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rnp_c = rnp;
- rnp = rnp->parent;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- WARN_ON_ONCE(rnp_c->qsmask);
- }
-
- /*
- * Get here if we are the last CPU to pass through a quiescent
- * state for this grace period. Invoke rcu_report_qs_rsp()
- * to clean up and start the next grace period if one is needed.
- */
- rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
-}
-
-/*
- * Record a quiescent state for the specified CPU to that CPU's rcu_data
- * structure. This must be either called from the specified CPU, or
- * called when the specified CPU is known to be offline (and when it is
- * also known that no other CPU is concurrently trying to help the offline
- * CPU). The lastcomp argument is used to make sure we are still in the
- * grace period of interest. We don't want to end the current grace period
- * based on quiescent states detected in an earlier grace period!
- */
-static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
-{
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp;
-
- rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
-
- /*
- * The grace period in which this quiescent state was
- * recorded has ended, so don't report it upwards.
- * We will instead need a new quiescent state that lies
- * within the current grace period.
- */
- rdp->passed_quiesce = 0; /* need qs for new gp. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- mask = rdp->grpmask;
- if ((rnp->qsmask & mask) == 0) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- } else {
- rdp->qs_pending = 0;
-
- /*
- * This GP can't end until cpu checks in, so all of our
- * callbacks can be processed during the next GP.
- */
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
- rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
- }
-}
-
-/*
- * Check to see if there is a new grace period of which this CPU
- * is not yet aware, and if so, set up local rcu_data state for it.
- * Otherwise, see if this CPU has just passed through its first
- * quiescent state for this grace period, and record that fact if so.
- */
-static void
-rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- /* If there is now a new grace period, record and return. */
- if (check_for_new_grace_period(rsp, rdp))
- return;
-
- /*
- * Does this CPU still need to do its part for current grace period?
- * If no, return and let the other CPUs do their part as well.
- */
- if (!rdp->qs_pending)
- return;
-
- /*
- * Was there a quiescent state since the beginning of the grace
- * period? If no, then exit and wait for the next call.
- */
- if (!rdp->passed_quiesce)
- return;
-
- /*
- * Tell RCU we are done (but rcu_report_qs_rdp() will be the
- * judge of that).
- */
- rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Synchronization is not required because this function executes
- * in stop_machine() context.
- */
-static void rcu_send_cbs_to_online(struct rcu_state *rsp)
-{
- int i;
- /* current DYING CPU is cleared in the cpu_online_mask */
- int receive_cpu = cpumask_any(cpu_online_mask);
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
-
- if (rdp->nxtlist == NULL)
- return; /* irqs disabled, so comparison is stable. */
-
- *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
- receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- receive_rdp->qlen += rdp->qlen;
- receive_rdp->n_cbs_adopted += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
-
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
- rdp->qlen = 0;
-}
-
-/*
- * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
- * and move all callbacks from the outgoing CPU to the current one.
- * There can only be one CPU hotplug operation at a time, so no other
- * CPU can be attempting to update rcu_cpu_kthread_task.
- */
-static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long mask;
- int need_report = 0;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp;
-
- rcu_stop_cpu_kthread(cpu);
-
- /* Exclude any attempts to start a new grace period. */
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
-
- /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
- rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
- mask = rdp->grpmask; /* rnp->grplo is constant. */
- do {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->qsmaskinit &= ~mask;
- if (rnp->qsmaskinit != 0) {
- if (rnp != rdp->mynode)
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- else
- trace_rcu_grace_period(rsp->name,
- rnp->gpnum + 1 -
- !!(rnp->qsmask & mask),
- "cpuofl");
- break;
- }
- if (rnp == rdp->mynode) {
- trace_rcu_grace_period(rsp->name,
- rnp->gpnum + 1 -
- !!(rnp->qsmask & mask),
- "cpuofl");
- need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
- } else
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- mask = rnp->grpmask;
- rnp = rnp->parent;
- } while (rnp != NULL);
-
- /*
- * We still hold the leaf rcu_node structure lock here, and
- * irqs are still disabled. The reason for this subterfuge is
- * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
- * held leads to deadlock.
- */
- raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
- rnp = rdp->mynode;
- if (need_report & RCU_OFL_TASKS_NORM_GP)
- rcu_report_unblock_qs_rnp(rnp, flags);
- else
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (need_report & RCU_OFL_TASKS_EXP_GP)
- rcu_report_exp_rnp(rsp, rnp, true);
- rcu_node_kthread_setaffinity(rnp, -1);
-}
-
-/*
- * Remove the specified CPU from the RCU hierarchy and move any pending
- * callbacks that it might have to the current CPU. This code assumes
- * that at least one CPU in the system will remain running at all times.
- * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
- */
-static void rcu_offline_cpu(int cpu)
-{
- __rcu_offline_cpu(cpu, &rcu_sched_state);
- __rcu_offline_cpu(cpu, &rcu_bh_state);
- rcu_preempt_offline_cpu(cpu);
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_send_cbs_to_online(struct rcu_state *rsp)
-{
-}
-
-static void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
- * Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
- */
-static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_head *next, *list, **tail;
- int bl, count;
-
- /* If no callbacks are ready, just return.*/
- if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
- trace_rcu_batch_start(rsp->name, 0, 0);
- trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
- need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
- return;
- }
-
- /*
- * Extract the list of ready callbacks, disabling to prevent
- * races with call_rcu() from interrupt handlers.
- */
- local_irq_save(flags);
- bl = rdp->blimit;
- trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
- list = rdp->nxtlist;
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- tail = rdp->nxttail[RCU_DONE_TAIL];
- for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
- if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[count] = &rdp->nxtlist;
- local_irq_restore(flags);
-
- /* Invoke callbacks. */
- count = 0;
- while (list) {
- next = list->next;
- prefetch(next);
- debug_rcu_head_unqueue(list);
- __rcu_reclaim(rsp->name, list);
- list = next;
- /* Stop only if limit reached and CPU has something to do. */
- if (++count >= bl &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- }
-
- local_irq_save(flags);
- trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
- is_idle_task(current),
- rcu_is_callbacks_kthread());
-
- /* Update count, and requeue any remaining callbacks. */
- rdp->qlen -= count;
- rdp->n_cbs_invoked += count;
- if (list != NULL) {
- *tail = rdp->nxtlist;
- rdp->nxtlist = list;
- for (count = 0; count < RCU_NEXT_SIZE; count++)
- if (&rdp->nxtlist == rdp->nxttail[count])
- rdp->nxttail[count] = tail;
- else
- break;
- }
-
- /* Reinstate batch limit if we have worked down the excess. */
- if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
- rdp->blimit = blimit;
-
- /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
- if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
- rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rsp->n_force_qs;
- } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
- rdp->qlen_last_fqs_check = rdp->qlen;
-
- local_irq_restore(flags);
-
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
- invoke_rcu_core();
-}
-
-/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt. If rcu_pending returns
- * false, there is no point in invoking rcu_check_callbacks().
- */
-void rcu_check_callbacks(int cpu, int user)
-{
- trace_rcu_utilization("Start scheduler-tick");
- if (user || rcu_is_cpu_rrupt_from_idle()) {
-
- /*
- * Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
- *
- * No memory barrier is required here because both
- * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
- * variables that other CPUs neither access nor modify,
- * at least not while the corresponding CPU is online.
- */
-
- rcu_sched_qs(cpu);
- rcu_bh_qs(cpu);
-
- } else if (!in_softirq()) {
-
- /*
- * Get here if this CPU did not take its interrupt from
- * softirq, in other words, if it is not interrupting
- * a rcu_bh read-side critical section. This is an _bh
- * critical section, so note it.
- */
-
- rcu_bh_qs(cpu);
- }
- rcu_preempt_check_callbacks(cpu);
- if (rcu_pending(cpu))
- invoke_rcu_core();
- trace_rcu_utilization("End scheduler-tick");
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * Scan the leaf rcu_node structures, processing dyntick state for any that
- * have not yet encountered a quiescent state, using the function specified.
- * Also initiate boosting for any threads blocked on the root rcu_node.
- *
- * The caller must have suppressed start of new grace periods.
- */
-static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
-{
- unsigned long bit;
- int cpu;
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rsp, rnp) {
- mask = 0;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (!rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- if (rnp->qsmask == 0) {
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- continue;
- }
- cpu = rnp->grplo;
- bit = 1;
- for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
- if ((rnp->qsmask & bit) != 0 &&
- f(per_cpu_ptr(rsp->rda, cpu)))
- mask |= bit;
- }
- if (mask != 0) {
-
- /* rcu_report_qs_rnp() releases rnp->lock. */
- rcu_report_qs_rnp(mask, rsp, rnp, flags);
- continue;
- }
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
- rnp = rcu_get_root(rsp);
- if (rnp->qsmask == 0) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- }
-}
-
-/*
- * Force quiescent states on reluctant CPUs, and also detect which
- * CPUs are in dyntick-idle mode.
- */
-static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
-{
- unsigned long flags;
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- trace_rcu_utilization("Start fqs");
- if (!rcu_gp_in_progress(rsp)) {
- trace_rcu_utilization("End fqs");
- return; /* No grace period in progress, nothing to force. */
- }
- if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
- rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
- trace_rcu_utilization("End fqs");
- return; /* Someone else is already on the job. */
- }
- if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
- goto unlock_fqs_ret; /* no emergency and done recently. */
- rsp->n_force_qs++;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
- if(!rcu_gp_in_progress(rsp)) {
- rsp->n_force_qs_ngp++;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- goto unlock_fqs_ret; /* no GP in progress, time updated. */
- }
- rsp->fqs_active = 1;
- switch (rsp->fqs_state) {
- case RCU_GP_IDLE:
- case RCU_GP_INIT:
-
- break; /* grace period idle or initializing, ignore. */
-
- case RCU_SAVE_DYNTICK:
- if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
- break; /* So gcc recognizes the dead code. */
-
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
-
- /* Record dyntick-idle state. */
- force_qs_rnp(rsp, dyntick_save_progress_counter);
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- if (rcu_gp_in_progress(rsp))
- rsp->fqs_state = RCU_FORCE_QS;
- break;
-
- case RCU_FORCE_QS:
-
- /* Check dyntick-idle state, send IPI to laggarts. */
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
-
- /* Leave state in case more forcing is required. */
-
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- break;
- }
- rsp->fqs_active = 0;
- if (rsp->fqs_need_gp) {
- raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
- rsp->fqs_need_gp = 0;
- rcu_start_gp(rsp, flags); /* releases rnp->lock */
- trace_rcu_utilization("End fqs");
- return;
- }
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
-unlock_fqs_ret:
- raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
- trace_rcu_utilization("End fqs");
-}
-
-#else /* #ifdef CONFIG_SMP */
-
-static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
-{
- set_need_resched();
-}
-
-#endif /* #else #ifdef CONFIG_SMP */
-
-/*
- * This does the RCU core processing work for the specified rcu_state
- * and rcu_data structures. This may be called only from the CPU to
- * whom the rdp belongs.
- */
-static void
-__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- unsigned long flags;
-
- WARN_ON_ONCE(rdp->beenonline == 0);
-
- /*
- * If an RCU GP has gone long enough, go check for dyntick
- * idle CPUs and, if needed, send resched IPIs.
- */
- if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
- force_quiescent_state(rsp, 1);
-
- /*
- * Advance callbacks in response to end of earlier grace
- * period that some other CPU ended.
- */
- rcu_process_gp_end(rsp, rdp);
-
- /* Update RCU state based on any recent quiescent states. */
- rcu_check_quiescent_state(rsp, rdp);
-
- /* Does this CPU require a not-yet-started grace period? */
- if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
- rcu_start_gp(rsp, flags); /* releases above lock */
- }
-
- /* If there are callbacks ready, invoke them. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
- invoke_rcu_callbacks(rsp, rdp);
-}
-
-/*
- * Do RCU core processing for the current CPU.
- */
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
- trace_rcu_utilization("Start RCU core");
- __rcu_process_callbacks(&rcu_sched_state,
- &__get_cpu_var(rcu_sched_data));
- __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
- rcu_preempt_process_callbacks();
- trace_rcu_utilization("End RCU core");
-}
-
-/*
- * Schedule RCU callback invocation. If the specified type of RCU
- * does not support RCU priority boosting, just do a direct call,
- * otherwise wake up the per-CPU kernel kthread. Note that because we
- * are running on the current CPU with interrupts disabled, the
- * rcu_cpu_kthread_task cannot disappear out from under us.
- */
-static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
- return;
- if (likely(!rsp->boost)) {
- rcu_do_batch(rsp, rdp);
- return;
- }
- invoke_rcu_callbacks_kthread();
-}
-
-static void invoke_rcu_core(void)
-{
- raise_softirq(RCU_SOFTIRQ);
-}
-
-static void
-__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
- struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- debug_rcu_head_queue(head);
- head->func = func;
- head->next = NULL;
-
- smp_mb(); /* Ensure RCU update seen before callback registry. */
-
- /*
- * Opportunistically note grace-period endings and beginnings.
- * Note that we might see a beginning right after we see an
- * end, but never vice versa, since this CPU has to pass through
- * a quiescent state betweentimes.
- */
- local_irq_save(flags);
- rdp = this_cpu_ptr(rsp->rda);
-
- /* Add the callback to our list. */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
- rdp->qlen++;
-
- if (__is_kfree_rcu_offset((unsigned long)func))
- trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
- rdp->qlen);
- else
- trace_rcu_callback(rsp->name, head, rdp->qlen);
-
- /* If interrupts were disabled, don't dive into RCU core. */
- if (irqs_disabled_flags(flags)) {
- local_irq_restore(flags);
- return;
- }
-
- /*
- * Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
- * if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
- * is the only one waiting for a grace period to complete.
- */
- if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-
- /* Are we ignoring a completed grace period? */
- rcu_process_gp_end(rsp, rdp);
- check_for_new_grace_period(rsp, rdp);
-
- /* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
- } else {
- /* Give the grace period a kick. */
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
- }
- } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
- force_quiescent_state(rsp, 1);
- local_irq_restore(flags);
-}
-
-/*
- * Queue an RCU-sched callback for invocation after a grace period.
- */
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_sched_state);
-}
-EXPORT_SYMBOL_GPL(call_rcu_sched);
-
-/*
- * Queue an RCU for invocation after a quicker grace period.
- */
-void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_bh_state);
-}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-/**
- * synchronize_sched - wait until an rcu-sched grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu-sched
- * grace period has elapsed, in other words after all currently executing
- * rcu-sched read-side critical sections have completed. These read-side
- * critical sections are delimited by rcu_read_lock_sched() and
- * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
- * local_irq_disable(), and so on may be used in place of
- * rcu_read_lock_sched().
- *
- * This means that all preempt_disable code sequences, including NMI and
- * hardware-interrupt handlers, in progress on entry will have completed
- * before this primitive returns. However, this does not guarantee that
- * softirq handlers will have completed, since in some kernels, these
- * handlers can run in process context, and can block.
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API. In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
- */
-void synchronize_sched(void)
-{
- if (rcu_blocking_is_gp())
- return;
- wait_rcu_gp(call_rcu_sched);
-}
-EXPORT_SYMBOL_GPL(synchronize_sched);
-
-/**
- * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu_bh grace
- * period has elapsed, in other words after all currently executing rcu_bh
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
- * and may be nested.
- */
-void synchronize_rcu_bh(void)
-{
- if (rcu_blocking_is_gp())
- return;
- wait_rcu_gp(call_rcu_bh);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, for the specified type of RCU, returning 1 if so.
- * The checks are in order of increasing expense: checks that can be
- * carried out against CPU-local state are performed first. However,
- * we must check for CPU stalls first, else we might not get a chance.
- */
-static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- rdp->n_rcu_pending++;
-
- /* Check for CPU stalls, if enabled. */
- check_cpu_stall(rsp, rdp);
-
- /* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rcu_scheduler_fully_active &&
- rdp->qs_pending && !rdp->passed_quiesce) {
-
- /*
- * If force_quiescent_state() coming soon and this CPU
- * needs a quiescent state, and this is either RCU-sched
- * or RCU-bh, force a local reschedule.
- */
- rdp->n_rp_qs_pending++;
- if (!rdp->preemptible &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
- jiffies))
- set_need_resched();
- } else if (rdp->qs_pending && rdp->passed_quiesce) {
- rdp->n_rp_report_qs++;
- return 1;
- }
-
- /* Does this CPU have callbacks ready to invoke? */
- if (cpu_has_callbacks_ready_to_invoke(rdp)) {
- rdp->n_rp_cb_ready++;
- return 1;
- }
-
- /* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rdp->n_rp_cpu_needs_gp++;
- return 1;
- }
-
- /* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
- rdp->n_rp_gp_completed++;
- return 1;
- }
-
- /* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
- rdp->n_rp_gp_started++;
- return 1;
- }
-
- /* Has an RCU GP gone long enough to send resched IPIs &c? */
- if (rcu_gp_in_progress(rsp) &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
- rdp->n_rp_need_fqs++;
- return 1;
- }
-
- /* nothing to do */
- rdp->n_rp_need_nothing++;
- return 0;
-}
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so. This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
- */
-static int rcu_pending(int cpu)
-{
- return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
- __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
- rcu_preempt_pending(cpu);
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.
- */
-static int rcu_cpu_has_callbacks(int cpu)
-{
- /* RCU callbacks either ready or pending? */
- return per_cpu(rcu_sched_data, cpu).nxtlist ||
- per_cpu(rcu_bh_data, cpu).nxtlist ||
- rcu_preempt_needs_cpu(cpu);
-}
-
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
-static void rcu_barrier_callback(struct rcu_head *notused)
-{
- if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- complete(&rcu_barrier_completion);
-}
-
-/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
- */
-static void rcu_barrier_func(void *type)
-{
- int cpu = smp_processor_id();
- struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
- void (*call_rcu_func)(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
-
- atomic_inc(&rcu_barrier_cpu_count);
- call_rcu_func = type;
- call_rcu_func(head, rcu_barrier_callback);
-}
-
-/*
- * Orchestrate the specified type of RCU barrier, waiting for all
- * RCU callbacks of the specified type to complete.
- */
-static void _rcu_barrier(struct rcu_state *rsp,
- void (*call_rcu_func)(struct rcu_head *head,
- void (*func)(struct rcu_head *head)))
-{
- BUG_ON(in_interrupt());
- /* Take mutex to serialize concurrent rcu_barrier() requests. */
- mutex_lock(&rcu_barrier_mutex);
- init_completion(&rcu_barrier_completion);
- /*
- * Initialize rcu_barrier_cpu_count to 1, then invoke
- * rcu_barrier_func() on each CPU, so that each CPU also has
- * incremented rcu_barrier_cpu_count. Only then is it safe to
- * decrement rcu_barrier_cpu_count -- otherwise the first CPU
- * might complete its grace period before all of the other CPUs
- * did their increment, causing this function to return too
- * early. Note that on_each_cpu() disables irqs, which prevents
- * any CPUs from coming online or going offline until each online
- * CPU has queued its RCU-barrier callback.
- */
- atomic_set(&rcu_barrier_cpu_count, 1);
- on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
- if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- complete(&rcu_barrier_completion);
- wait_for_completion(&rcu_barrier_completion);
- mutex_unlock(&rcu_barrier_mutex);
-}
-
-/**
- * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
- */
-void rcu_barrier_bh(void)
-{
- _rcu_barrier(&rcu_bh_state, call_rcu_bh);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_bh);
-
-/**
- * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
- */
-void rcu_barrier_sched(void)
-{
- _rcu_barrier(&rcu_sched_state, call_rcu_sched);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-
-/*
- * Do boot-time initialization of a CPU's per-CPU RCU data.
- */
-static void __init
-rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
-{
- unsigned long flags;
- int i;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- /* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
- rdp->qlen = 0;
- rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
- WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
- WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
- rdp->cpu = cpu;
- rdp->rsp = rsp;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
- * Initialize a CPU's per-CPU RCU data. Note that only one online or
- * offline event can be happening at a given time. Note also that we
- * can accept some slop in the rsp->completed access due to the fact
- * that this CPU cannot possibly have any RCU callbacks in flight yet.
- */
-static void __cpuinit
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
-{
- unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- /* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rdp->beenonline = 1; /* We have now been online. */
- rdp->preemptible = preemptible;
- rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->blimit = blimit;
- rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
- atomic_set(&rdp->dynticks->dynticks,
- (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
- rcu_prepare_for_idle_init(cpu);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-
- /*
- * A new grace period might start here. If so, we won't be part
- * of it, but that is OK, as we are currently in a quiescent state.
- */
-
- /* Exclude any attempts to start a new GP on large systems. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
-
- /* Add CPU to rcu_node bitmasks. */
- rnp = rdp->mynode;
- mask = rdp->grpmask;
- do {
- /* Exclude any attempts to start a new GP on small systems. */
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->qsmaskinit |= mask;
- mask = rnp->grpmask;
- if (rnp == rdp->mynode) {
- /*
- * If there is a grace period in progress, we will
- * set up to wait for it next time we run the
- * RCU core code.
- */
- rdp->gpnum = rnp->completed;
- rdp->completed = rnp->completed;
- rdp->passed_quiesce = 0;
- rdp->qs_pending = 0;
- rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
- }
- raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
- rnp = rnp->parent;
- } while (rnp != NULL && !(rnp->qsmaskinit & mask));
-
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-}
-
-static void __cpuinit rcu_prepare_cpu(int cpu)
-{
- rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
- rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
- rcu_preempt_init_percpu_data(cpu);
-}
-
-/*
- * Handle CPU online/offline notification events.
- */
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- trace_rcu_utilization("Start CPU hotplug");
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_prepare_cpu(cpu);
- rcu_prepare_kthreads(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- rcu_node_kthread_setaffinity(rnp, -1);
- rcu_cpu_kthread_setrt(cpu, 1);
- break;
- case CPU_DOWN_PREPARE:
- rcu_node_kthread_setaffinity(rnp, cpu);
- rcu_cpu_kthread_setrt(cpu, 0);
- break;
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- /*
- * The whole machine is "stopped" except this CPU, so we can
- * touch any data without introducing corruption. We send the
- * dying CPU's callbacks to an arbitrarily chosen online CPU.
- */
- rcu_send_cbs_to_online(&rcu_bh_state);
- rcu_send_cbs_to_online(&rcu_sched_state);
- rcu_preempt_send_cbs_to_online();
- rcu_cleanup_after_idle(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- rcu_offline_cpu(cpu);
- break;
- default:
- break;
- }
- trace_rcu_utilization("End CPU hotplug");
- return NOTIFY_OK;
-}
-
-/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
- */
-void rcu_scheduler_starting(void)
-{
- WARN_ON(num_online_cpus() != 1);
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
-}
-
-/*
- * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
- */
-#ifdef CONFIG_RCU_FANOUT_EXACT
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
- int i;
-
- for (i = NUM_RCU_LVLS - 1; i > 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = RCU_FANOUT_LEAF;
-}
-#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
- int ccur;
- int cprv;
- int i;
-
- cprv = NR_CPUS;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
- }
-}
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
-
-/*
- * Helper function for rcu_init() that initializes one rcu_state structure.
- */
-static void __init rcu_init_one(struct rcu_state *rsp,
- struct rcu_data __percpu *rda)
-{
- static char *buf[] = { "rcu_node_level_0",
- "rcu_node_level_1",
- "rcu_node_level_2",
- "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
- int cpustride = 1;
- int i;
- int j;
- struct rcu_node *rnp;
-
- BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
-
- /* Initialize the level-tracking arrays. */
-
- for (i = 1; i < NUM_RCU_LVLS; i++)
- rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
- rcu_init_levelspread(rsp);
-
- /* Initialize the elements themselves, starting from the leaves. */
-
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
- cpustride *= rsp->levelspread[i];
- rnp = rsp->level[i];
- for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
- raw_spin_lock_init(&rnp->lock);
- lockdep_set_class_and_name(&rnp->lock,
- &rcu_node_class[i], buf[i]);
- rnp->gpnum = 0;
- rnp->qsmask = 0;
- rnp->qsmaskinit = 0;
- rnp->grplo = j * cpustride;
- rnp->grphi = (j + 1) * cpustride - 1;
- if (rnp->grphi >= NR_CPUS)
- rnp->grphi = NR_CPUS - 1;
- if (i == 0) {
- rnp->grpnum = 0;
- rnp->grpmask = 0;
- rnp->parent = NULL;
- } else {
- rnp->grpnum = j % rsp->levelspread[i - 1];
- rnp->grpmask = 1UL << rnp->grpnum;
- rnp->parent = rsp->level[i - 1] +
- j / rsp->levelspread[i - 1];
- }
- rnp->level = i;
- INIT_LIST_HEAD(&rnp->blkd_tasks);
- }
- }
-
- rsp->rda = rda;
- rnp = rsp->level[NUM_RCU_LVLS - 1];
- for_each_possible_cpu(i) {
- while (i > rnp->grphi)
- rnp++;
- per_cpu_ptr(rsp->rda, i)->mynode = rnp;
- rcu_boot_init_percpu_data(i, rsp);
- }
-}
-
-void __init rcu_init(void)
-{
- int cpu;
-
- rcu_bootup_announce();
- rcu_init_one(&rcu_sched_state, &rcu_sched_data);
- rcu_init_one(&rcu_bh_state, &rcu_bh_data);
- __rcu_init_preempt();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-
- /*
- * We don't need protection against CPU-hotplug here because
- * this is called early in boot, before either interrupts
- * or the scheduler are operational.
- */
- cpu_notifier(rcu_cpu_notify, 0);
- for_each_online_cpu(cpu)
- rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- check_cpu_stall_init();
-}
-
-#include "rcutree_plugin.h"
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
deleted file mode 100644
index 8bb35d73e1f..00000000000
--- a/kernel/rcutree_plugin.h
+++ /dev/null
@@ -1,2199 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright Red Hat, 2009
- * Copyright IBM Corporation, 2009
- *
- * Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#include <linux/delay.h>
-#include <linux/stop_machine.h>
-
-#define RCU_KTHREAD_PRIO 1
-
-#ifdef CONFIG_RCU_BOOST
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else
-#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
-#endif
-
-/*
- * Check the RCU kernel configuration parameters and print informative
- * messages about anything out of the ordinary. If you like #ifdef, you
- * will love this function.
- */
-static void __init rcu_bootup_announce_oddness(void)
-{
-#ifdef CONFIG_RCU_TRACE
- printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
-#endif
-#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
- printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
-#endif
-#ifdef CONFIG_RCU_FANOUT_EXACT
- printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
-#endif
-#ifdef CONFIG_RCU_FAST_NO_HZ
- printk(KERN_INFO
- "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-#endif
-#ifdef CONFIG_PROVE_RCU
- printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
-#endif
-#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
- printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
-#endif
-#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
- printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
-#endif
-#if NUM_RCU_LVL_4 != 0
- printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
-#endif
-}
-
-#ifdef CONFIG_TREE_PREEMPT_RCU
-
-struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
-DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
-static struct rcu_state *rcu_state = &rcu_preempt_state;
-
-static void rcu_read_unlock_special(struct task_struct *t);
-static int rcu_preempted_readers_exp(struct rcu_node *rnp);
-
-/*
- * Tell them what RCU they are running.
- */
-static void __init rcu_bootup_announce(void)
-{
- printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
- rcu_bootup_announce_oddness();
-}
-
-/*
- * Return the number of RCU-preempt batches processed thus far
- * for debug and statistics.
- */
-long rcu_batches_completed_preempt(void)
-{
- return rcu_preempt_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
-
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_preempt();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
- * Force a quiescent state for preemptible RCU.
- */
-void rcu_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_preempt_state, 0);
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
- * Record a preemptible-RCU quiescent state for the specified CPU. Note
- * that this just means that the task currently running on the CPU is
- * not in a quiescent state. There might be any number of tasks blocked
- * while in an RCU read-side critical section.
- *
- * Unlike the other rcu_*_qs() functions, callers to this function
- * must disable irqs in order to protect the assignment to
- * ->rcu_read_unlock_special.
- */
-static void rcu_preempt_qs(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
- rdp->passed_quiesce_gpnum = rdp->gpnum;
- barrier();
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
- rdp->passed_quiesce = 1;
- current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
-}
-
-/*
- * We have entered the scheduler, and the current task might soon be
- * context-switched away from. If this task is in an RCU read-side
- * critical section, we will no longer be able to rely on the CPU to
- * record that fact, so we enqueue the task on the blkd_tasks list.
- * The task will dequeue itself when it exits the outermost enclosing
- * RCU read-side critical section. Therefore, the current grace period
- * cannot be permitted to complete until the blkd_tasks list entries
- * predating the current grace period drain, in other words, until
- * rnp->gp_tasks becomes NULL.
- *
- * Caller must disable preemption.
- */
-static void rcu_preempt_note_context_switch(int cpu)
-{
- struct task_struct *t = current;
- unsigned long flags;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- if (t->rcu_read_lock_nesting > 0 &&
- (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
-
- /* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
- rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
- t->rcu_blocked_node = rnp;
-
- /*
- * If this CPU has already checked in, then this task
- * will hold up the next grace period rather than the
- * current grace period. Queue the task accordingly.
- * If the task is queued for the current grace period
- * (i.e., this CPU has not yet passed through a quiescent
- * state for the current grace period), then as long
- * as that task remains queued, the current grace period
- * cannot end. Note that there is some uncertainty as
- * to exactly when the current grace period started.
- * We take a conservative approach, which can result
- * in unnecessarily waiting on tasks that started very
- * slightly after the current grace period began. C'est
- * la vie!!!
- *
- * But first, note that the current CPU must still be
- * on line!
- */
- WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
- WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
- if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
- list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
- rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
- if (rnp->boost_tasks != NULL)
- rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- } else {
- list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
- if (rnp->qsmask & rdp->grpmask)
- rnp->gp_tasks = &t->rcu_node_entry;
- }
- trace_rcu_preempt_task(rdp->rsp->name,
- t->pid,
- (rnp->qsmask & rdp->grpmask)
- ? rnp->gpnum
- : rnp->gpnum + 1);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- } else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special) {
-
- /*
- * Complete exit from RCU read-side critical section on
- * behalf of preempted instance of __rcu_read_unlock().
- */
- rcu_read_unlock_special(t);
- }
-
- /*
- * Either we were not in an RCU read-side critical section to
- * begin with, or we have now recorded that critical section
- * globally. Either way, we can now note a quiescent state
- * for this CPU. Again, if we were in an RCU read-side critical
- * section, and if that critical section was blocking the current
- * grace period, then the fact that the task has been enqueued
- * means that we continue to block the current grace period.
- */
- local_irq_save(flags);
- rcu_preempt_qs(cpu);
- local_irq_restore(flags);
-}
-
-/*
- * Tree-preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Check for preempted RCU readers blocking the current grace period
- * for the specified rcu_node structure. If the caller needs a reliable
- * answer, it must hold the rcu_node's ->lock.
- */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
-{
- return rnp->gp_tasks != NULL;
-}
-
-/*
- * Record a quiescent state for all tasks that were previously queued
- * on the specified rcu_node structure and that were blocking the current
- * RCU grace period. The caller must hold the specified rnp->lock with
- * irqs disabled, and this lock is released upon return, but irqs remain
- * disabled.
- */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
-{
- unsigned long mask;
- struct rcu_node *rnp_p;
-
- if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return; /* Still need more quiescent states! */
- }
-
- rnp_p = rnp->parent;
- if (rnp_p == NULL) {
- /*
- * Either there is only one rcu_node in the tree,
- * or tasks were kicked up to root rcu_node due to
- * CPUs going offline.
- */
- rcu_report_qs_rsp(&rcu_preempt_state, flags);
- return;
- }
-
- /* Report up the rest of the hierarchy. */
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
-}
-
-/*
- * Advance a ->blkd_tasks-list pointer to the next entry, instead
- * returning NULL if at the end of the list.
- */
-static struct list_head *rcu_next_node_entry(struct task_struct *t,
- struct rcu_node *rnp)
-{
- struct list_head *np;
-
- np = t->rcu_node_entry.next;
- if (np == &rnp->blkd_tasks)
- np = NULL;
- return np;
-}
-
-/*
- * Handle special cases during rcu_read_unlock(), such as needing to
- * notify RCU core processing or task having blocked during the RCU
- * read-side critical section.
- */
-static noinline void rcu_read_unlock_special(struct task_struct *t)
-{
- int empty;
- int empty_exp;
- int empty_exp_now;
- unsigned long flags;
- struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rbmp = NULL;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- struct rcu_node *rnp;
- int special;
-
- /* NMI handlers cannot block and cannot safely manipulate state. */
- if (in_nmi())
- return;
-
- local_irq_save(flags);
-
- /*
- * If RCU core is waiting for this CPU to exit critical section,
- * let it know that we have done so.
- */
- special = t->rcu_read_unlock_special;
- if (special & RCU_READ_UNLOCK_NEED_QS) {
- rcu_preempt_qs(smp_processor_id());
- }
-
- /* Hardware IRQ handlers cannot block. */
- if (in_irq() || in_serving_softirq()) {
- local_irq_restore(flags);
- return;
- }
-
- /* Clean up if blocked during RCU read-side critical section. */
- if (special & RCU_READ_UNLOCK_BLOCKED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
-
- /*
- * Remove this task from the list it blocked on. The
- * task can migrate while we acquire the lock, but at
- * most one time. So at most two passes through loop.
- */
- for (;;) {
- rnp = t->rcu_blocked_node;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- if (rnp == t->rcu_blocked_node)
- break;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
- empty = !rcu_preempt_blocked_readers_cgp(rnp);
- empty_exp = !rcu_preempted_readers_exp(rnp);
- smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
- np = rcu_next_node_entry(t, rnp);
- list_del_init(&t->rcu_node_entry);
- t->rcu_blocked_node = NULL;
- trace_rcu_unlock_preempted_task("rcu_preempt",
- rnp->gpnum, t->pid);
- if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp->gp_tasks = np;
- if (&t->rcu_node_entry == rnp->exp_tasks)
- rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
- /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
- if (t->rcu_boost_mutex) {
- rbmp = t->rcu_boost_mutex;
- t->rcu_boost_mutex = NULL;
- }
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- /*
- * If this was the last task on the current list, and if
- * we aren't waiting on any CPUs, report the quiescent state.
- * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
- * so we must take a snapshot of the expedited state.
- */
- empty_exp_now = !rcu_preempted_readers_exp(rnp);
- if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
- trace_rcu_quiescent_state_report("preempt_rcu",
- rnp->gpnum,
- 0, rnp->qsmask,
- rnp->level,
- rnp->grplo,
- rnp->grphi,
- !!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(rnp, flags);
- } else
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
-#ifdef CONFIG_RCU_BOOST
- /* Unboost if we were boosted. */
- if (rbmp)
- rt_mutex_unlock(rbmp);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- /*
- * If this was the last task on the expedited lists,
- * then we need to report up the rcu_node hierarchy.
- */
- if (!empty_exp && empty_exp_now)
- rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/*
- * Tree-preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting != 1)
- --t->rcu_read_lock_nesting;
- else {
- barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
-
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
-
-/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period on the specified rcu_node structure.
- */
-static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
-{
- unsigned long flags;
- struct task_struct *t;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- t = list_entry(rnp->gp_tasks,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
- sched_show_task(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period.
- */
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- rcu_print_detail_task_stall_rnp(rnp);
- rcu_for_each_leaf_node(rsp, rnp)
- rcu_print_detail_task_stall_rnp(rnp);
-}
-
-#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return 0;
- t = list_entry(rnp->gp_tasks,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- printk(" P%d", t->pid);
- ndetected++;
- }
- return ndetected;
-}
-
-/*
- * Suppress preemptible RCU's CPU stall warnings by pushing the
- * time of the next stall-warning message comfortably far into the
- * future.
- */
-static void rcu_preempt_stall_reset(void)
-{
- rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-}
-
-/*
- * Check that the list of blocked tasks for the newly completed grace
- * period is in fact empty. It is a serious bug to complete a grace
- * period that still has RCU readers blocked! This function must be
- * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
- * must be held by the caller.
- *
- * Also, if there are blocked tasks on the list, they automatically
- * block the newly created grace period, so set up ->gp_tasks accordingly.
- */
-static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
-{
- WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (!list_empty(&rnp->blkd_tasks))
- rnp->gp_tasks = rnp->blkd_tasks.next;
- WARN_ON_ONCE(rnp->qsmask);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline. Move them up to the root
- * rcu_node. The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- struct list_head *lp;
- struct list_head *lp_root;
- int retval = 0;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
- struct task_struct *t;
-
- if (rnp == rnp_root) {
- WARN_ONCE(1, "Last CPU thought to be offlined?");
- return 0; /* Shouldn't happen: at least one CPU online. */
- }
-
- /* If we are on an internal node, complain bitterly. */
- WARN_ON_ONCE(rnp != rdp->mynode);
-
- /*
- * Move tasks up to root rcu_node. Don't try to get fancy for
- * this corner-case operation -- just put this node's tasks
- * at the head of the root node's list, and update the root node's
- * ->gp_tasks and ->exp_tasks pointers to those of this node's,
- * if non-NULL. This might result in waiting for more tasks than
- * absolutely necessary, but this is a good performance/complexity
- * tradeoff.
- */
- if (rcu_preempt_blocked_readers_cgp(rnp))
- retval |= RCU_OFL_TASKS_NORM_GP;
- if (rcu_preempted_readers_exp(rnp))
- retval |= RCU_OFL_TASKS_EXP_GP;
- lp = &rnp->blkd_tasks;
- lp_root = &rnp_root->blkd_tasks;
- while (!list_empty(lp)) {
- t = list_entry(lp->next, typeof(*t), rcu_node_entry);
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- list_del(&t->rcu_node_entry);
- t->rcu_blocked_node = rnp_root;
- list_add(&t->rcu_node_entry, lp_root);
- if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp_root->gp_tasks = rnp->gp_tasks;
- if (&t->rcu_node_entry == rnp->exp_tasks)
- rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
- }
-
-#ifdef CONFIG_RCU_BOOST
- /* In case root is being boosted and leaf is not. */
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- if (rnp_root->boost_tasks != NULL &&
- rnp_root->boost_tasks != rnp_root->gp_tasks)
- rnp_root->boost_tasks = rnp_root->gp_tasks;
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- rnp->gp_tasks = NULL;
- rnp->exp_tasks = NULL;
- return retval;
-}
-
-/*
- * Do CPU-offline processing for preemptible RCU.
- */
-static void rcu_preempt_offline_cpu(int cpu)
-{
- __rcu_offline_cpu(cpu, &rcu_preempt_state);
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
- * Check for a quiescent state from the current CPU. When a task blocks,
- * the task is recorded in the corresponding CPU's rcu_node structure,
- * which is checked elsewhere.
- *
- * Caller must disable hard irqs.
- */
-static void rcu_preempt_check_callbacks(int cpu)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0) {
- rcu_preempt_qs(cpu);
- return;
- }
- if (t->rcu_read_lock_nesting > 0 &&
- per_cpu(rcu_preempt_data, cpu).qs_pending)
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
-}
-
-/*
- * Process callbacks for preemptible RCU.
- */
-static void rcu_preempt_process_callbacks(void)
-{
- __rcu_process_callbacks(&rcu_preempt_state,
- &__get_cpu_var(rcu_preempt_data));
-}
-
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
- rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Queue a preemptible-RCU callback for invocation after a grace period.
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_preempt_state);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. Note, however, that
- * upon return from synchronize_rcu(), the caller might well be executing
- * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- */
-void synchronize_rcu(void)
-{
- if (!rcu_scheduler_active)
- return;
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
-static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
-
-/*
- * Return non-zero if there are any tasks in RCU read-side critical
- * sections blocking the current preemptible-RCU expedited grace period.
- * If there is no preemptible-RCU expedited grace period currently in
- * progress, returns zero unconditionally.
- */
-static int rcu_preempted_readers_exp(struct rcu_node *rnp)
-{
- return rnp->exp_tasks != NULL;
-}
-
-/*
- * return non-zero if there is no RCU expedited grace period in progress
- * for the specified rcu_node structure, in other words, if all CPUs and
- * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
-{
- return !rcu_preempted_readers_exp(rnp) &&
- ACCESS_ONCE(rnp->expmask) == 0;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period. This event is reported either to the rcu_node structure on
- * which the task was queued or to one of that rcu_node structure's ancestors,
- * recursively up the tree. (Calm down, calm down, we do the recursion
- * iteratively!)
- *
- * Most callers will set the "wake" flag, but the task initiating the
- * expedited grace period need not wake itself.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
- unsigned long flags;
- unsigned long mask;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- break;
- }
- if (rnp->parent == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (wake)
- wake_up(&sync_rcu_preempt_exp_wq);
- break;
- }
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- rnp = rnp->parent;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- rnp->expmask &= ~mask;
- }
-}
-
-/*
- * Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure. If there are no such
- * tasks, report it up the rcu_node hierarchy.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
- */
-static void
-sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
-{
- unsigned long flags;
- int must_wait = 0;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (list_empty(&rnp->blkd_tasks))
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- else {
- rnp->exp_tasks = rnp->blkd_tasks.next;
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- must_wait = 1;
- }
- if (!must_wait)
- rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
-}
-
-/*
- * Wait for an rcu-preempt grace period, but expedite it. The basic idea
- * is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blkd_tasks lists and wait for this list to drain.
- */
-void synchronize_rcu_expedited(void)
-{
- unsigned long flags;
- struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_preempt_state;
- long snap;
- int trycount = 0;
-
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
- smp_mb(); /* Above access cannot bleed into critical section. */
-
- /*
- * Acquire lock, falling back to synchronize_rcu() if too many
- * lock-acquisition failures. Of course, if someone does the
- * expedited grace period for us, just leave.
- */
- while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_rcu();
- return;
- }
- if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
- goto mb_ret; /* Others did our work for us. */
- }
- if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
- goto unlock_mb_ret; /* Others did our work for us. */
-
- /* force all RCU readers onto ->blkd_tasks lists. */
- synchronize_sched_expedited();
-
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
-
- /* Initialize ->expmask for all non-leaf rcu_node structures. */
- rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->expmask = rnp->qsmaskinit;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
-
- /* Snapshot current state of ->blkd_tasks lists. */
- rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init(rsp, rnp);
- if (NUM_RCU_NODES > 1)
- sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
-
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-
- /* Wait for snapshotted ->blkd_tasks lists to drain. */
- rnp = rcu_get_root(rsp);
- wait_event(sync_rcu_preempt_exp_wq,
- sync_rcu_preempt_exp_done(rnp));
-
- /* Clean up and exit. */
- smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
-mb_ret:
- smp_mb(); /* ensure subsequent action seen after grace period. */
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-/*
- * Check to see if there is any immediate preemptible-RCU-related work
- * to be done.
- */
-static int rcu_preempt_pending(int cpu)
-{
- return __rcu_pending(&rcu_preempt_state,
- &per_cpu(rcu_preempt_data, cpu));
-}
-
-/*
- * Does preemptible RCU need the CPU to stay out of dynticks mode?
- */
-static int rcu_preempt_needs_cpu(int cpu)
-{
- return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
-}
-
-/**
- * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
- */
-void rcu_barrier(void)
-{
- _rcu_barrier(&rcu_preempt_state, call_rcu);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
-/*
- * Initialize preemptible RCU's per-CPU data.
- */
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
-{
- rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
-}
-
-/*
- * Move preemptible RCU's callbacks from dying CPU to other online CPU.
- */
-static void rcu_preempt_send_cbs_to_online(void)
-{
- rcu_send_cbs_to_online(&rcu_preempt_state);
-}
-
-/*
- * Initialize preemptible RCU's state structures.
- */
-static void __init __rcu_init_preempt(void)
-{
- rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
-}
-
-/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-static struct rcu_state *rcu_state = &rcu_sched_state;
-
-/*
- * Tell them what RCU they are running.
- */
-static void __init rcu_bootup_announce(void)
-{
- printk(KERN_INFO "Hierarchical RCU implementation.\n");
- rcu_bootup_announce_oddness();
-}
-
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
- * Force a quiescent state for RCU, which, because there is no preemptible
- * RCU, becomes the same as rcu-sched.
- */
-void rcu_force_quiescent_state(void)
-{
- rcu_sched_force_quiescent_state();
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * CPUs being in quiescent states.
- */
-static void rcu_preempt_note_context_switch(int cpu)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, there are never any preempted
- * RCU readers.
- */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
-{
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Because preemptible RCU does not exist, no quieting of tasks. */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
-{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, there is no need to suppress
- * its CPU stall warnings.
- */
-static void rcu_preempt_stall_reset(void)
-{
-}
-
-/*
- * Because there is no preemptible RCU, there can be no readers blocked,
- * so there is no need to check for blocked tasks. So check only for
- * bogus qsmask values.
- */
-static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
-{
- WARN_ON_ONCE(rnp->qsmask);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, it never needs CPU-offline
- * processing.
- */
-static void rcu_preempt_offline_cpu(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to check.
- */
-static void rcu_preempt_check_callbacks(int cpu)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to process.
- */
-static void rcu_preempt_process_callbacks(void)
-{
-}
-
-/*
- * Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptible RCU does not exist, map to rcu-sched.
- */
-void synchronize_rcu_expedited(void)
-{
- synchronize_sched_expedited();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
- * Because preemptible RCU does not exist, it never has any work to do.
- */
-static int rcu_preempt_pending(int cpu)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, it never needs any CPU.
- */
-static int rcu_preempt_needs_cpu(int cpu)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, rcu_barrier() is just
- * another name for rcu_barrier_sched().
- */
-void rcu_barrier(void)
-{
- rcu_barrier_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
-/*
- * Because preemptible RCU does not exist, there is no per-CPU
- * data to initialize.
- */
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
-{
-}
-
-/*
- * Because there is no preemptible RCU, there are no callbacks to move.
- */
-static void rcu_preempt_send_cbs_to_online(void)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, it need not be initialized.
- */
-static void __init __rcu_init_preempt(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-#ifdef CONFIG_RCU_BOOST
-
-#include "rtmutex_common.h"
-
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
- if (list_empty(&rnp->blkd_tasks))
- rnp->n_balk_blkd_tasks++;
- else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
- rnp->n_balk_exp_gp_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
- rnp->n_balk_boost_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
- rnp->n_balk_notblocked++;
- else if (rnp->gp_tasks != NULL &&
- ULONG_CMP_LT(jiffies, rnp->boost_time))
- rnp->n_balk_notyet++;
- else
- rnp->n_balk_nos++;
-}
-
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Carry out RCU priority boosting on the task indicated by ->exp_tasks
- * or ->boost_tasks, advancing the pointer to the next task in the
- * ->blkd_tasks list.
- *
- * Note that irqs must be enabled: boosting the task can block.
- * Returns 1 if there are more tasks needing to be boosted.
- */
-static int rcu_boost(struct rcu_node *rnp)
-{
- unsigned long flags;
- struct rt_mutex mtx;
- struct task_struct *t;
- struct list_head *tb;
-
- if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
- return 0; /* Nothing left to boost. */
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
-
- /*
- * Recheck under the lock: all tasks in need of boosting
- * might exit their RCU read-side critical sections on their own.
- */
- if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return 0;
- }
-
- /*
- * Preferentially boost tasks blocking expedited grace periods.
- * This cannot starve the normal grace periods because a second
- * expedited grace period must boost all blocked tasks, including
- * those blocking the pre-existing normal grace period.
- */
- if (rnp->exp_tasks != NULL) {
- tb = rnp->exp_tasks;
- rnp->n_exp_boosts++;
- } else {
- tb = rnp->boost_tasks;
- rnp->n_normal_boosts++;
- }
- rnp->n_tasks_boosted++;
-
- /*
- * We boost task t by manufacturing an rt_mutex that appears to
- * be held by task t. We leave a pointer to that rt_mutex where
- * task t can find it, and task t will release the mutex when it
- * exits its outermost RCU read-side critical section. Then
- * simply acquiring this artificial rt_mutex will boost task
- * t's priority. (Thanks to tglx for suggesting this approach!)
- *
- * Note that task t must acquire rnp->lock to remove itself from
- * the ->blkd_tasks list, which it will do from exit() if from
- * nowhere else. We therefore are guaranteed that task t will
- * stay around at least until we drop rnp->lock. Note that
- * rnp->lock also resolves races between our priority boosting
- * and task t's exiting its outermost RCU read-side critical
- * section.
- */
- t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&mtx, t);
- t->rcu_boost_mutex = &mtx;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
- rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
-
- return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
- ACCESS_ONCE(rnp->boost_tasks) != NULL;
-}
-
-/*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost. We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
- invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
-/*
- * Priority-boosting kthread. One per leaf rcu_node and one for the
- * root rcu_node.
- */
-static int rcu_boost_kthread(void *arg)
-{
- struct rcu_node *rnp = (struct rcu_node *)arg;
- int spincnt = 0;
- int more2boost;
-
- trace_rcu_utilization("Start boost kthread@init");
- for (;;) {
- rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
- trace_rcu_utilization("End boost kthread@rcu_wait");
- rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
- trace_rcu_utilization("Start boost kthread@rcu_wait");
- rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
- more2boost = rcu_boost(rnp);
- if (more2boost)
- spincnt++;
- else
- spincnt = 0;
- if (spincnt > 10) {
- trace_rcu_utilization("End boost kthread@rcu_yield");
- rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
- trace_rcu_utilization("Start boost kthread@rcu_yield");
- spincnt = 0;
- }
- }
- /* NOTREACHED */
- trace_rcu_utilization("End boost kthread@notreached");
- return 0;
-}
-
-/*
- * Check to see if it is time to start boosting RCU readers that are
- * blocking the current grace period, and, if so, tell the per-rcu_node
- * kthread to start boosting them. If there is an expedited grace
- * period in progress, it is always time to boost.
- *
- * The caller must hold rnp->lock, which this function releases,
- * but irqs remain disabled. The ->boost_kthread_task is immortal,
- * so we don't need to worry about it going away.
- */
-static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
-{
- struct task_struct *t;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
- rnp->n_balk_exp_gp_tasks++;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- if (rnp->exp_tasks != NULL ||
- (rnp->gp_tasks != NULL &&
- rnp->boost_tasks == NULL &&
- rnp->qsmask == 0 &&
- ULONG_CMP_GE(jiffies, rnp->boost_time))) {
- if (rnp->exp_tasks == NULL)
- rnp->boost_tasks = rnp->gp_tasks;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- t = rnp->boost_kthread_task;
- if (t != NULL)
- wake_up_process(t);
- } else {
- rcu_initiate_boost_trace(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
-}
-
-/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task))
- wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
- local_irq_restore(flags);
-}
-
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __get_cpu_var(rcu_cpu_kthread_task) == current;
-}
-
-/*
- * Set the affinity of the boost kthread. The CPU-hotplug locks are
- * held, so no one should be messing with the existence of the boost
- * kthread.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm)
-{
- struct task_struct *t;
-
- t = rnp->boost_kthread_task;
- if (t != NULL)
- set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
-}
-
-#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
-
-/*
- * Do priority-boost accounting for the start of a new grace period.
- */
-static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
-{
- rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
-}
-
-/*
- * Create an RCU-boost kthread for the specified node if one does not
- * already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
- */
-static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index)
-{
- unsigned long flags;
- struct sched_param sp;
- struct task_struct *t;
-
- if (&rcu_preempt_state != rsp)
- return 0;
- rsp->boost = 1;
- if (rnp->boost_kthread_task != NULL)
- return 0;
- t = kthread_create(rcu_boost_kthread, (void *)rnp,
- "rcub/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->boost_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = RCU_BOOST_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
- struct task_struct *t;
-
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_kthread_do_work(void)
-{
- rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
- rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
- rcu_preempt_do_callbacks();
-}
-
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
- struct task_struct *t;
-
- t = rnp->node_kthread_task;
- if (t != NULL)
- wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
- int policy;
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
- return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
- }
- sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
- struct rcu_node *rnp = rdp->mynode;
-
- atomic_or(rdp->grpmask, &rnp->wakemask);
- invoke_rcu_node_kthread(rnp);
-}
-
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted. Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
-{
- struct sched_param sp;
- struct timer_list yield_timer;
- int prio = current->rt_priority;
-
- setup_timer_on_stack(&yield_timer, f, arg);
- mod_timer(&yield_timer, jiffies + 2);
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
- set_user_nice(current, 19);
- schedule();
- set_user_nice(current, 0);
- sp.sched_priority = prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
- del_timer(&yield_timer);
-}
-
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
- * RCU softirq used in flavors and configurations of RCU that do not
- * support RCU priority boosting.
- */
-static int rcu_cpu_kthread(void *arg)
-{
- int cpu = (int)(long)arg;
- unsigned long flags;
- int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
- char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
-
- trace_rcu_utilization("Start CPU kthread@init");
- for (;;) {
- *statusp = RCU_KTHREAD_WAITING;
- trace_rcu_utilization("End CPU kthread@rcu_wait");
- rcu_wait(*workp != 0 || kthread_should_stop());
- trace_rcu_utilization("Start CPU kthread@rcu_wait");
- local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
- local_bh_enable();
- break;
- }
- *statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
- local_irq_save(flags);
- work = *workp;
- *workp = 0;
- local_irq_restore(flags);
- if (work)
- rcu_kthread_do_work();
- local_bh_enable();
- if (*workp != 0)
- spincnt++;
- else
- spincnt = 0;
- if (spincnt > 10) {
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization("End CPU kthread@rcu_yield");
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
- trace_rcu_utilization("Start CPU kthread@rcu_yield");
- spincnt = 0;
- }
- }
- *statusp = RCU_KTHREAD_STOPPED;
- trace_rcu_utilization("End CPU kthread@term");
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online. We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods. So we bind the per-CPU kthread to its CPU only if the CPU
- * is online. If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create_on_node(rcu_cpu_kthread,
- (void *)(long)cpu,
- cpu_to_node(cpu),
- "rcuc/%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (cpu_online(cpu))
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
- return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed. We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp = (struct rcu_node *)arg;
- struct sched_param sp;
- struct task_struct *t;
-
- for (;;) {
- rnp->node_kthread_status = RCU_KTHREAD_WAITING;
- rcu_wait(atomic_read(&rnp->wakemask) != 0);
- rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- mask = atomic_xchg(&rnp->wakemask, 0);
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
- if ((mask & 0x1) == 0)
- continue;
- preempt_disable();
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (!cpu_online(cpu) || t == NULL) {
- preempt_enable();
- continue;
- }
- per_cpu(rcu_cpu_has_work, cpu) = 1;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- preempt_enable();
- }
- }
- /* NOTREACHED */
- rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
- return 0;
-}
-
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
- cpumask_var_t cm;
- int cpu;
- unsigned long mask = rnp->qsmaskinit;
-
- if (rnp->node_kthread_task == NULL)
- return;
- if (!alloc_cpumask_var(&cm, GFP_KERNEL))
- return;
- cpumask_clear(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
- if ((mask & 0x1) && cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0) {
- cpumask_setall(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
- cpumask_clear_cpu(cpu, cm);
- WARN_ON_ONCE(cpumask_weight(cm) == 0);
- }
- set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
- rcu_boost_kthread_setaffinity(rnp, cm);
- free_cpumask_var(cm);
-}
-
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held. So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
-{
- unsigned long flags;
- int rnp_index = rnp - &rsp->node[0];
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- rnp->qsmaskinit == 0)
- return 0;
- if (rnp->node_kthread_task == NULL) {
- t = kthread_create(rcu_node_kthread, (void *)rnp,
- "rcun/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->node_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = 99;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- }
- return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
-
-/*
- * Spawn all kthreads -- called as soon as the scheduler is running.
- */
-static int __init rcu_spawn_kthreads(void)
-{
- int cpu;
- struct rcu_node *rnp;
-
- rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu) {
- per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
- rnp = rcu_get_root(rcu_state);
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- if (NUM_RCU_NODES > 1) {
- rcu_for_each_leaf_node(rcu_state, rnp)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
- return 0;
-}
-early_initcall(rcu_spawn_kthreads);
-
-static void __cpuinit rcu_prepare_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
-}
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
-{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-static void invoke_rcu_callbacks_kthread(void)
-{
- WARN_ON_ONCE(1);
-}
-
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
-static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
-{
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-}
-
-static int __init rcu_scheduler_really_started(void)
-{
- rcu_scheduler_fully_active = 1;
- return 0;
-}
-early_initcall(rcu_scheduler_really_started);
-
-static void __cpuinit rcu_prepare_kthreads(int cpu)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- cond_resched();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
-static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- *
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
- */
-void synchronize_sched_expedited(void)
-{
- int firstsnap, s, snap, trycount = 0;
-
- /* Note that atomic_inc_return() implies full memory barrier. */
- firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
- get_online_cpus();
-
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
-
- /* Check to see if someone else did our work for us. */
- s = atomic_read(&sync_sched_expedited_done);
- if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
-
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We subtract
- * 1 to get the same token that the last incrementer got.
- * We retry after they started, so our grace period works
- * for them, and they started after our first try, so their
- * grace period works for us.
- */
- get_online_cpus();
- snap = atomic_read(&sync_sched_expedited_started);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
- }
-
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did beat us to the punch.
- */
- do {
- s = atomic_read(&sync_sched_expedited_done);
- if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
- smp_mb(); /* ensure test happens before caller kfree */
- break;
- }
- } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
-
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
-
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
- * any flavor of RCU.
- */
-int rcu_needs_cpu(int cpu)
-{
- return rcu_cpu_has_callbacks(cpu);
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(int cpu)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y,
- * is nothing.
- */
-static void rcu_prepare_for_idle(int cpu)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode. This is handled by a
- * state machine implemented by rcu_prepare_for_idle() below.
- *
- * The following three proprocessor symbols control this state machine:
- *
- * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
- * to satisfy RCU. Beyond this point, it is better to incur a periodic
- * scheduling-clock interrupt than to loop through the state machine
- * at full power.
- * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
- * optional if RCU does not need anything immediately from this
- * CPU, even if this CPU still has RCU callbacks queued. The first
- * times through the state machine are mandatory: we need to give
- * the state machine a chance to communicate a quiescent state
- * to the RCU core.
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- *
- * The values below work well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
-#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
-#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
-
-static DEFINE_PER_CPU(int, rcu_dyntick_drain);
-static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
-static ktime_t rcu_idle_gp_wait;
-
-/*
- * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
- * callbacks on this CPU, (2) this CPU has not yet attempted to enter
- * dyntick-idle mode, or (3) this CPU is in the process of attempting to
- * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
- * to enter dyntick-idle mode, we refuse to try to enter it. After all,
- * it is better to incur scheduling-clock interrupts than to spin
- * continuously for the same time duration!
- */
-int rcu_needs_cpu(int cpu)
-{
- /* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu))
- return 0;
- /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
- return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
-}
-
-/*
- * Timer handler used to force CPU to start pushing its remaining RCU
- * callbacks in the case where it entered dyntick-idle mode with callbacks
- * pending. The hander doesn't really need to do anything because the
- * real work is done upon re-entry to idle, or by the next scheduling-clock
- * interrupt should idle not be re-entered.
- */
-static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
-{
- trace_rcu_prep_idle("Timer");
- return HRTIMER_NORESTART;
-}
-
-/*
- * Initialize the timer used to pull CPUs out of dyntick-idle mode.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
- static int firsttime = 1;
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
-
- hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtp->function = rcu_idle_gp_timer_func;
- if (firsttime) {
- unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
-
- rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
- firsttime = 0;
- }
-}
-
-/*
- * Clean up for exit from idle. Because we are exiting from idle, there
- * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
- * do nothing if this timer is not active, so just cancel it unconditionally.
- */
-static void rcu_cleanup_after_idle(int cpu)
-{
- hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
-}
-
-/*
- * Check to see if any RCU-related work can be done by the current CPU,
- * and if so, schedule a softirq to get it done. This function is part
- * of the RCU implementation; it is -not- an exported member of the RCU API.
- *
- * The idea is for the current CPU to clear out all work required by the
- * RCU core for the current grace period, so that this CPU can be permitted
- * to enter dyntick-idle mode. In some cases, it will need to be awakened
- * at the end of the grace period by whatever CPU ends the grace period.
- * This allows CPUs to go dyntick-idle more quickly, and to reduce the
- * number of wakeups by a modest integer factor.
- *
- * Because it is not legal to invoke rcu_process_callbacks() with irqs
- * disabled, we do one pass of force_quiescent_state(), then do a
- * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
- * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(int cpu)
-{
- unsigned long flags;
-
- local_irq_save(flags);
-
- /*
- * If there are no callbacks on this CPU, enter dyntick-idle mode.
- * Also reset state to avoid prejudicing later attempts.
- */
- if (!rcu_cpu_has_callbacks(cpu)) {
- per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
- per_cpu(rcu_dyntick_drain, cpu) = 0;
- local_irq_restore(flags);
- trace_rcu_prep_idle("No callbacks");
- return;
- }
-
- /*
- * If in holdoff mode, just return. We will presumably have
- * refrained from disabling the scheduling-clock tick.
- */
- if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
- local_irq_restore(flags);
- trace_rcu_prep_idle("In holdoff");
- return;
- }
-
- /* Check and update the rcu_dyntick_drain sequencing. */
- if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
- /* First time through, initialize the counter. */
- per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
- } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
- !rcu_pending(cpu)) {
- /* Can we go dyntick-idle despite still having callbacks? */
- trace_rcu_prep_idle("Dyntick with callbacks");
- per_cpu(rcu_dyntick_drain, cpu) = 0;
- per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_gp_wait, HRTIMER_MODE_REL);
- return; /* Nothing more to do immediately. */
- } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
- /* We have hit the limit, so time to give up. */
- per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
- local_irq_restore(flags);
- trace_rcu_prep_idle("Begin holdoff");
- invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
- return;
- }
-
- /*
- * Do one step of pushing the remaining RCU callbacks through
- * the RCU core state machine.
- */
-#ifdef CONFIG_TREE_PREEMPT_RCU
- if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
- local_irq_restore(flags);
- rcu_preempt_qs(cpu);
- force_quiescent_state(&rcu_preempt_state, 0);
- local_irq_save(flags);
- }
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- if (per_cpu(rcu_sched_data, cpu).nxtlist) {
- local_irq_restore(flags);
- rcu_sched_qs(cpu);
- force_quiescent_state(&rcu_sched_state, 0);
- local_irq_save(flags);
- }
- if (per_cpu(rcu_bh_data, cpu).nxtlist) {
- local_irq_restore(flags);
- rcu_bh_qs(cpu);
- force_quiescent_state(&rcu_bh_state, 0);
- local_irq_save(flags);
- }
-
- /*
- * If RCU callbacks are still pending, RCU still needs this CPU.
- * So try forcing the callbacks through the grace period.
- */
- if (rcu_cpu_has_callbacks(cpu)) {
- local_irq_restore(flags);
- trace_rcu_prep_idle("More callbacks");
- invoke_rcu_core();
- } else {
- local_irq_restore(flags);
- trace_rcu_prep_idle("Callbacks drained");
- }
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 00000000000..a3a9e240fcd
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,433 @@
+/*
+ * linux/kernel/reboot.c
+ *
+ * Copyright (C) 2013 Linus Torvalds
+ */
+
+#define pr_fmt(fmt) "reboot: " fmt
+
+#include <linux/ctype.h>
+#include <linux/export.h>
+#include <linux/kexec.h>
+#include <linux/kmod.h>
+#include <linux/kmsg_dump.h>
+#include <linux/reboot.h>
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/syscore_ops.h>
+#include <linux/uaccess.h>
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+struct pid *cad_pid;
+EXPORT_SYMBOL(cad_pid);
+
+#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#define DEFAULT_REBOOT_MODE = REBOOT_HARD
+#else
+#define DEFAULT_REBOOT_MODE
+#endif
+enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+
+/*
+ * This variable is used privately to keep track of whether or not
+ * reboot_type is still set to its default value (i.e., reboot= hasn't
+ * been set on the command line). This is needed so that we can
+ * suppress DMI scanning for reboot quirks. Without it, it's
+ * impossible to override a faulty reboot quirk without recompiling.
+ */
+int reboot_default = 1;
+int reboot_cpu;
+enum reboot_type reboot_type = BOOT_ACPI;
+int reboot_force;
+
+/*
+ * If set, this is used for preparing the system to power off.
+ */
+
+void (*pm_power_off_prepare)(void);
+
+/**
+ * emergency_restart - reboot the system
+ *
+ * Without shutting down any hardware or taking any locks
+ * reboot the system. This is called when we know we are in
+ * trouble so this is our best effort to reboot. This is
+ * safe to call in interrupt context.
+ */
+void emergency_restart(void)
+{
+ kmsg_dump(KMSG_DUMP_EMERG);
+ machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
+void kernel_restart_prepare(char *cmd)
+{
+ blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+ system_state = SYSTEM_RESTART;
+ usermodehelper_disable();
+ device_shutdown();
+}
+
+/**
+ * register_reboot_notifier - Register function to be called at reboot time
+ * @nb: Info about notifier function to be called
+ *
+ * Registers a function with the list of functions
+ * to be called at reboot time.
+ *
+ * Currently always returns zero, as blocking_notifier_chain_register()
+ * always returns zero.
+ */
+int register_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ * unregister_reboot_notifier - Unregister previously registered reboot notifier
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered reboot
+ * notifier function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+void migrate_to_reboot_cpu(void)
+{
+ /* The boot cpu is always logical cpu 0 */
+ int cpu = reboot_cpu;
+
+ cpu_hotplug_disable();
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_online(cpu))
+ cpu = cpumask_first(cpu_online_mask);
+
+ /* Prevent races with other tasks migrating this task */
+ current->flags |= PF_NO_SETAFFINITY;
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
+/**
+ * kernel_restart - reboot the system
+ * @cmd: pointer to buffer containing command to execute for restart
+ * or %NULL
+ *
+ * Shutdown everything and perform a clean reboot.
+ * This is not safe to call in interrupt context.
+ */
+void kernel_restart(char *cmd)
+{
+ kernel_restart_prepare(cmd);
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+ if (!cmd)
+ pr_emerg("Restarting system\n");
+ else
+ pr_emerg("Restarting system with command '%s'\n", cmd);
+ kmsg_dump(KMSG_DUMP_RESTART);
+ machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+static void kernel_shutdown_prepare(enum system_states state)
+{
+ blocking_notifier_call_chain(&reboot_notifier_list,
+ (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
+ system_state = state;
+ usermodehelper_disable();
+ device_shutdown();
+}
+/**
+ * kernel_halt - halt the system
+ *
+ * Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt(void)
+{
+ kernel_shutdown_prepare(SYSTEM_HALT);
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+ pr_emerg("System halted\n");
+ kmsg_dump(KMSG_DUMP_HALT);
+ machine_halt();
+}
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+/**
+ * kernel_power_off - power_off the system
+ *
+ * Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off(void)
+{
+ kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+ if (pm_power_off_prepare)
+ pm_power_off_prepare();
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+ pr_emerg("Power down\n");
+ kmsg_dump(KMSG_DUMP_POWEROFF);
+ machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
+
+static DEFINE_MUTEX(reboot_mutex);
+
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
+SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
+ void __user *, arg)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ char buffer[256];
+ int ret = 0;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
+ return -EPERM;
+
+ /* For safety, we require "magic" arguments. */
+ if (magic1 != LINUX_REBOOT_MAGIC1 ||
+ (magic2 != LINUX_REBOOT_MAGIC2 &&
+ magic2 != LINUX_REBOOT_MAGIC2A &&
+ magic2 != LINUX_REBOOT_MAGIC2B &&
+ magic2 != LINUX_REBOOT_MAGIC2C))
+ return -EINVAL;
+
+ /*
+ * If pid namespaces are enabled and the current task is in a child
+ * pid_namespace, the command is handled by reboot_pid_ns() which will
+ * call do_exit().
+ */
+ ret = reboot_pid_ns(pid_ns, cmd);
+ if (ret)
+ return ret;
+
+ /* Instead of trying to make the power_off code look like
+ * halt when pm_power_off is not set do it the easy way.
+ */
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ cmd = LINUX_REBOOT_CMD_HALT;
+
+ mutex_lock(&reboot_mutex);
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART:
+ kernel_restart(NULL);
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_ON:
+ C_A_D = 1;
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_OFF:
+ C_A_D = 0;
+ break;
+
+ case LINUX_REBOOT_CMD_HALT:
+ kernel_halt();
+ do_exit(0);
+ panic("cannot halt");
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ kernel_power_off();
+ do_exit(0);
+ break;
+
+ case LINUX_REBOOT_CMD_RESTART2:
+ ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
+ if (ret < 0) {
+ ret = -EFAULT;
+ break;
+ }
+ buffer[sizeof(buffer) - 1] = '\0';
+
+ kernel_restart(buffer);
+ break;
+
+#ifdef CONFIG_KEXEC
+ case LINUX_REBOOT_CMD_KEXEC:
+ ret = kernel_kexec();
+ break;
+#endif
+
+#ifdef CONFIG_HIBERNATION
+ case LINUX_REBOOT_CMD_SW_SUSPEND:
+ ret = hibernate();
+ break;
+#endif
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ mutex_unlock(&reboot_mutex);
+ return ret;
+}
+
+static void deferred_cad(struct work_struct *dummy)
+{
+ kernel_restart(NULL);
+}
+
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+ static DECLARE_WORK(cad_work, deferred_cad);
+
+ if (C_A_D)
+ schedule_work(&cad_work);
+ else
+ kill_cad_pid(SIGINT, 1);
+}
+
+char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+
+static int __orderly_poweroff(bool force)
+{
+ char **argv;
+ static char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL
+ };
+ int ret;
+
+ argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+ if (argv) {
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ argv_free(argv);
+ } else {
+ ret = -ENOMEM;
+ }
+
+ if (ret && force) {
+ pr_warn("Failed to start orderly shutdown: forcing the issue\n");
+ /*
+ * I guess this should try to kick off some daemon to sync and
+ * poweroff asap. Or not even bother syncing if we're doing an
+ * emergency shutdown?
+ */
+ emergency_sync();
+ kernel_power_off();
+ }
+
+ return ret;
+}
+
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+ __orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails, it will force an immediate shutdown.
+ */
+int orderly_poweroff(bool force)
+{
+ if (force) /* do not override the pending "true" */
+ poweroff_force = true;
+ schedule_work(&poweroff_work);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(orderly_poweroff);
+
+static int __init reboot_setup(char *str)
+{
+ for (;;) {
+ /*
+ * Having anything passed on the command line via
+ * reboot= will cause us to disable DMI checking
+ * below.
+ */
+ reboot_default = 0;
+
+ switch (*str) {
+ case 'w':
+ reboot_mode = REBOOT_WARM;
+ break;
+
+ case 'c':
+ reboot_mode = REBOOT_COLD;
+ break;
+
+ case 'h':
+ reboot_mode = REBOOT_HARD;
+ break;
+
+ case 's':
+ {
+ int rc;
+
+ if (isdigit(*(str+1))) {
+ rc = kstrtoint(str+1, 0, &reboot_cpu);
+ if (rc)
+ return rc;
+ } else if (str[1] == 'm' && str[2] == 'p' &&
+ isdigit(*(str+3))) {
+ rc = kstrtoint(str+3, 0, &reboot_cpu);
+ if (rc)
+ return rc;
+ } else
+ reboot_mode = REBOOT_SOFT;
+ break;
+ }
+ case 'g':
+ reboot_mode = REBOOT_GPIO;
+ break;
+
+ case 'b':
+ case 'a':
+ case 'k':
+ case 't':
+ case 'e':
+ case 'p':
+ reboot_type = *str;
+ break;
+
+ case 'f':
+ reboot_force = 1;
+ break;
+ }
+
+ str = strchr(str, ',');
+ if (str)
+ str++;
+ else
+ break;
+ }
+ return 1;
+}
+__setup("reboot=", reboot_setup);
diff --git a/kernel/relay.c b/kernel/relay.c
index ab56a1764d4..5a56d3c8dc0 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -227,14 +227,13 @@ static void relay_destroy_buf(struct rchan_buf *buf)
* relay_remove_buf - remove a channel buffer
* @kref: target kernel reference that contains the relay buffer
*
- * Removes the file from the fileystem, which also frees the
+ * Removes the file from the filesystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
* kref_put().
*/
static void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
- buf->chan->cb->remove_buf_file(buf->dentry);
relay_destroy_buf(buf);
}
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
del_timer_sync(&buf->timer);
+ buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,
*
* Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
*/
-static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
+static int relay_hotcpu_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
- chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+ chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
chan->parent = parent;
chan->private_data = private_data;
if (base_filename) {
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
static int subbuf_read_actor(size_t read_start,
struct rchan_buf *buf,
size_t avail,
- read_descriptor_t *desc,
- read_actor_t actor)
+ read_descriptor_t *desc)
{
void *from;
int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
typedef int (*subbuf_actor_t) (size_t read_start,
struct rchan_buf *buf,
size_t avail,
- read_descriptor_t *desc,
- read_actor_t actor);
+ read_descriptor_t *desc);
/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
subbuf_actor_t subbuf_actor,
- read_actor_t actor,
read_descriptor_t *desc)
{
struct rchan_buf *buf = filp->private_data;
@@ -1139,7 +1136,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_lock(&file_inode(filp)->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
break;
avail = min(desc->count, avail);
- ret = subbuf_actor(read_start, buf, avail, desc, actor);
+ ret = subbuf_actor(read_start, buf, avail, desc);
if (desc->error < 0)
break;
@@ -1159,7 +1156,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&file_inode(filp)->i_mutex);
return desc->written;
}
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
desc.count = count;
desc.arg.buf = buffer;
desc.error = 0;
- return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
- NULL, &desc);
+ return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
}
static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
@@ -1199,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
static const struct pipe_buf_operations relay_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = relay_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -1235,6 +1229,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
.flags = flags,
.ops = &relay_pipe_buf_ops,
@@ -1256,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
+ nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
@@ -1302,8 +1297,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
ret += padding;
out:
- splice_shrink_spd(pipe, &spd);
- return ret;
+ splice_shrink_spd(&spd);
+ return ret;
}
static ssize_t relay_file_splice_read(struct file *in,
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b..e791130f85a 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -17,102 +17,110 @@
void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
- counter->limit = RESOURCE_MAX;
- counter->soft_limit = RESOURCE_MAX;
+ counter->limit = RES_COUNTER_MAX;
+ counter->soft_limit = RES_COUNTER_MAX;
counter->parent = parent;
}
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
+static u64 res_counter_uncharge_locked(struct res_counter *counter,
+ unsigned long val)
{
+ if (WARN_ON(counter->usage < val))
+ val = counter->usage;
+
+ counter->usage -= val;
+ return counter->usage;
+}
+
+static int res_counter_charge_locked(struct res_counter *counter,
+ unsigned long val, bool force)
+{
+ int ret = 0;
+
if (counter->usage + val > counter->limit) {
counter->failcnt++;
- return -ENOMEM;
+ ret = -ENOMEM;
+ if (!force)
+ return ret;
}
counter->usage += val;
if (counter->usage > counter->max_usage)
counter->max_usage = counter->usage;
- return 0;
+ return ret;
}
-int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
+static int __res_counter_charge(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at, bool force)
{
- int ret;
+ int ret, r;
unsigned long flags;
struct res_counter *c, *u;
+ r = ret = 0;
*limit_fail_at = NULL;
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
- ret = res_counter_charge_locked(c, val);
+ r = res_counter_charge_locked(c, val, force);
spin_unlock(&c->lock);
- if (ret < 0) {
+ if (r < 0 && !ret) {
+ ret = r;
*limit_fail_at = c;
- goto undo;
+ if (!force)
+ break;
}
}
- ret = 0;
- goto done;
-undo:
- for (u = counter; u != c; u = u->parent) {
- spin_lock(&u->lock);
- res_counter_uncharge_locked(u, val);
- spin_unlock(&u->lock);
- }
-done:
- local_irq_restore(flags);
- return ret;
-}
-int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- int ret, r;
- unsigned long flags;
- struct res_counter *c;
-
- r = ret = 0;
- *limit_fail_at = NULL;
- local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
- spin_lock(&c->lock);
- r = res_counter_charge_locked(c, val);
- if (r)
- c->usage += val;
- spin_unlock(&c->lock);
- if (r < 0 && ret == 0) {
- *limit_fail_at = c;
- ret = r;
+ if (ret < 0 && !force) {
+ for (u = counter; u != c; u = u->parent) {
+ spin_lock(&u->lock);
+ res_counter_uncharge_locked(u, val);
+ spin_unlock(&u->lock);
}
}
local_irq_restore(flags);
return ret;
}
-void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
+
+int res_counter_charge(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at)
{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
+ return __res_counter_charge(counter, val, limit_fail_at, false);
+}
- counter->usage -= val;
+int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at)
+{
+ return __res_counter_charge(counter, val, limit_fail_at, true);
}
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+u64 res_counter_uncharge_until(struct res_counter *counter,
+ struct res_counter *top,
+ unsigned long val)
{
unsigned long flags;
struct res_counter *c;
+ u64 ret = 0;
local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
+ for (c = counter; c != top; c = c->parent) {
+ u64 r;
spin_lock(&c->lock);
- res_counter_uncharge_locked(c, val);
+ r = res_counter_uncharge_locked(c, val);
+ if (c == counter)
+ ret = r;
spin_unlock(&c->lock);
}
local_irq_restore(flags);
+ return ret;
}
+u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
+{
+ return res_counter_uncharge_until(counter, NULL, val);
+}
static inline unsigned long long *
res_counter_member(struct res_counter *counter, int member)
@@ -171,45 +179,33 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
#endif
int res_counter_memparse_write_strategy(const char *buf,
- unsigned long long *res)
+ unsigned long long *resp)
{
char *end;
+ unsigned long long res;
- /* return RESOURCE_MAX(unlimited) if "-1" is specified */
+ /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
if (*buf == '-') {
- *res = simple_strtoull(buf + 1, &end, 10);
- if (*res != 1 || *end != '\0')
+ int rc = kstrtoull(buf + 1, 10, &res);
+
+ if (rc)
+ return rc;
+ if (res != 1)
return -EINVAL;
- *res = RESOURCE_MAX;
+ *resp = RES_COUNTER_MAX;
return 0;
}
- *res = memparse(buf, &end);
+ res = memparse(buf, &end);
if (*end != '\0')
return -EINVAL;
- *res = PAGE_ALIGN(*res);
- return 0;
-}
+ if (PAGE_ALIGN(res) >= res)
+ res = PAGE_ALIGN(res);
+ else
+ res = RES_COUNTER_MAX;
-int res_counter_write(struct res_counter *counter, int member,
- const char *buf, write_strategy_fn write_strategy)
-{
- char *end;
- unsigned long flags;
- unsigned long long tmp, *val;
+ *resp = res;
- if (write_strategy) {
- if (write_strategy(buf, &tmp))
- return -EINVAL;
- } else {
- tmp = simple_strtoull(buf, &end, 10);
- if (*end != '\0')
- return -EINVAL;
- }
- spin_lock_irqsave(&counter->lock, flags);
- val = res_counter_member(counter, member);
- *val = tmp;
- spin_unlock_irqrestore(&counter->lock, flags);
return 0;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d..3c2237ac32d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
* Arbitrary resource management.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/ioport.h>
@@ -19,6 +21,7 @@
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/io.h>
@@ -48,6 +51,14 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
+/*
+ * For memory hotplug, there is no way to free resource entries allocated
+ * by boot mem after the system is up. So for reusing the resource entry
+ * we need to remember the resource.
+ */
+static struct resource *bootmem_resource_free;
+static DEFINE_SPINLOCK(bootmem_resource_lock);
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -149,6 +160,40 @@ __initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
+static void free_resource(struct resource *res)
+{
+ if (!res)
+ return;
+
+ if (!PageSlab(virt_to_head_page(res))) {
+ spin_lock(&bootmem_resource_lock);
+ res->sibling = bootmem_resource_free;
+ bootmem_resource_free = res;
+ spin_unlock(&bootmem_resource_lock);
+ } else {
+ kfree(res);
+ }
+}
+
+static struct resource *alloc_resource(gfp_t flags)
+{
+ struct resource *res = NULL;
+
+ spin_lock(&bootmem_resource_lock);
+ if (bootmem_resource_free) {
+ res = bootmem_resource_free;
+ bootmem_resource_free = res->sibling;
+ }
+ spin_unlock(&bootmem_resource_lock);
+
+ if (res)
+ memset(res, 0, sizeof(struct resource));
+ else
+ res = kzalloc(sizeof(struct resource), flags);
+
+ return res;
+}
+
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
@@ -364,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)
{
return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
}
+EXPORT_SYMBOL_GPL(page_is_ram);
void __weak arch_remove_reservations(struct resource *avail)
{
@@ -386,11 +432,6 @@ static void resource_clip(struct resource *res, resource_size_t min,
res->end = max;
}
-static bool resource_contains(struct resource *res1, struct resource *res2)
-{
- return res1->start <= res2->start && res1->end >= res2->end;
-}
-
/*
* Find empty slot in the resource tree with the given range and
* alignment constraints
@@ -403,7 +444,6 @@ static int __find_resource(struct resource *root, struct resource *old,
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
- tmp.flags = new->flags;
tmp.start = root->start;
/*
* Skip past an allocated resource that starts at 0, since the assignment
@@ -426,10 +466,11 @@ static int __find_resource(struct resource *root, struct resource *old,
arch_remove_reservations(&tmp);
/* Check for overflow after ALIGN() */
- avail = *new;
avail.start = ALIGN(tmp.start, constraint->align);
avail.end = tmp.end;
+ avail.flags = new->flags & ~IORESOURCE_UNSET;
if (avail.start >= tmp.start) {
+ alloc.flags = avail.flags;
alloc.start = constraint->alignf(constraint->alignf_data, &avail,
size, constraint->align);
alloc.end = alloc.start + size - 1;
@@ -470,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new,
* @newsize: new size of the resource descriptor
* @constraint: the size and alignment constraints to be met.
*/
-int reallocate_resource(struct resource *root, struct resource *old,
+static int reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint *constraint)
{
@@ -515,8 +556,8 @@ out:
* @root: root resource descriptor
* @new: resource descriptor desired by caller
* @size: requested resource region size
- * @min: minimum size to allocate
- * @max: maximum size to allocate
+ * @min: minimum boundary to allocate
+ * @max: maximum boundary to allocate
* @align: alignment requested, in bytes
* @alignf: alignment function, optional, called if not NULL
* @alignf_data: arbitrary data to pass to the @alignf function
@@ -704,32 +745,19 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
write_unlock(&resource_lock);
}
-/**
- * adjust_resource - modify a resource's start and size
- * @res: resource to modify
- * @start: new start value
- * @size: new size
- *
- * Given an existing resource, change its start and size to match the
- * arguments. Returns 0 on success, -EBUSY if it can't fit.
- * Existing children of the resource are assumed to be immutable.
- */
-int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
+static int __adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
{
struct resource *tmp, *parent = res->parent;
resource_size_t end = start + size - 1;
int result = -EBUSY;
- write_lock(&resource_lock);
+ if (!parent)
+ goto skip;
if ((start < parent->start) || (end > parent->end))
goto out;
- for (tmp = res->child; tmp; tmp = tmp->sibling) {
- if ((tmp->start < start) || (tmp->end > end))
- goto out;
- }
-
if (res->sibling && (res->sibling->start <= end))
goto out;
@@ -741,14 +769,40 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
goto out;
}
+skip:
+ for (tmp = res->child; tmp; tmp = tmp->sibling)
+ if ((tmp->start < start) || (tmp->end > end))
+ goto out;
+
res->start = start;
res->end = end;
result = 0;
out:
+ return result;
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
+{
+ int result;
+
+ write_lock(&resource_lock);
+ result = __adjust_resource(res, start, size);
write_unlock(&resource_lock);
return result;
}
+EXPORT_SYMBOL(adjust_resource);
static void __init __reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
@@ -756,7 +810,8 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
- struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *res = alloc_resource(GFP_ATOMIC);
+ struct resource *next_res = NULL;
if (!res)
return;
@@ -766,34 +821,76 @@ static void __init __reserve_region_with_split(struct resource *root,
res->end = end;
res->flags = IORESOURCE_BUSY;
- conflict = __request_resource(parent, res);
- if (!conflict)
- return;
+ while (1) {
- /* failed, split and try again */
- kfree(res);
+ conflict = __request_resource(parent, res);
+ if (!conflict) {
+ if (!next_res)
+ break;
+ res = next_res;
+ next_res = NULL;
+ continue;
+ }
- /* conflict covered whole area */
- if (conflict->start <= start && conflict->end >= end)
- return;
+ /* conflict covered whole area */
+ if (conflict->start <= res->start &&
+ conflict->end >= res->end) {
+ free_resource(res);
+ WARN_ON(next_res);
+ break;
+ }
+
+ /* failed, split and try again */
+ if (conflict->start > res->start) {
+ end = res->end;
+ res->end = conflict->start - 1;
+ if (conflict->end < end) {
+ next_res = alloc_resource(GFP_ATOMIC);
+ if (!next_res) {
+ free_resource(res);
+ break;
+ }
+ next_res->name = name;
+ next_res->start = conflict->end + 1;
+ next_res->end = end;
+ next_res->flags = IORESOURCE_BUSY;
+ }
+ } else {
+ res->start = conflict->end + 1;
+ }
+ }
- if (conflict->start > start)
- __reserve_region_with_split(root, start, conflict->start-1, name);
- if (conflict->end < end)
- __reserve_region_with_split(root, conflict->end+1, end, name);
}
void __init reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
{
+ int abort = 0;
+
write_lock(&resource_lock);
- __reserve_region_with_split(root, start, end, name);
+ if (root->start > start || root->end < end) {
+ pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
+ (unsigned long long)start, (unsigned long long)end,
+ root);
+ if (start > root->end || end < root->start)
+ abort = 1;
+ else {
+ if (end > root->end)
+ end = root->end;
+ if (start < root->start)
+ start = root->start;
+ pr_err("fixing request to [0x%llx-0x%llx]\n",
+ (unsigned long long)start,
+ (unsigned long long)end);
+ }
+ dump_stack();
+ }
+ if (!abort)
+ __reserve_region_with_split(root, start, end, name);
write_unlock(&resource_lock);
}
-EXPORT_SYMBOL(adjust_resource);
-
/**
* resource_alignment - calculate resource's alignment
* @res: resource pointer
@@ -840,7 +937,7 @@ struct resource * __request_region(struct resource *parent,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = alloc_resource(GFP_KERNEL);
if (!res)
return NULL;
@@ -848,8 +945,8 @@ struct resource * __request_region(struct resource *parent,
res->name = name;
res->start = start;
res->end = start + n - 1;
- res->flags = IORESOURCE_BUSY;
- res->flags |= flags;
+ res->flags = resource_type(parent);
+ res->flags |= IORESOURCE_BUSY | flags;
write_lock(&resource_lock);
@@ -874,7 +971,7 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- kfree(res);
+ free_resource(res);
res = NULL;
break;
}
@@ -908,7 +1005,7 @@ int __check_region(struct resource *parent, resource_size_t start,
return -EBUSY;
release_resource(res);
- kfree(res);
+ free_resource(res);
return 0;
}
EXPORT_SYMBOL(__check_region);
@@ -948,7 +1045,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
if (res->flags & IORESOURCE_MUXED)
wake_up(&muxed_resource_wait);
- kfree(res);
+ free_resource(res);
return;
}
p = &res->sibling;
@@ -962,6 +1059,109 @@ void __release_region(struct resource *parent, resource_size_t start,
}
EXPORT_SYMBOL(__release_region);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/**
+ * release_mem_region_adjustable - release a previously reserved memory region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @size: resource region size
+ *
+ * This interface is intended for memory hot-delete. The requested region
+ * is released from a currently busy memory resource. The requested region
+ * must either match exactly or fit into a single busy resource entry. In
+ * the latter case, the remaining resource is adjusted accordingly.
+ * Existing children of the busy memory resource must be immutable in the
+ * request.
+ *
+ * Note:
+ * - Additional release conditions, such as overlapping region, can be
+ * supported after they are confirmed as valid cases.
+ * - When a busy memory resource gets split into two entries, the code
+ * assumes that all children remain in the lower address entry for
+ * simplicity. Enhance this logic when necessary.
+ */
+int release_mem_region_adjustable(struct resource *parent,
+ resource_size_t start, resource_size_t size)
+{
+ struct resource **p;
+ struct resource *res;
+ struct resource *new_res;
+ resource_size_t end;
+ int ret = -EINVAL;
+
+ end = start + size - 1;
+ if ((start < parent->start) || (end > parent->end))
+ return ret;
+
+ /* The alloc_resource() result gets checked later */
+ new_res = alloc_resource(GFP_KERNEL);
+
+ p = &parent->child;
+ write_lock(&resource_lock);
+
+ while ((res = *p)) {
+ if (res->start >= end)
+ break;
+
+ /* look for the next resource if it does not fit into */
+ if (res->start > start || res->end < end) {
+ p = &res->sibling;
+ continue;
+ }
+
+ if (!(res->flags & IORESOURCE_MEM))
+ break;
+
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+
+ /* found the target resource; let's adjust accordingly */
+ if (res->start == start && res->end == end) {
+ /* free the whole entry */
+ *p = res->sibling;
+ free_resource(res);
+ ret = 0;
+ } else if (res->start == start && res->end != end) {
+ /* adjust the start */
+ ret = __adjust_resource(res, end + 1,
+ res->end - end);
+ } else if (res->start != start && res->end == end) {
+ /* adjust the end */
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ } else {
+ /* split into two entries */
+ if (!new_res) {
+ ret = -ENOMEM;
+ break;
+ }
+ new_res->name = res->name;
+ new_res->start = end + 1;
+ new_res->end = res->end;
+ new_res->flags = res->flags;
+ new_res->parent = res->parent;
+ new_res->sibling = res->sibling;
+ new_res->child = NULL;
+
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ if (ret)
+ break;
+ res->sibling = new_res;
+ new_res = NULL;
+ }
+
+ break;
+ }
+
+ write_unlock(&resource_lock);
+ free_resource(new_res);
+ return ret;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
/*
* Managed region resource
*/
@@ -1088,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource map sanity check conflict: "
- "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
+ printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
(unsigned long long)addr,
(unsigned long long)(addr + size - 1),
- (unsigned long long)p->start,
- (unsigned long long)p->end,
- p->name);
+ p->name, p);
err = -1;
break;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a..ab32b7b0db5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,10 +11,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
-obj-$(CONFIG_SMP) += cpupri.o
+obj-y += core.o proc.o clock.o cputime.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += wait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
-
-
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e..e73efba9830 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -95,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
#endif
tg->autogroup = ag;
+ sched_online_group(tg, &root_task_group);
return ag;
out_free:
@@ -195,20 +197,20 @@ __setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
-int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
int err;
- if (*nice < -20 || *nice > 19)
+ if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
- err = security_task_setnice(current, *nice);
+ err = security_task_setnice(current, nice);
if (err)
return err;
- if (*nice < 0 && !can_nice(current, *nice))
+ if (nice < 0 && !can_nice(current, nice))
return -EPERM;
/* this is a heavy operation taking global locks.. */
@@ -219,9 +221,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
ag = autogroup_task_get(p);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+ err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
if (!err)
- ag->nice = *nice;
+ ag->nice = nice;
up_write(&ag->lock);
autogroup_kref_put(ag);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492d..3ef6451e972 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
- * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
* local_clock() -- is cpu_clock() on the current cpu.
*
+ * sched_clock_cpu(i)
+ *
* How:
*
* The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
- *
- * Notes:
- *
- * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
- * like cpufreq interrupts that can change the base clock (TSC) multiplier
- * and cause funny jumps in time -- although the filtering provided by
- * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
- * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
- * sched_clock().
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
@@ -66,13 +58,16 @@
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
+#include <linux/static_key.h>
+#include <linux/workqueue.h>
+#include <linux/compiler.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -82,7 +77,52 @@ EXPORT_SYMBOL_GPL(sched_clock);
__read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__read_mostly int sched_clock_stable;
+static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
+static int __sched_clock_stable_early;
+
+int sched_clock_stable(void)
+{
+ return static_key_false(&__sched_clock_stable);
+}
+
+static void __set_sched_clock_stable(void)
+{
+ if (!sched_clock_stable())
+ static_key_slow_inc(&__sched_clock_stable);
+}
+
+void set_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 1;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ __set_sched_clock_stable();
+}
+
+static void __clear_sched_clock_stable(struct work_struct *work)
+{
+ /* XXX worry about clock continuity */
+ if (sched_clock_stable())
+ static_key_slow_dec(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+
+void clear_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 0;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ schedule_work(&sched_clock_work);
+}
struct sched_clock_data {
u64 tick_raw;
@@ -116,6 +156,20 @@ void sched_clock_init(void)
}
sched_clock_running = 1;
+
+ /*
+ * Ensure that it is impossible to not do a static_key update.
+ *
+ * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+ * and do the update, or we must see their __sched_clock_stable_early
+ * and do the update, or both.
+ */
+ smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+ if (__sched_clock_stable_early)
+ __set_sched_clock_stable();
+ else
+ __clear_sched_clock_stable(NULL);
}
/*
@@ -176,10 +230,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
@@ -216,20 +296,20 @@ u64 sched_clock_cpu(int cpu)
struct sched_clock_data *scd;
u64 clock;
- WARN_ON_ONCE(!irqs_disabled());
-
- if (sched_clock_stable)
+ if (sched_clock_stable())
return sched_clock();
if (unlikely(!sched_clock_running))
return 0ull;
+ preempt_disable_notrace();
scd = cpu_sdc(cpu);
if (cpu != smp_processor_id())
clock = sched_clock_remote(scd);
else
clock = sched_clock_local(scd);
+ preempt_enable_notrace();
return clock;
}
@@ -239,7 +319,7 @@ void sched_clock_tick(void)
struct sched_clock_data *scd;
u64 now, now_gtod;
- if (sched_clock_stable)
+ if (sched_clock_stable())
return;
if (unlikely(!sched_clock_running))
@@ -290,14 +370,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
*/
u64 cpu_clock(int cpu)
{
- u64 clock;
- unsigned long flags;
+ if (!sched_clock_stable())
+ return sched_clock_cpu(cpu);
- local_irq_save(flags);
- clock = sched_clock_cpu(cpu);
- local_irq_restore(flags);
-
- return clock;
+ return sched_clock();
}
/*
@@ -309,14 +385,10 @@ u64 cpu_clock(int cpu)
*/
u64 local_clock(void)
{
- u64 clock;
- unsigned long flags;
-
- local_irq_save(flags);
- clock = sched_clock_cpu(smp_processor_id());
- local_irq_restore(flags);
+ if (!sched_clock_stable())
+ return sched_clock_cpu(raw_smp_processor_id());
- return clock;
+ return sched_clock();
}
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -336,12 +408,12 @@ u64 sched_clock_cpu(int cpu)
u64 cpu_clock(int cpu)
{
- return sched_clock_cpu(cpu);
+ return sched_clock();
}
u64 local_clock(void)
{
- return sched_clock_cpu(0);
+ return sched_clock();
}
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 00000000000..a63f4dc2790
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
+/*
+ * Generic wait-for-completion handler;
+ *
+ * It differs from semaphores in that their default case is the opposite,
+ * wait_for_completion default blocks whereas semaphore default non-block. The
+ * interface also makes it easy to 'complete' multiple waiting threads,
+ * something which isn't entirely natural for semaphores.
+ *
+ * But more importantly, the primitive documents the usage. Semaphores would
+ * typically be used for exclusion which gives rise to priority inversion.
+ * Waiting for completion is a typically sync point, but not an exclusion point.
+ */
+
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 0);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d2e05..bc1638b3344 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,11 @@
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/compiler.h>
+#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
@@ -80,11 +84,28 @@
#endif
#include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+ smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -139,9 +160,8 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
#include "features.h"
- NULL
};
#undef SCHED_FEAT
@@ -162,13 +182,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
#ifdef HAVE_JUMP_LABEL
-#define jump_label_key__true jump_label_key_enabled
-#define jump_label_key__false jump_label_key_disabled
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
#define SCHED_FEAT(name, enabled) \
jump_label_key__##enabled ,
-struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};
@@ -176,37 +196,24 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- if (jump_label_enabled(&sched_feat_keys[i]))
- jump_label_dec(&sched_feat_keys[i]);
+ if (static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_dec(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- if (!jump_label_enabled(&sched_feat_keys[i]))
- jump_label_inc(&sched_feat_keys[i]);
+ if (!static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_set(char *cmp)
{
- char buf[64];
- char *cmp;
- int neg = 0;
int i;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
@@ -226,6 +233,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
}
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ i = sched_feat_set(cmp);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -285,8 +313,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -359,13 +385,6 @@ static struct rq *this_rq_lock(void)
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
- *
- * Its all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
*/
static void hrtick_clear(struct rq *rq)
@@ -393,6 +412,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
}
#ifdef CONFIG_SMP
+
+static int __hrtick_restart(struct rq *rq)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = hrtimer_get_softexpires(timer);
+
+ return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+}
+
/*
* called from hardirq (IPI) context
*/
@@ -401,7 +429,7 @@ static void __hrtick_start(void *arg)
struct rq *rq = arg;
raw_spin_lock(&rq->lock);
- hrtimer_restart(&rq->hrtick_timer);
+ __hrtick_restart(rq);
rq->hrtick_csd_pending = 0;
raw_spin_unlock(&rq->lock);
}
@@ -419,9 +447,9 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
- hrtimer_restart(timer);
+ __hrtick_restart(rq);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
rq->hrtick_csd_pending = 1;
}
}
@@ -494,37 +522,98 @@ static inline void init_hrtick(void)
#endif /* CONFIG_SCHED_HRTICK */
/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
void resched_task(struct task_struct *p)
{
int cpu;
- assert_raw_spin_locked(&task_rq(p)->lock);
+ lockdep_assert_held(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
- set_tsk_need_resched(p);
-
cpu = task_cpu(p);
- if (cpu == smp_processor_id())
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
return;
+ }
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -538,7 +627,8 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
@@ -547,12 +637,15 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
{
int cpu = smp_processor_id();
int i;
struct sched_domain *sd;
+ if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ return cpu;
+
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
@@ -576,56 +669,87 @@ unlock:
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (cpu == smp_processor_id())
return;
- /*
- * This is safe, as this function is called with the timer
- * wheel base lock of (cpu) held. When the CPU is on the way
- * to idle and has not yet set rq->curr to idle then it will
- * be serialized on the timer wheel base lock and take the new
- * timer into account automatically.
- */
- if (rq->curr != rq->idle)
- return;
+ if (set_nr_and_not_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ smp_send_reschedule(cpu);
+ return true;
+ }
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
- smp_send_reschedule(cpu);
+ return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
}
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
- return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+ if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ return false;
+
+ if (idle_cpu(cpu) && !need_resched())
+ return true;
+
+ /*
+ * We can't run Idle Load Balance on this CPU for this time so we
+ * cancel it and clear NOHZ_BALANCE_KICK
+ */
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ return false;
}
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
static inline bool got_nohz_idle_kick(void)
{
return false;
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+ struct rq *rq;
+
+ rq = this_rq();
+
+ /* Make sure rq->nr_running update is visible after the IPI */
+ smp_rmb();
+
+ /* More than one running task need preemption */
+ if (rq->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
- while ((s64)(rq->clock - rq->age_stamp) > period) {
+ while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
@@ -637,12 +761,6 @@ void sched_avg_update(struct rq *rq)
}
}
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
- assert_raw_spin_locked(&task_rq(p)->lock);
- set_tsk_need_resched(p);
-}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -690,8 +808,6 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-void update_cpu_load(struct rq *this_rq);
-
static void set_load_weight(struct task_struct *p)
{
int prio = p->static_prio - MAX_RT_PRIO;
@@ -713,14 +829,14 @@ static void set_load_weight(struct task_struct *p)
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(p);
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(p);
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -740,126 +856,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- u64 irq_time;
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void account_system_vtime(struct task_struct *curr)
-{
- unsigned long flags;
- s64 delta;
- int cpu;
-
- if (!sched_clock_irqtime)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
-
- irq_time_write_begin();
- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(account_system_vtime);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
- if (unlikely(steal > NSEC_PER_SEC))
- return div_u64(steal, TICK_NSEC);
-
- return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
@@ -894,20 +890,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_branch((&paravirt_steal_rq_enabled))) {
- u64 st;
-
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- st = steal_ticks(steal);
- steal = st * TICK_NSEC;
-
rq->prev_steal_time_rq += steal;
-
delta -= steal;
}
#endif
@@ -915,48 +905,11 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static int irqtime_account_hi_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
-
-static int irqtime_account_si_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
-
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-#endif
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1006,7 +959,9 @@ static inline int normal_prio(struct task_struct *p)
{
int prio;
- if (task_has_rt_policy(p))
+ if (task_has_dl_policy(p))
+ prio = MAX_DL_PRIO-1;
+ else if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
@@ -1036,6 +991,8 @@ static int effective_prio(struct task_struct *p)
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
@@ -1050,7 +1007,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio)
+ } else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -1088,7 +1045,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+ !(task_preempt_count(p) & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
/*
@@ -1096,7 +1053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1109,6 +1066,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1116,6 +1075,108 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+ if (p->on_rq) {
+ struct rq *src_rq, *dst_rq;
+
+ src_rq = task_rq(p);
+ dst_rq = cpu_rq(cpu);
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu);
+ activate_task(dst_rq, p, 0);
+ check_preempt_curr(dst_rq, p, 0);
+ } else {
+ /*
+ * Task isn't running anymore; make it appear like we migrated
+ * it before it went to sleep. This means on wakeup we make the
+ * previous cpu our targer instead of where it really is.
+ */
+ p->wake_cpu = cpu;
+ }
+}
+
+struct migration_swap_arg {
+ struct task_struct *src_task, *dst_task;
+ int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+ struct migration_swap_arg *arg = data;
+ struct rq *src_rq, *dst_rq;
+ int ret = -EAGAIN;
+
+ src_rq = cpu_rq(arg->src_cpu);
+ dst_rq = cpu_rq(arg->dst_cpu);
+
+ double_raw_lock(&arg->src_task->pi_lock,
+ &arg->dst_task->pi_lock);
+ double_rq_lock(src_rq, dst_rq);
+ if (task_cpu(arg->dst_task) != arg->dst_cpu)
+ goto unlock;
+
+ if (task_cpu(arg->src_task) != arg->src_cpu)
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ goto unlock;
+
+ __migrate_swap_task(arg->src_task, arg->dst_cpu);
+ __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+ ret = 0;
+
+unlock:
+ double_rq_unlock(src_rq, dst_rq);
+ raw_spin_unlock(&arg->dst_task->pi_lock);
+ raw_spin_unlock(&arg->src_task->pi_lock);
+
+ return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+ struct migration_swap_arg arg;
+ int ret = -EINVAL;
+
+ arg = (struct migration_swap_arg){
+ .src_task = cur,
+ .src_cpu = task_cpu(cur),
+ .dst_task = p,
+ .dst_cpu = task_cpu(p),
+ };
+
+ if (arg.src_cpu == arg.dst_cpu)
+ goto out;
+
+ /*
+ * These three tests are all lockless; this is OK since all of them
+ * will be re-checked with proper locks held further down the line.
+ */
+ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ goto out;
+
+ trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
+ ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+ return ret;
+}
+
struct migration_arg {
struct task_struct *task;
int dest_cpu;
@@ -1263,29 +1324,69 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
+ enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
-
- /* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return dest_cpu;
- /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
- if (dest_cpu < nr_cpu_ids)
- return dest_cpu;
-
- /* No more Mr. Nice Guy. */
- dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select the cpu on the other node.
*/
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
+ }
+
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ goto out;
+ }
+
+ switch (state) {
+ case cpuset:
+ /* No more Mr. Nice Guy. */
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
}
return dest_cpu;
@@ -1295,9 +1396,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1379,8 +1480,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
+ trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -1388,13 +1489,14 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
p->sched_class->task_woken(rq, p);
if (rq->idle_stamp) {
- u64 delta = rq->clock - rq->idle_stamp;
- u64 max = 2*sysctl_sched_migration_cost;
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
- if (delta > max)
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
rq->avg_idle = max;
- else
- update_avg(&rq->avg_idle, delta);
+
rq->idle_stamp = 0;
}
#endif
@@ -1425,6 +1527,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p);
if (p->on_rq) {
+ /* check_preempt_curr() may use rq clock */
+ update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
ret = 1;
}
@@ -1434,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ unsigned long flags;
- raw_spin_lock(&rq->lock);
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1448,12 +1556,21 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+
+ if (llist_empty(&this_rq()->wake_list)
+ && !tick_nohz_full_cpu(smp_processor_id())
+ && !got_nohz_idle_kick())
return;
/*
@@ -1470,12 +1587,13 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
+ tick_nohz_full_check();
sched_ttwu_pending();
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+ if (unlikely(got_nohz_idle_kick())) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
@@ -1484,30 +1602,17 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
-}
-
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
-{
- struct rq *rq;
- int ret = 0;
+ struct rq *rq = cpu_rq(cpu);
- rq = __task_rq_lock(p);
- if (p->on_cpu) {
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- ttwu_do_wakeup(rq, p, wake_flags);
- ret = 1;
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
- __task_rq_unlock(rq);
-
- return ret;
-
}
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
@@ -1518,7 +1623,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+ if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
@@ -1542,7 +1647,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
@@ -1551,7 +1656,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
@@ -1567,21 +1678,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*/
- while (p->on_cpu) {
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- /*
- * In case the architecture enables interrupts in
- * context_switch(), we cannot busy wait, since that
- * would lead to deadlocks when an interrupt hits and
- * tries to wake up @prev. So bail and do a complete
- * remote wakeup.
- */
- if (ttwu_activate_remote(p, wake_flags))
- goto stat;
-#else
+ while (p->on_cpu)
cpu_relax();
-#endif
- }
/*
* Pairs with the smp_wmb() in finish_lock_switch().
*/
@@ -1593,7 +1691,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1621,8 +1719,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1648,15 +1748,17 @@ out:
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
- * processes. Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -1671,7 +1773,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*
* __sched_fork() is basic setup used by init_idle() too:
*/
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -1687,22 +1789,94 @@ static void __sched_fork(struct task_struct *p)
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
+ RB_CLEAR_NODE(&p->dl.rb_node);
+ hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ p->dl.dl_runtime = p->dl.runtime = 0;
+ p->dl.dl_deadline = p->dl.deadline = 0;
+ p->dl.dl_period = 0;
+ p->dl.flags = 0;
+
INIT_LIST_HEAD(&p->rt.run_list);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ p->mm->numa_scan_seq = 0;
+ }
+
+ if (clone_flags & CLONE_VM)
+ p->numa_preferred_nid = current->numa_preferred_nid;
+ else
+ p->numa_preferred_nid = -1;
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ INIT_LIST_HEAD(&p->numa_entry);
+ p->numa_group = NULL;
+#endif /* CONFIG_NUMA_BALANCING */
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = numabalancing_enabled;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_numabalancing_state(state);
+ return err;
+}
+#endif
+#endif
/*
* fork()/clone()-time setup:
*/
-void sched_fork(struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
- __sched_fork(p);
+ __sched_fork(clone_flags, p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@@ -1719,7 +1893,7 @@ void sched_fork(struct task_struct *p)
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
- if (task_has_rt_policy(p)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
@@ -1736,8 +1910,14 @@ void sched_fork(struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- if (!rt_prio(p->prio))
+ if (dl_prio(p->prio)) {
+ put_cpu();
+ return -EAGAIN;
+ } else if (rt_prio(p->prio)) {
+ p->sched_class = &rt_sched_class;
+ } else {
p->sched_class = &fair_sched_class;
+ }
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
@@ -1760,18 +1940,128 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT_COUNT
- /* Want to start with kernel preemption disabled. */
- task_thread_info(p)->preempt_count = 1;
-#endif
+ init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
put_cpu();
+ return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 20;
+
+ /*
+ * Doing this here saves a lot of checks in all
+ * the calling paths, and returning zero seems
+ * safe for them anyway.
+ */
+ if (period == 0)
+ return 0;
+
+ return div64_u64(runtime << 20, period);
+}
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ if (new_bw == p->dl.dl_bw)
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
+/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
@@ -1790,9 +2080,11 @@ void wake_up_new_task(struct task_struct *p)
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
- set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
+ /* Initialize new task's runnable average */
+ init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = 1;
@@ -1832,9 +2124,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
@@ -1843,9 +2134,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
@@ -1880,12 +2170,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- sched_info_switch(prev, next);
+ trace_sched_switch(prev, next);
+ sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
- trace_sched_switch(prev, next);
}
/**
@@ -1923,21 +2213,19 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_state = prev->state;
+ vtime_task_switch(prev);
finish_arch_switch(prev);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_disable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
perf_event_task_sched_in(prev, current);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_enable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
- trace_sched_stat_sleeptime(current, rq->clock);
+ finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -1945,17 +2233,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+ tick_nohz_task_switch(current);
}
#ifdef CONFIG_SMP
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-}
-
/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
@@ -1973,10 +2256,6 @@ static inline void post_schedule(struct rq *rq)
#else
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
static inline void post_schedule(struct rq *rq)
{
}
@@ -1987,7 +2266,7 @@ static inline void post_schedule(struct rq *rq)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
@@ -2050,6 +2329,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+ context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -2063,11 +2343,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -2079,23 +2358,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
@@ -2123,385 +2385,6 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
-unsigned long this_cpu_load(void)
-{
- struct rq *this = this_rq();
- return this->cpu_load[0];
-}
-
-
-/* Variables and functions for calc_load */
-static atomic_long_t calc_load_tasks;
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-
-static long calc_load_fold_active(struct rq *this_rq)
-{
- long nr_active, delta = 0;
-
- nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
-
- if (nr_active != this_rq->calc_load_active) {
- delta = nr_active - this_rq->calc_load_active;
- this_rq->calc_load_active = nr_active;
- }
-
- return delta;
-}
-
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ
-/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_tasks_idle;
-
-void calc_load_account_idle(struct rq *this_rq)
-{
- long delta;
-
- delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
-}
-
-static long calc_load_fold_idle(void)
-{
- long delta = 0;
-
- /*
- * Its got a race, we don't care...
- */
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
-
- return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
- unsigned long result = 1UL << frac_bits;
-
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
-
- return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- * = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- * ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- * = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- * n 1 - x^(n+1)
- * S_n := \Sum x^i = -------------
- * i=0 1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
- unsigned long active, unsigned int n)
-{
-
- return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(unsigned long ticks)
-{
- long delta, active, n;
-
- if (time_before(jiffies, calc_load_update))
- return;
-
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * If we were idle for multiple load cycles, apply them.
- */
- if (ticks >= LOAD_FREQ) {
- n = ticks / LOAD_FREQ;
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
- calc_load_update += n * LOAD_FREQ;
- }
-
- /*
- * Its possible the remainder of the above division also crosses
- * a LOAD_FREQ period, the regular check in calc_global_load()
- * which comes after this will take care of that.
- *
- * Consider us being 11 ticks before a cycle completion, and us
- * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
- * age us 4 cycles, and the test in calc_global_load() will
- * pick up the final one.
- */
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
-
-static inline long calc_load_fold_idle(void)
-{
- return 0;
-}
-
-static void calc_global_nohz(unsigned long ticks)
-{
-}
-#endif
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
- long active;
-
- calc_global_nohz(ticks);
-
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load(avenrun[0], EXP_1, active);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
- calc_load_update += LOAD_FREQ;
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
- long delta;
-
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-void update_cpu_load(struct rq *this_rq)
-{
- unsigned long this_load = this_rq->load.weight;
- unsigned long curr_jiffies = jiffies;
- unsigned long pending_updates;
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Avoid repeated calls on same jiffy, when moving in and out of idle */
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-static void update_cpu_load_active(struct rq *this_rq)
-{
- update_cpu_load(this_rq);
-
- calc_load_account_active(this_rq);
-}
-
#ifdef CONFIG_SMP
/*
@@ -2515,7 +2398,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2550,7 +2433,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock_task - p->se.exec_start;
+ ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
@@ -2582,404 +2465,26 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns = 0;
- rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-#ifdef CONFIG_CGROUP_CPUACCT
-struct cgroup_subsys cpuacct_subsys;
-struct cpuacct root_cpuacct;
-#endif
-
-static inline void task_group_account_field(struct task_struct *p, int index,
- u64 tmp)
-{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
- * Since all updates are sure to touch the root cgroup, we
- * get ourselves ahead and touch it first. If the root cgroup
- * is the only cgroup, then nothing else should be necessary.
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
*
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
*/
- __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
+ if (!p->on_cpu)
+ return p->se.sum_exec_runtime;
#endif
-}
-
-
-/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- int index;
-
- /* Add user time to process. */
- p->utime += cputime;
- p->utimescaled += cputime_scaled;
- account_group_user_time(p, cputime);
-
- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
-
- /* Add user time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
-
- /* Account for user time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- /* Add guest time to process. */
- p->utime += cputime;
- p->utimescaled += cputime_scaled;
- account_group_user_time(p, cputime);
- p->gtime += cputime;
-
- /* Add guest time to cpustat. */
- if (TASK_NICE(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
- } else {
- cpustat[CPUTIME_USER] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST] += (__force u64) cputime;
- }
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, int index)
-{
- /* Add system time to process. */
- p->stime += cputime;
- p->stimescaled += cputime_scaled;
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
-
- /* Account for system time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
-{
- int index;
-
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
- account_guest_time(p, cputime, cputime_scaled);
- return;
- }
-
- if (hardirq_count() - hardirq_offset)
- index = CPUTIME_IRQ;
- else if (in_serving_softirq())
- index = CPUTIME_SOFTIRQ;
- else
- index = CPUTIME_SYSTEM;
-
- __account_system_time(p, cputime, cputime_scaled, index);
-}
-
-/*
- * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- cpustat[CPUTIME_STEAL] += (__force u64) cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
-
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
- else
- cpustat[CPUTIME_IDLE] += (__force u64) cputime;
-}
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
- if (static_branch(&paravirt_steal_enabled)) {
- u64 steal, st = 0;
-
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
-
- st = steal_ticks(steal);
- this_rq()->prev_steal_time += st * TICK_NSEC;
-
- account_steal_time(st);
- return st;
- }
-#endif
- return false;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
- * @user_tick: is the tick from userspace
- * @rq: the pointer to rq
- *
- * Tick demultiplexing follows the order
- * - pending hardirq update
- * - pending softirq update
- * - user_time
- * - idle_time
- * - system time
- * - check for guest_time
- * - else account as system_time
- *
- * Check for hardirq is done both for system and user time as there is
- * no timer going off while we are on hardirq and hence we may never get an
- * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
- * softirq as those do not count in task exec_runtime any more.
- */
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- if (steal_account_process_tick())
- return;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
- } else if (this_cpu_ksoftirqd() == p) {
- /*
- * ksoftirqd time do not get accounted in cpu_softirq_time.
- * So, we have to handle it separately here.
- * Also, p->stime needs to be updated for ksoftirqd.
- */
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
- } else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- } else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
- } else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
- } else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
- }
-}
-
-static void irqtime_account_idle_ticks(int ticks)
-{
- int i;
- struct rq *rq = this_rq();
-
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- struct rq *rq = this_rq();
-
- if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
- return;
- }
-
- if (steal_account_process_tick())
- return;
-
- if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
- else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
- if (sched_clock_irqtime) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- account_idle_time(jiffies_to_cputime(ticks));
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-#else
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
-
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, utime = p->utime, total = utime + p->stime;
-
- /*
- * Use CFS's precise accounting:
- */
- rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
-
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
- utime = rtime;
-
- /*
- * Compare with previous values, to keep monotonicity:
- */
- p->prev_utime = max(p->prev_utime, utime);
- p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
-
- *ut = p->prev_utime;
- *st = p->prev_stime;
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct signal_struct *sig = p->signal;
- struct task_cputime cputime;
- cputime_t rtime, utime, total;
-
- thread_group_cputime(p, &cputime);
-
- total = cputime.utime + cputime.stime;
- rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
- utime = rtime;
-
- sig->prev_utime = max(sig->prev_utime, utime);
- sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, p, &flags);
- *ut = sig->prev_utime;
- *st = sig->prev_stime;
+ return ns;
}
-#endif
/*
* This function gets called by the timer code, with HZ frequency.
@@ -2995,18 +2500,47 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
+ update_cpu_load_active(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq, cpu);
+ trigger_load_balance(rq);
#endif
+ rq_last_tick_reset(rq);
}
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = ACCESS_ONCE(jiffies);
+
+ next = rq->last_sched_tick + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_nsecs(next - now);
+}
+#endif
+
notrace unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
@@ -3020,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes add_preempt_count(int val)
+void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -3029,7 +2563,7 @@ void __kprobes add_preempt_count(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
- preempt_count() += val;
+ __preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -3037,12 +2571,18 @@ void __kprobes add_preempt_count(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val)
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
}
-EXPORT_SYMBOL(add_preempt_count);
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
-void __kprobes sub_preempt_count(int val)
+void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -3060,9 +2600,10 @@ void __kprobes sub_preempt_count(int val)
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- preempt_count() -= val;
+ __preempt_count_sub(val);
}
-EXPORT_SYMBOL(sub_preempt_count);
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
#endif
@@ -3071,8 +2612,6 @@ EXPORT_SYMBOL(sub_preempt_count);
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
- struct pt_regs *regs = get_irq_regs();
-
if (oops_in_progress)
return;
@@ -3083,11 +2622,15 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
@@ -3097,10 +2640,10 @@ static inline void schedule_debug(struct task_struct *prev)
{
/*
* Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path for now.
- * Otherwise, whine if we are scheduling when we should not be.
+ * schedule() atomically, we ignore that path. Otherwise whine
+ * if we are scheduling when we should not.
*/
- if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+ if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
__schedule_bug(prev);
rcu_sleep_check();
@@ -3109,36 +2652,40 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
- if (prev->on_rq || rq->skip_clock_update < 0)
- update_rq_clock(rq);
- prev->sched_class->put_prev_task(rq, prev);
-}
-
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
- const struct sched_class *class;
+ const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
- if (likely(p))
- return p;
+ if (likely(prev->sched_class == class &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev);
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
+again:
for_each_class(class) {
- p = class->pick_next_task(rq);
- if (p)
+ p = class->pick_next_task(rq, prev);
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
return p;
+ }
}
BUG(); /* the idle class will always have a runnable task */
@@ -3146,6 +2693,40 @@ pick_next_task(struct rq *rq)
/*
* __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
*/
static void __sched __schedule(void)
{
@@ -3166,6 +2747,12 @@ need_resched:
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
@@ -3192,14 +2779,12 @@ need_resched:
switch_count = &prev->nvcsw;
}
- pre_schedule(rq, prev);
-
- if (unlikely(!rq->nr_running))
- idle_balance(cpu, rq);
+ if (prev->on_rq || rq->skip_clock_update < 0)
+ update_rq_clock(rq);
- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -3221,14 +2806,14 @@ need_resched:
post_schedule(rq);
- preempt_enable_no_resched();
+ sched_preempt_enable_no_resched();
if (need_resched())
goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state)
+ if (!tsk->state || tsk_is_pi_blocked(tsk))
return;
/*
* If we are going to sleep and we have plugged IO queued,
@@ -3238,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -3247,50 +2832,32 @@ asmlinkage void __sched schedule(void)
}
EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
{
- if (lock->owner != owner)
- return false;
-
/*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
*/
- barrier();
-
- return owner->on_cpu;
+ user_exit();
+ schedule();
+ user_enter();
}
+#endif
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
*/
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+void __sched schedule_preempt_disabled(void)
{
- if (!sched_feat(OWNER_SPIN))
- return 0;
-
- rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
- break;
-
- arch_mutex_cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
}
-#endif
#ifdef CONFIG_PREEMPT
/*
@@ -3298,21 +2865,19 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
{
- struct thread_info *ti = current_thread_info();
-
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (likely(ti->preempt_count || irqs_disabled()))
+ if (likely(!preemptible()))
return;
do {
- add_preempt_count_notrace(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
__schedule();
- sub_preempt_count_notrace(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -3321,7 +2886,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
barrier();
} while (need_resched());
}
+NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -3329,19 +2896,21 @@ EXPORT_SYMBOL(preempt_schedule);
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
*/
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
{
- struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_state;
/* Catch callers which need to be fixed */
- BUG_ON(ti->preempt_count || !irqs_disabled());
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
do {
- add_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -3349,9 +2918,9 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
barrier();
} while (need_resched());
-}
-#endif /* CONFIG_PREEMPT */
+ exception_exit(prev_state);
+}
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
@@ -3360,392 +2929,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
- *
- * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
- * zero in this (rare) case, and we handle it by continuing to scan the queue.
- */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
-{
- wait_queue_t *curr, *next;
-
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
- break;
- }
-}
-
-/**
- * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: is directly passed to the wakeup function
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(__wake_up);
-
-/*
- * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
- */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
-{
- __wake_up_common(q, mode, 1, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked);
-
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
-{
- __wake_up_common(q, mode, 1, 0, key);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-
-/**
- * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: opaque value to be passed to wakeup targets
- *
- * The sync wakeup differs that the waker knows that it will schedule
- * away soon, so while the target thread will be woken up, it will not
- * be migrated to another CPU - ie. the two threads are 'synchronized'
- * with each other. This can prevent needless bouncing between CPUs.
- *
- * On UP it can prevent extra preemption.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
- int wake_flags = WF_SYNC;
-
- if (unlikely(!q))
- return;
-
- if (unlikely(!nr_exclusive))
- wake_flags = 0;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync_key);
-
-/*
- * __wake_up_sync - see __wake_up_sync_key()
- */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
-{
- __wake_up_sync_key(q, mode, nr_exclusive, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
-
-/**
- * complete: - signals a single thread waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up a single thread waiting on this completion. Threads will be
- * awakened in the same order in which they were queued.
- *
- * See also complete_all(), wait_for_completion() and related routines.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete);
-
-/**
- * complete_all: - signals all threads waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up all threads waiting on this particular completion event.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete_all(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete_all);
-
-static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
-{
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
-
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
- do {
- if (signal_pending_state(state, current)) {
- timeout = -ERESTARTSYS;
- break;
- }
- __set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
- if (!x->done)
- return timeout;
- }
- x->done--;
- return timeout ?: 1;
-}
-
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
-{
- might_sleep();
-
- spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, timeout, state);
- spin_unlock_irq(&x->wait.lock);
- return timeout;
-}
-
-/**
- * wait_for_completion: - waits for completion of a task
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout.
- *
- * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
- * and interrupt capability. Also see complete().
- */
-void __sched wait_for_completion(struct completion *x)
-{
- wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion);
-
-/**
- * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_timeout);
-
-/**
- * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
- * @x: holds the state of this particular completion
- *
- * This waits for completion of a specific task to be signaled. It is
- * interruptible.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_interruptible(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-
-/**
- * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. It is interruptible. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-/**
- * wait_for_completion_killable: - waits for completion of a task (killable)
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It can be
- * interrupted by a kill signal.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_killable(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_killable);
-
-/**
- * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be
- * signaled or for a specified timeout to expire. It can be
- * interrupted by a kill signal. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_killable_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(wait_for_completion_killable_timeout);
-
-/**
- * try_wait_for_completion - try to decrement a completion without blocking
- * @x: completion structure
- *
- * Returns: 0 if a decrement cannot be done without blocking
- * 1 if a decrement succeeded.
- *
- * If a completion is being used as a counting completion,
- * attempt to decrement the counter without blocking. This
- * enables us to avoid waiting if the resource the completion
- * is protecting is not available.
- */
-bool try_wait_for_completion(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- else
- x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(try_wait_for_completion);
-
-/**
- * completion_done - Test to see if a completion has any waiters
- * @x: completion structure
- *
- * Returns: 0 if there are waiters (wait_for_completion() in progress)
- * 1 if there are no waiters.
- *
- */
-bool completion_done(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(completion_done);
-
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
- unsigned long flags;
- wait_queue_t wait;
-
- init_waitqueue_entry(&wait, current);
-
- __set_current_state(state);
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, &wait);
- spin_unlock(&q->lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&q->lock);
- __remove_wait_queue(q, &wait);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
#ifdef CONFIG_RT_MUTEXES
/*
@@ -3756,19 +2939,39 @@ EXPORT_SYMBOL(sleep_on_timeout);
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
*
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running;
+ int oldprio, on_rq, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
- BUG_ON(prio < 0 || prio > MAX_PRIO);
+ BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p);
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
trace_sched_pi_setprio(p, prio);
+ p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->on_rq;
@@ -3778,22 +2981,47 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->put_prev_task(rq, p);
- if (rt_prio(prio))
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+ dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+ p->dl.dl_boosted = 1;
+ p->dl.dl_throttled = 0;
+ enqueue_flag = ENQUEUE_REPLENISH;
+ } else
+ p->dl.dl_boosted = 0;
+ p->sched_class = &dl_sched_class;
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (oldprio < prio)
+ enqueue_flag = ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
- else
+ } else {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
p->sched_class = &fair_sched_class;
+ }
p->prio = prio;
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+ enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
__task_rq_unlock(rq);
}
-
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -3802,7 +3030,7 @@ void set_user_nice(struct task_struct *p, long nice)
unsigned long flags;
struct rq *rq;
- if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
/*
* We have to be careful, if called from sys_setpriority(),
@@ -3813,9 +3041,9 @@ void set_user_nice(struct task_struct *p, long nice)
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
- * SCHED_FIFO/SCHED_RR:
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
*/
- if (task_has_rt_policy(p)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
@@ -3851,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = 20 - nice;
+ int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@@ -3875,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < -40)
- increment = -40;
- if (increment > 40)
- increment = 40;
-
- nice = TASK_NICE(current) + increment;
- if (nice < -20)
- nice = -20;
- if (nice > 19)
- nice = 19;
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3903,7 +3124,7 @@ SYSCALL_DEFINE1(nice, int, increment)
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
@@ -3913,18 +3134,10 @@ int task_prio(const struct task_struct *p)
}
/**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- */
-int task_nice(const struct task_struct *p)
-{
- return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int idle_cpu(int cpu)
{
@@ -3947,6 +3160,8 @@ int idle_cpu(int cpu)
/**
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -3956,26 +3171,134 @@ struct task_struct *idle_task(int cpu)
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_vpid(pid) : current;
}
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ init_dl_task_timer(dl_se);
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
+}
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == -1) /* setparam */
+ policy = p->policy;
+
p->policy = policy;
- p->rt_priority = prio;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- /* we are holding p->pi_lock already */
- p->prio = rt_mutex_getprio(p);
- if (rt_prio(p->prio))
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * If we get here, there was no pi waiters boosting the
+ * task. It is safe to use the normal prio.
+ */
+ p->prio = normal_prio(p);
+
+ if (dl_prio(p->prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
- set_load_weight(p);
+}
+
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3988,19 +3311,20 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- if (cred->user->user_ns == pcred->user->user_ns)
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
- else
- match = false;
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
rcu_read_unlock();
return match;
}
-static int __sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user)
{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
struct rq *rq;
@@ -4014,31 +3338,40 @@ recheck:
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
} else {
- reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
- policy &= ~SCHED_RESET_ON_FORK;
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ if (policy != SCHED_DEADLINE &&
+ policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
policy != SCHED_IDLE)
return -EINVAL;
}
+ if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ return -EINVAL;
+
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if (rt_policy(policy) != (param->sched_priority != 0))
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (user && !capable(CAP_SYS_NICE)) {
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !can_nice(p, attr->sched_nice))
+ return -EPERM;
+ }
+
if (rt_policy(policy)) {
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
@@ -4048,17 +3381,26 @@ recheck:
return -EPERM;
/* can't increase priority */
- if (param->sched_priority > p->rt_priority &&
- param->sched_priority > rlim_rtprio)
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
return -EPERM;
}
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ return -EPERM;
+
/*
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- if (!can_nice(p, TASK_NICE(p)))
+ if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -4095,18 +3437,25 @@ recheck:
}
/*
- * If not changing anything there's no need to proceed further:
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
*/
- if (unlikely(policy == p->policy && (!rt_policy(policy) ||
- param->sched_priority == p->rt_priority))) {
-
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy))
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ task_rq_unlock(rq, p, &flags);
return 0;
}
+change:
-#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
/*
* Do not allow realtime tasks into groups that have no runtime
* assigned.
@@ -4117,8 +3466,24 @@ recheck:
task_rq_unlock(rq, p, &flags);
return -EPERM;
}
- }
#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, &p->cpus_allowed) ||
+ rq->rd->dl_bw.bw == 0) {
+ task_rq_unlock(rq, p, &flags);
+ return -EPERM;
+ }
+ }
+#endif
+ }
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -4126,6 +3491,35 @@ recheck:
task_rq_unlock(rq, p, &flags);
goto recheck;
}
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ task_rq_unlock(rq, p, &flags);
+ return -EBUSY;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Special case for priority boosted tasks.
+ *
+ * If the new priority is lower or equal (user space view)
+ * than the current (boosted) priority, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ if (rt_mutex_check_prio(p, newprio)) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -4133,16 +3527,18 @@ recheck:
if (running)
p->sched_class->put_prev_task(rq, p);
- p->sched_reset_on_fork = reset_on_fork;
-
- oldprio = p->prio;
prev_class = p->sched_class;
- __setscheduler(rq, p, policy, param->sched_priority);
+ __setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
+ if (on_rq) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -4152,21 +3548,49 @@ recheck:
return 0;
}
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /*
+ * Fixup the legacy SCHED_RESET_ON_FORK hack
+ */
+ if (policy & SCHED_RESET_ON_FORK) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check);
+}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
* @p: the task in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Return: 0 on success. An error code otherwise.
+ *
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, true);
+ return _sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
@@ -4177,11 +3601,13 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* current context has permission. For example, this is needed in
* stop_machine(): we create temporary high priority worker threads,
* but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, false);
+ return _sched_setscheduler(p, policy, param, false);
}
static int
@@ -4206,11 +3632,84 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
return retval;
}
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
struct sched_param __user *, param)
@@ -4226,6 +3725,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -4233,8 +3734,44 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
}
/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
*/
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
@@ -4261,10 +3798,13 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
* sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -4281,7 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -4296,19 +3837,103 @@ out_unlock:
return retval;
}
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ return -EFBIG;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, &attr);
+ else if (task_has_rt_policy(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
int retval;
- get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
return -ESRCH;
}
@@ -4316,6 +3941,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
get_task_struct(p);
rcu_read_unlock();
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
@@ -4325,15 +3954,39 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- goto out_unlock;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
+
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+#ifdef CONFIG_SMP
+ if (task_has_dl_policy(p)) {
+ const struct cpumask *span = task_rq(p)->rd->span;
+
+ if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+ retval = -EBUSY;
+ goto out_unlock;
+ }
+ }
+#endif
again:
retval = set_cpus_allowed_ptr(p, new_mask);
@@ -4355,7 +4008,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
return retval;
}
@@ -4375,6 +4027,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4398,7 +4052,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
- get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4411,12 +4064,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
- put_online_cpus();
return retval;
}
@@ -4426,6 +4078,8 @@ out_unlock:
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4460,6 +4114,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
*/
SYSCALL_DEFINE0(sched_yield)
{
@@ -4475,23 +4131,18 @@ SYSCALL_DEFINE0(sched_yield)
__release(rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
do_raw_spin_unlock(&rq->lock);
- preempt_enable_no_resched();
+ sched_preempt_enable_no_resched();
schedule();
return 0;
}
-static inline int should_resched(void)
-{
- return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
static void __cond_resched(void)
{
- add_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
__schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
@@ -4549,8 +4200,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
/**
* yield - yield the current processor to other threads.
*
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
*/
void __sched yield(void)
{
@@ -4569,34 +4236,46 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns true if we indeed boosted the target task.
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
*/
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
- bool yielded = 0;
+ int yielded = 0;
local_irq_save(flags);
rq = this_rq();
again:
p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
double_rq_lock(rq, p_rq);
- while (task_rq(p) != p_rq) {
+ if (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
goto again;
}
if (!curr->sched_class->yield_to_task)
- goto out;
+ goto out_unlock;
if (curr->sched_class != p->sched_class)
- goto out;
+ goto out_unlock;
if (task_running(p_rq, p) || p->state)
- goto out;
+ goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
@@ -4607,20 +4286,14 @@ again:
*/
if (preempt && rq != p_rq)
resched_task(p_rq->curr);
- } else {
- /*
- * We might have set it in task_yield_fair(), but are
- * not going to schedule(), so don't want to skip
- * the next update.
- */
- rq->skip_clock_update = 0;
}
-out:
+out_unlock:
double_rq_unlock(rq, p_rq);
+out_irq:
local_irq_restore(flags);
- if (yielded)
+ if (yielded > 0)
schedule();
return yielded;
@@ -4666,8 +4339,9 @@ long __sched io_schedule_timeout(long timeout)
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
@@ -4678,6 +4352,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
case SCHED_RR:
ret = MAX_USER_RT_PRIO-1;
break;
+ case SCHED_DEADLINE:
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
@@ -4691,8 +4366,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
* sys_sched_get_priority_min - return minimum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
@@ -4703,6 +4379,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
case SCHED_RR:
ret = 1;
break;
+ case SCHED_DEADLINE:
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
@@ -4718,6 +4395,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
*
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
*/
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
@@ -4743,7 +4423,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
goto out_unlock;
rq = task_rq_lock(p, &flags);
- time_slice = p->sched_class->get_rr_interval(rq, p);
+ time_slice = 0;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
@@ -4761,6 +4443,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
+ int ppid;
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4780,10 +4463,14 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ rcu_read_lock();
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+ task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
+ print_worker_info(KERN_INFO, p);
show_stack(p, NULL);
}
@@ -4822,7 +4509,7 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
{
idle->sched_class = &idle_sched_class;
}
@@ -4835,14 +4522,14 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
- __sched_fork(idle);
+ __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -4862,19 +4549,21 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
+ idle->on_rq = 1;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
- task_thread_info(idle)->preempt_count = 0;
+ init_idle_preempt_count(idle, cpu);
/*
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle, cpu);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
@@ -4887,7 +4576,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
/*
@@ -4930,11 +4619,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
- ret = -EINVAL;
- goto out;
- }
-
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
@@ -5006,6 +4690,54 @@ fail:
return ret;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+ struct migration_arg arg = { p, target_cpu };
+ int curr_cpu = task_cpu(p);
+
+ if (curr_cpu == target_cpu)
+ return 0;
+
+ if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ return -EINVAL;
+
+ /* TODO: This is not properly updating schedstats */
+
+ trace_sched_move_numa(p, curr_cpu, target_cpu);
+ return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+ struct rq *rq;
+ unsigned long flags;
+ bool on_rq, running;
+
+ rq = task_rq_lock(p, &flags);
+ on_rq = p->on_rq;
+ running = task_current(rq, p);
+
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+
+ p->numa_preferred_nid = nid;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq)
+ enqueue_task(rq, p, 0);
+ task_rq_unlock(rq, p, &flags);
+}
+#endif
+
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -5037,35 +4769,43 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
- if (mm != &init_mm)
+ if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
mmdrop(mm);
}
/*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
*/
-static void migrate_nr_uninterruptible(struct rq *rq_src)
+static void calc_load_migrate(struct rq *rq)
{
- struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
- rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
- rq_src->nr_uninterruptible = 0;
+ long delta = calc_load_fold_active(rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
}
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -5091,8 +4831,12 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
rq->stop = NULL;
- /* Ensure any throttled groups are reachable by pick_next_task */
- unthrottle_offline_cfs_rqs(rq);
+ /*
+ * put_prev_task() and pick_next_task() sched
+ * class method both need to have an up-to-date
+ * value of rq->clock[_task]
+ */
+ update_rq_clock(rq);
for ( ; ; ) {
/*
@@ -5102,7 +4846,7 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
- next = pick_next_task(rq);
+ next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5168,57 +4912,69 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler)
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(13);
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[11], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring);
- /* &table[12] is terminator */
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
return table;
}
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
@@ -5320,7 +5076,7 @@ static void set_rq_offline(struct rq *rq)
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int __cpuinit
+static int
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
@@ -5356,9 +5112,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_tasks(cpu);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ break;
- migrate_nr_uninterruptible(rq);
- calc_global_load_remove(rq);
+ case CPU_DEAD:
+ calc_load_migrate(rq);
break;
#endif
}
@@ -5373,16 +5130,25 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
* happens before everything else. This has to be lower priority than
* the notifier in the perf_event subsystem, though.
*/
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
.notifier_call = migration_call,
.priority = CPU_PRI_MIGRATION,
};
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+}
+
+static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
+ case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5391,16 +5157,34 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
}
}
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ unsigned long flags;
+ long cpu = (long)hcpu;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
+ set_cpu_active(cpu, false);
+
+ /* explicitly allow suspend */
+ if (!(action & CPU_TASKS_FROZEN)) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ bool overflow;
+ int cpus;
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ }
return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
}
+
+ return NOTIFY_DONE;
}
static int __init migration_init(void)
@@ -5429,15 +5213,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
{
- sched_domain_debug_enabled = 1;
+ sched_debug_enabled = 1;
return 0;
}
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
@@ -5477,10 +5266,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->sgp->power) {
+ /*
+ * Even though we initialize ->capacity to something semi-sane,
+ * we leave capacity_orig unset. This allows us to detect if
+ * domain iteration is still funny without causing /0 traps.
+ */
+ if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5490,7 +5283,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
@@ -5501,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->sgp->power != SCHED_POWER_SCALE) {
- printk(KERN_CONT " (cpu_power = %d)",
- group->sgp->power);
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
}
group = group->next;
@@ -5524,7 +5318,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_domain_debug_enabled)
+ if (!sched_debug_enabled)
return;
if (!sd) {
@@ -5545,6 +5339,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd)
@@ -5557,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5587,8 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES);
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5603,6 +5404,8 @@ static void free_rootdomain(struct rcu_head *rcu)
struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
@@ -5625,7 +5428,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rt yet then
+ * If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -5654,8 +5457,14 @@ static int init_rootdomain(struct root_domain *rd)
goto out;
if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
goto free_online;
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_dlo_mask;
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
@@ -5663,6 +5472,8 @@ static int init_rootdomain(struct root_domain *rd)
free_rto_mask:
free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
free_online:
free_cpumask_var(rd->online);
free_span:
@@ -5700,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5711,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
- if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
- kfree(sg->sgp);
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5730,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgp);
+ kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5754,22 +5565,39 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
*
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see ttwu_share_cache().
+ * two cpus are in the same cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
+ struct sched_domain *busy_sd = NULL;
int id = cpu;
+ int size = 1;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ busy_sd = sd->parent; /* sd_busy */
+ }
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
@@ -5792,6 +5620,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
@@ -5828,105 +5663,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
- int i, n, val, min_val, best_node = -1;
-
- min_val = INT_MAX;
-
- for (i = 0; i < nr_node_ids; i++) {
- /* Start at @node */
- n = (node + i) % nr_node_ids;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (node_isset(n, *used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node != -1)
- node_set(best_node, *used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
- nodemask_t used_nodes;
- int i;
-
- cpumask_clear(span);
- nodes_clear(used_nodes);
-
- cpumask_or(span, span, cpumask_of_node(node));
- node_set(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, &used_nodes);
- if (next_node < 0)
- break;
- cpumask_or(span, span, cpumask_of_node(next_node));
- }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
- lockdep_assert_held(&sched_domains_mutex);
-
- sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
- return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
- return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
- return cpumask_of_node(cpu_to_node(cpu));
-}
-
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
-struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
-};
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -5939,19 +5675,43 @@ enum s_alloc {
sa_none,
};
-struct sched_domain_topology_level;
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
-#define SDTL_OVERLAP 0x01
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
-struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- struct sd_data data;
-};
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
@@ -5971,6 +5731,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -5978,8 +5744,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -5988,10 +5752,25 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
- atomic_inc(&sg->sgp->ref);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ build_group_mask(sd, sg);
- if (cpumask_test_cpu(cpu, sg_span))
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->capacity_orig = sg->sgc->capacity;
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ group_balance_cpu(sg) == cpu)
groups = sg;
if (!first)
@@ -6021,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
- atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -6031,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -6047,7 +5826,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
get_group(cpu, sdd, &sd->groups);
atomic_inc(&sd->groups->ref);
- if (cpu != cpumask_first(sched_domain_span(sd)))
+ if (cpu != cpumask_first(span))
return 0;
lockdep_assert_held(&sched_domains_mutex);
@@ -6057,14 +5836,13 @@ build_sched_groups(struct sched_domain *sd, int cpu)
for_each_cpu(i, span) {
struct sched_group *sg;
- int group = get_group(i, sdd, &sg);
- int j;
+ int group, j;
if (cpumask_test_cpu(i, covered))
continue;
- cpumask_clear(sched_group_cpus(sg));
- sg->sgp->power = 0;
+ group = get_group(i, sdd, &sg);
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6086,36 +5864,31 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
*
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
*/
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
- WARN_ON(!sd || !sg);
+ WARN_ON(!sg);
do {
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
- update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -6123,48 +5896,13 @@ int __weak arch_sd_sibling_asym_packing(void)
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type) sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type) do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type) \
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6241,46 +5979,399 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
- *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl, int cpu)
{
- return topology_thread_cpumask(cpu);
-}
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
#endif
+ };
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
+
+ return sd;
+}
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
-#endif
- { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
- { sd_init_ALLNODES, cpu_allnodes_mask, },
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
int j;
- for (tl = sched_domain_topology; tl->init; tl++) {
+ for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6291,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
- sdd->sgp = alloc_percpu(struct sched_group_power *);
- if (!sdd->sgp)
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6312,14 +6403,16 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sg)
return -ENOMEM;
+ sg->next = sg;
+
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
- if (!sgp)
+ if (!sgc)
return -ENOMEM;
- *per_cpu_ptr(sdd->sgp, j) = sgp;
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6331,40 +6424,49 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sched_domain_topology_level *tl;
int j;
- for (tl = sched_domain_topology; tl->init; tl++) {
+ for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- kfree(*per_cpu_ptr(sdd->sg, j));
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
+ sdd->sd = NULL;
free_percpu(sdd->sg);
- free_percpu(sdd->sgp);
+ sdd->sg = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
}
}
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- struct s_data *d, const struct cpumask *cpu_map,
- struct sched_domain_attr *attr, struct sched_domain *child,
- int cpu)
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
+ sd->child = child;
}
- sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -6376,7 +6478,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state = sa_none;
+ enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;
@@ -6390,18 +6492,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++) {
- sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
-
- while (sd->child)
- sd = sd->child;
-
- *per_cpu_ptr(d.sd, i) = sd;
}
/* Build the groups for the domains */
@@ -6418,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
- /* Calculate CPU power for physical packages and nodes */
+ /* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
- init_sched_groups_power(i, sd);
+ init_sched_groups_capacity(i, sd);
}
}
@@ -6460,7 +6559,7 @@ static cpumask_var_t fallback_doms;
* cpu core maps. It is supposed to return 1 if the topology changed
* or 0 if it stayed the same.
*/
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
{
return 0;
}
@@ -6505,7 +6604,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- dattr_cur = NULL;
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
@@ -6597,8 +6695,9 @@ match1:
;
}
+ n = ndoms_cur;
if (doms_new == NULL) {
- ndoms_cur = 0;
+ n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
WARN_ON_ONCE(dattr_new);
@@ -6606,7 +6705,7 @@ match1:
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur && !new_topology; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
@@ -6630,125 +6729,66 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
- get_online_cpus();
-
- /* Destroy domains first to force the rebuild */
- partition_sched_domains(0, NULL, NULL);
-
- rebuild_sched_domains();
- put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
- unsigned int level = 0;
-
- if (sscanf(buf, "%u", &level) != 1)
- return -EINVAL;
-
- /*
- * level is always be positive so don't check for
- * level < POWERSAVINGS_BALANCE_NONE which is 0
- * What happens on 0 or 1 byte write,
- * need to check for count as well?
- */
-
- if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
- return -EINVAL;
-
- if (smt)
- sched_smt_power_savings = level;
- else
- sched_mc_power_savings = level;
-
- reinit_sched_domains();
-
- return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
- sched_mc_power_savings_show,
- sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
- sched_smt_power_savings_show,
- sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
- int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
- if (smt_capable())
- err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
- if (!err && mc_capable())
- err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
- return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -6758,21 +6798,24 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- get_online_cpus();
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- put_online_cpus();
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
- /* RT runtime code needs to handle some hotplug events */
- hotcpu_notifier(update_runtime, 0);
-
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -6782,6 +6825,7 @@ void __init sched_init_smp(void)
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
+ init_sched_dl_class();
}
#else
void __init sched_init_smp(void)
@@ -6800,10 +6844,15 @@ int in_sched_functions(unsigned long addr)
}
#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
struct task_group root_task_group;
+LIST_HEAD(task_groups);
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void)
{
@@ -6840,19 +6889,21 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
- per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ per_cpu(load_balance_mask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
}
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth,
+ global_rt_period(), global_rt_runtime());
+
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
-
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -6866,12 +6917,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
- root_cpuacct.cpustat = &kernel_cpustat;
- root_cpuacct.cpuusage = alloc_percpu(u64);
- /* Too early, not expected to fail */
- BUG_ON(!root_cpuacct.cpuusage);
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -6882,6 +6927,7 @@ void __init sched_init(void)
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
+ init_dl_rq(&rq->dl, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6910,7 +6956,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
- INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
@@ -6922,7 +6967,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_POWER_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -6931,10 +6976,17 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+
+ INIT_LIST_HEAD(&rq->cfs_tasks);
+
rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
rq->nohz_flags = 0;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = 0;
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -6946,10 +6998,6 @@ void __init sched_init(void)
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
-#ifdef CONFIG_RT_MUTEXES
- plist_head_init(&init_task.pi_waiters);
-#endif
-
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -6976,6 +7024,8 @@ void __init sched_init(void)
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
#endif
init_sched_fair_class();
@@ -6995,7 +7045,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -7013,6 +7064,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
}
EXPORT_SYMBOL(__might_sleep);
@@ -7022,13 +7080,16 @@ EXPORT_SYMBOL(__might_sleep);
static void normalize_task(struct rq *rq, struct task_struct *p)
{
const struct sched_class *prev_class = p->sched_class;
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ };
int old_prio = p->prio;
int on_rq;
on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
- __setscheduler(rq, p, SCHED_NORMAL, 0);
+ __setscheduler(rq, p, &attr);
if (on_rq) {
enqueue_task(rq, p, 0);
resched_task(rq->curr);
@@ -7058,12 +7119,12 @@ void normalize_rt_tasks(void)
p->se.statistics.block_start = 0;
#endif
- if (!rt_task(p)) {
+ if (!dl_task(p) && !rt_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
*/
- if (TASK_NICE(p) < 0 && p->mm)
+ if (task_nice(p) < 0 && p->mm)
set_user_nice(p, 0);
continue;
}
@@ -7098,6 +7159,8 @@ void normalize_rt_tasks(void)
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
*/
struct task_struct *curr_task(int cpu)
{
@@ -7145,7 +7208,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7157,6 +7219,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7166,12 +7239,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7184,6 +7251,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7195,9 +7268,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7207,6 +7277,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7221,6 +7292,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
@@ -7237,16 +7314,6 @@ void sched_move_task(struct task_struct *tsk)
}
#endif /* CONFIG_CGROUP_SCHED */
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
- if (runtime == RUNTIME_INF)
- return 1ULL << 20;
-
- return div64_u64(runtime << 20, period);
-}
-#endif
-
#ifdef CONFIG_RT_GROUP_SCHED
/*
* Ensure that the real time constraints are schedulable.
@@ -7375,7 +7442,7 @@ unlock:
return err;
}
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;
@@ -7387,7 +7454,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
@@ -7399,7 +7466,7 @@ long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
{
u64 rt_runtime, rt_period;
@@ -7412,7 +7479,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;
@@ -7420,24 +7487,13 @@ long sched_group_rt_period(struct task_group *tg)
do_div(rt_period_us, NSEC_PER_USEC);
return rt_period_us;
}
+#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
static int sched_rt_global_constraints(void)
{
- u64 runtime, period;
int ret = 0;
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- runtime = global_rt_runtime();
- period = global_rt_period();
-
- /*
- * Sanity check on the sysctl variables.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
ret = __rt_schedulable(NULL, 0, 0);
@@ -7447,7 +7503,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7460,17 +7516,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
static int sched_rt_global_constraints(void)
{
unsigned long flags;
- int i;
-
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- /*
- * There's always some RT tasks in the root group
- * -- migration, kstopmachine etc..
- */
- if (sysctl_sched_rt_runtime == 0)
- return -EBUSY;
+ int i, ret = 0;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
@@ -7482,17 +7528,91 @@ static int sched_rt_global_constraints(void)
}
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- return 0;
+ return ret;
}
#endif /* CONFIG_RT_GROUP_SCHED */
+static int sched_dl_global_constraints(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ int cpu, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ int cpu;
+ unsigned long flags;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ }
+}
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret;
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
+ int ret;
mutex_lock(&mutex);
old_period = sysctl_sched_rt_period;
@@ -7501,41 +7621,68 @@ int sched_rt_handler(struct ctl_table *table, int write,
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
ret = sched_rt_global_constraints();
- if (ret) {
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- } else {
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period =
- ns_to_ktime(global_rt_period());
- }
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
}
mutex_unlock(&mutex);
return ret;
}
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
#ifdef CONFIG_CGROUP_SCHED
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
- struct task_group, css);
+ return css ? container_of(css, struct task_group, css) : NULL;
}
static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct task_group *tg, *parent;
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
- if (!cgrp->parent) {
+ if (!parent) {
/* This is early initialization for the top cgroup */
return &root_task_group.css;
}
- parent = cgroup_tg(cgrp->parent);
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
@@ -7543,22 +7690,38 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
return &tg->css;
}
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_destroy_group(tg);
}
-static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
@@ -7569,18 +7732,18 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
return 0;
}
-static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, tset)
sched_move_task(task);
}
-static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -7594,15 +7757,16 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
return (u64) scale_load_down(tg->shares);
}
@@ -7646,7 +7810,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7655,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
/* restart the period timer (if active) to handle new period expiry */
if (runtime_enabled && cfs_b->timer_active) {
/* force a reprogram */
- cfs_b->timer_active = 0;
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, true);
}
raw_spin_unlock_irq(&cfs_b->lock);
@@ -7672,6 +7840,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
@@ -7724,26 +7894,28 @@ long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_quota(cgroup_tg(cgrp));
+ return tg_get_cfs_quota(css_tg(css));
}
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
- s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
{
- return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_period(cgroup_tg(cgrp));
+ return tg_get_cfs_period(css_tg(css));
}
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
{
- return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
struct cfs_schedulable_data {
@@ -7824,15 +7996,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+static int cpu_stats_show(struct seq_file *sf, void *v)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- cb->fill(cb, "nr_periods", cfs_b->nr_periods);
- cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
- cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+ seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+ seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+ seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
return 0;
}
@@ -7840,26 +8011,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
- s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
{
- return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ return sched_group_set_rt_runtime(css_tg(css), val);
}
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_runtime(cgroup_tg(cgrp));
+ return sched_group_rt_runtime(css_tg(css));
}
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
{
- return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
}
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_period(cgroup_tg(cgrp));
+ return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7884,7 +8057,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
- .read_map = cpu_stats_show,
+ .seq_show = cpu_stats_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -7899,247 +8072,25 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+ { } /* terminate */
};
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
-struct cgroup_subsys cpu_cgroup_subsys = {
- .name = "cpu",
- .create = cpu_cgroup_create,
- .destroy = cpu_cgroup_destroy,
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .populate = cpu_cgroup_populate,
- .subsys_id = cpu_cgroup_subsys_id,
+ .base_cftypes = cpu_files,
.early_init = 1,
};
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
- struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- struct cpuacct *ca;
-
- if (!cgrp->parent)
- return &root_cpuacct.css;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- goto out;
-
- ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage)
- goto out_free_ca;
-
- ca->cpustat = alloc_percpu(struct kernel_cpustat);
- if (!ca->cpustat)
- goto out_free_cpuusage;
-
- return &ca->css;
-
-out_free_cpuusage:
- free_percpu(ca->cpuusage);
-out_free_ca:
- kfree(ca);
-out:
- return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
-
- free_percpu(ca->cpustat);
- free_percpu(ca->cpuusage);
- kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- u64 data;
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
-#endif
-
- return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- u64 totalcpuusage = 0;
- int i;
-
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
- return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int err = 0;
- int i;
-
- if (reset) {
- err = -EINVAL;
- goto out;
- }
-
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
-
-out:
- return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
-{
- struct cpuacct *ca = cgroup_ca(cgroup);
- u64 percpu;
- int i;
-
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
- seq_printf(m, "%llu ", (unsigned long long) percpu);
- }
- seq_printf(m, "\n");
- return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int cpu;
- s64 val = 0;
-
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
- val = 0;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
- }
-
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
- return 0;
-}
-
-static struct cftype files[] = {
- {
- .name = "usage",
- .read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
- },
- {
- .name = "usage_percpu",
- .read_seq_string = cpuacct_percpu_seq_read,
- },
- {
- .name = "stat",
- .read_map = cpuacct_stats_show,
- },
-};
-
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+void dump_cpu_task(int cpu)
{
- return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
}
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
- struct cpuacct *ca;
- int cpu;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- cpu = task_cpu(tsk);
-
- rcu_read_lock();
-
- ca = task_ca(tsk);
-
- for (; ca; ca = parent_ca(ca)) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
- }
-
- rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
- .create = cpuacct_create,
- .destroy = cpuacct_destroy,
- .populate = cpuacct_populate,
- .subsys_id = cpuacct_subsys_id,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 00000000000..9cf350c94ec
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,283 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuacct, css) : NULL;
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ return css_ca(ca->css.parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuacct *ca;
+
+ if (!parent_css)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuacct *ca = css_ca(css);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuacct *ca = css_ca(css);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 reset)
+{
+ struct cpuacct *ca = css_ca(css);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+ return err;
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+ struct cpuacct *ca = css_ca(seq_css(sf));
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_percpu",
+ .seq_show = cpuacct_percpu_seq_show,
+ },
+ {
+ .name = "stat",
+ .seq_show = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int cpu;
+
+ cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
+ ca = task_ca(tsk);
+
+ while (true) {
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+
+ ca = parent_ca(ca);
+ if (!ca)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca != &root_cpuacct) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += val;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_cgrp_subsys = {
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .base_cftypes = files,
+ .early_init = 1,
+};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 00000000000..ed605624a5e
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 00000000000..bd95963dae8
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,229 @@
+/*
+ * kernel/sched/cpudl.c
+ *
+ * Global CPU deadline management
+ *
+ * Author: Juri Lelli <j.lelli@sssup.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "cpudeadline.h"
+
+static inline int parent(int i)
+{
+ return (i - 1) >> 1;
+}
+
+static inline int left_child(int i)
+{
+ return (i << 1) + 1;
+}
+
+static inline int right_child(int i)
+{
+ return (i << 1) + 2;
+}
+
+static inline int dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+static void cpudl_exchange(struct cpudl *cp, int a, int b)
+{
+ int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ int l, r, largest;
+
+ /* adapted from lib/prio_heap.c */
+ while(1) {
+ l = left_child(idx);
+ r = right_child(idx);
+ largest = idx;
+
+ if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
+ cp->elements[l].dl))
+ largest = l;
+ if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
+ cp->elements[r].dl))
+ largest = r;
+ if (largest == idx)
+ break;
+
+ /* Push idx down the heap one level and bump one up */
+ cpudl_exchange(cp, largest, idx);
+ idx = largest;
+ }
+}
+
+static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+{
+ WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+
+ if (dl_time_before(new_dl, cp->elements[idx].dl)) {
+ cp->elements[idx].dl = new_dl;
+ cpudl_heapify(cp, idx);
+ } else {
+ cp->elements[idx].dl = new_dl;
+ while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl)) {
+ cpudl_exchange(cp, idx, parent(idx));
+ idx = parent(idx);
+ }
+ }
+}
+
+static inline int cpudl_maximum(struct cpudl *cp)
+{
+ return cp->elements[0].cpu;
+}
+
+/*
+ * cpudl_find - find the best (later-dl) CPU in the system
+ * @cp: the cpudl max-heap context
+ * @p: the task
+ * @later_mask: a mask to fill in with the selected CPUs (or NULL)
+ *
+ * Returns: int - best CPU (heap maximum if suitable)
+ */
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask)
+{
+ int best_cpu = -1;
+ const struct sched_dl_entity *dl_se = &p->dl;
+
+ if (later_mask && cpumask_and(later_mask, cp->free_cpus,
+ &p->cpus_allowed) && cpumask_and(later_mask,
+ later_mask, cpu_active_mask)) {
+ best_cpu = cpumask_any(later_mask);
+ goto out;
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ best_cpu = cpudl_maximum(cp);
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
+ }
+
+out:
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+
+ return best_cpu;
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+{
+ int old_idx, new_cpu;
+ unsigned long flags;
+
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+ old_idx = cp->elements[cpu].idx;
+ if (!is_valid) {
+ /* remove item */
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if a rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ goto out;
+ }
+ new_cpu = cp->elements[cp->size - 1].cpu;
+ cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
+ cp->elements[old_idx].cpu = new_cpu;
+ cp->size--;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
+ while (old_idx > 0 && dl_time_before(
+ cp->elements[parent(old_idx)].dl,
+ cp->elements[old_idx].dl)) {
+ cpudl_exchange(cp, old_idx, parent(old_idx));
+ old_idx = parent(old_idx);
+ }
+ cpumask_set_cpu(cpu, cp->free_cpus);
+ cpudl_heapify(cp, old_idx);
+
+ goto out;
+ }
+
+ if (old_idx == IDX_INVALID) {
+ cp->size++;
+ cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].cpu = cpu;
+ cp->elements[cpu].idx = cp->size - 1;
+ cpudl_change_key(cp, cp->size - 1, dl);
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+ } else {
+ cpudl_change_key(cp, old_idx, dl);
+ }
+
+out:
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_init - initialize the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+int cpudl_init(struct cpudl *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+ raw_spin_lock_init(&cp->lock);
+ cp->size = 0;
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
+ cpumask_setall(cp->free_cpus);
+
+ return 0;
+}
+
+/*
+ * cpudl_cleanup - clean up the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+void cpudl_cleanup(struct cpudl *cp)
+{
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
+}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 00000000000..538c9796ad4
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_CPUDL_H
+#define _LINUX_CPUDL_H
+
+#include <linux/sched.h>
+
+#define IDX_INVALID -1
+
+struct cpudl_item {
+ u64 dl;
+ int cpu;
+ int idx;
+};
+
+struct cpudl {
+ raw_spinlock_t lock;
+ int size;
+ cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
+};
+
+
+#ifdef CONFIG_SMP
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+int cpudl_init(struct cpudl *cp);
+void cpudl_cleanup(struct cpudl *cp);
+#else
+#define cpudl_set(cp, cpu, dl) do { } while (0)
+#define cpudl_init() do { } while (0)
+#endif /* CONFIG_SMP */
+
+#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf66..981fcd7dc39 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,9 @@
*/
#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -60,16 +63,15 @@ static int convert_prio(int prio)
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
- * Returns: (int)bool - CPUs were found
+ * Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
{
- int idx = 0;
- int task_pri = convert_prio(p->prio);
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -137,9 +139,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
- int *currpri = &cp->cpu_to_pri[cpu];
- int oldpri = *currpri;
- int do_mb = 0;
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
newpri = convert_prio(newpri);
@@ -163,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* do a write memory barrier, and then update the count, to
* make sure the vector is visible when count is set.
*/
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&(vec)->count);
do_mb = 1;
}
@@ -183,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* the new priority vec.
*/
if (do_mb)
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/*
* When removing from the vector, we decrement the counter first
* do a memory barrier and then clear the mask.
*/
atomic_dec(&(vec)->count);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
cpumask_clear_cpu(cpu, vec->mask);
}
@@ -201,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
- * Returns: -ENOMEM if memory fails.
+ * Return: -ENOMEM on memory allocation failure.
*/
int cpupri_init(struct cpupri *cp)
{
@@ -217,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -235,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d75617349..6b033347fdf 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 00000000000..72fdf06ef86
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,838 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kernel_stat.h>
+#include <linux/static_key.h>
+#include <linux/context_tracking.h>
+#include "sched.h"
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in vtime_account, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/vtime_account on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+DEFINE_PER_CPU(u64, cpu_hardirq_time);
+DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void irqtime_account_irq(struct task_struct *curr)
+{
+ unsigned long flags;
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+
+ irq_time_write_end();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
+
+static int irqtime_account_hi_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime (0)
+
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
+
+ cpuacct_account_field(p, index, tmp);
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ int index;
+
+ /* Add user time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+ /* Add user time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for user time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ /* Add guest time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+ p->gtime += cputime;
+
+ /* Add guest time to cpustat. */
+ if (task_nice(p) > 0) {
+ cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ } else {
+ cpustat[CPUTIME_USER] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, int index)
+{
+ /* Add system time to process. */
+ p->stime += cputime;
+ p->stimescaled += cputime_scaled;
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for system time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+ cputime_t cputime, cputime_t cputime_scaled)
+{
+ int index;
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime, cputime_scaled);
+ return;
+ }
+
+ if (hardirq_count() - hardirq_offset)
+ index = CPUTIME_IRQ;
+ else if (in_serving_softirq())
+ index = CPUTIME_SOFTIRQ;
+ else
+ index = CPUTIME_SYSTEM;
+
+ __account_system_time(p, cputime, cputime_scaled, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct rq *rq = this_rq();
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ else
+ cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+}
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ u64 steal;
+ cputime_t steal_ct;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ /*
+ * cputime_t may be less precise than nsecs (eg: if it's
+ * based on jiffies). Lets cast the result to cputime
+ * granularity and account the rest on the next rounds.
+ */
+ steal_ct = nsecs_to_cputime(steal);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+
+ account_steal_time(steal_ct);
+ return steal_ct;
+ }
+#endif
+ return false;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
+ struct task_struct *t;
+
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ rcu_read_lock();
+ /* make sure we can trust tsk->thread_group list */
+ if (!likely(pid_alive(tsk)))
+ goto out;
+
+ t = tsk;
+ do {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ } while_each_thread(tsk, t);
+out:
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int ticks)
+{
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ if (steal_account_process_tick())
+ return;
+
+ cputime *= ticks;
+ scaled *= ticks;
+
+ if (irqtime_account_hi_update()) {
+ cpustat[CPUTIME_IRQ] += cputime;
+ } else if (irqtime_account_si_update()) {
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
+ } else if (user_tick) {
+ account_user_time(p, cputime, scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime, scaled);
+ } else {
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ struct rq *rq = this_rq();
+
+ irqtime_account_process_tick(current, 0, rq, ticks);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int nr_ticks) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_common_account_irq_enter(struct task_struct *tsk)
+{
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because the context tracking don't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ struct rq *rq = this_rq();
+
+ if (vtime_accounting_enabled())
+ return;
+
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq, 1);
+ return;
+ }
+
+ if (steal_account_process_tick())
+ return;
+
+ if (user_tick)
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+ one_jiffy_scaled);
+ else
+ account_idle_time(cputime_one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+ account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
+ account_idle_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+{
+ u64 scaled;
+
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime)
+ swap(rtime, stime);
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
+
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
+}
+
+/*
+ * Adjust tick based cputime random precision against scheduler
+ * runtime accounting.
+ */
+static void cputime_adjust(struct task_cputime *curr,
+ struct cputime *prev,
+ cputime_t *ut, cputime_t *st)
+{
+ cputime_t rtime, stime, utime;
+
+ /*
+ * Tick based cputime accounting depend on random scheduling
+ * timeslices of a task to be interrupted or not by the timer.
+ * Depending on these circumstances, the number of these interrupts
+ * may be over or under-optimistic, matching the real user and system
+ * cputime with a variable precision.
+ *
+ * Fix this by scaling these tick based values against the total
+ * runtime accounted by the CFS scheduler.
+ */
+ rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ stime = curr->stime;
+ utime = curr->utime;
+
+ if (utime == 0) {
+ stime = rtime;
+ } else if (stime == 0) {
+ utime = rtime;
+ } else {
+ cputime_t total = stime + utime;
+
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ }
+
+ /*
+ * If the tick based count grows faster than the scheduler one,
+ * the result of the scaling may go backward.
+ * Let's enforce monotonicity.
+ */
+ prev->stime = max(prev->stime, stime);
+ prev->utime = max(prev->utime, utime);
+
+out:
+ *ut = prev->utime;
+ *st = prev->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime = {
+ .sum_exec_runtime = p->se.sum_exec_runtime,
+ };
+
+ task_cputime(p, &cputime.utime, &cputime.stime);
+ cputime_adjust(&cputime, &p->prev_cputime, ut, st);
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
+}
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long clock;
+
+ clock = local_clock();
+ if (clock < tsk->vtime_snap)
+ return 0;
+
+ return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long delta = vtime_delta(tsk);
+
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ tsk->vtime_snap += delta;
+
+ /* CHECKME: always safe to convert nsecs to cputime? */
+ return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ if (context_tracking_in_user())
+ tsk->vtime_snap_whence = VTIME_USER;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+ cputime_t delta_cpu;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ delta_cpu = get_vtime_delta(tsk);
+ tsk->vtime_snap_whence = VTIME_SYS;
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ tsk->vtime_snap_whence = VTIME_USER;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_snap flush and update.
+ * That enforces a right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_idle_time(delta_cpu);
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ write_seqlock(&prev->vtime_seqlock);
+ prev->vtime_snap_whence = VTIME_SLEEPING;
+ write_sequnlock(&prev->vtime_seqlock);
+
+ write_seqlock(&current->vtime_seqlock);
+ current->vtime_snap_whence = VTIME_SYS;
+ current->vtime_snap = sched_clock_cpu(smp_processor_id());
+ write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t, int cpu)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ t->vtime_snap_whence = VTIME_SYS;
+ t->vtime_snap = sched_clock_cpu(cpu);
+ write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+ unsigned int seq;
+ cputime_t gtime;
+
+ do {
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ gtime = t->gtime;
+ if (t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+ cputime_t *u_dst, cputime_t *s_dst,
+ cputime_t *u_src, cputime_t *s_src,
+ cputime_t *udelta, cputime_t *sdelta)
+{
+ unsigned int seq;
+ unsigned long long delta;
+
+ do {
+ *udelta = 0;
+ *sdelta = 0;
+
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ if (u_dst)
+ *u_dst = *u_src;
+ if (s_dst)
+ *s_dst = *s_src;
+
+ /* Task is sleeping, nothing to add */
+ if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+
+ /*
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+ *udelta = delta;
+ } else {
+ if (t->vtime_snap_whence == VTIME_SYS)
+ *sdelta = delta;
+ }
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utime, stime, &t->utime,
+ &t->stime, &udelta, &sdelta);
+ if (utime)
+ *utime += udelta;
+ if (stime)
+ *stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utimescaled, stimescaled,
+ &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+ if (utimescaled)
+ *utimescaled += cputime_to_scaled(udelta);
+ if (stimescaled)
+ *stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 00000000000..fc4f98b1258
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1676 @@
+/*
+ * Deadline Scheduling Class (SCHED_DEADLINE)
+ *
+ * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
+ *
+ * Tasks that periodically executes their instances for less than their
+ * runtime won't miss any of their deadlines.
+ * Tasks that are not periodic or sporadic or that tries to execute more
+ * than their reserved bandwidth will be slowed down (and may potentially
+ * miss some of their deadlines), and won't affect any other task.
+ *
+ * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
+ * Juri Lelli <juri.lelli@gmail.com>,
+ * Michael Trimarchi <michael@amarulasolutions.com>,
+ * Fabio Checconi <fchecconi@gmail.com>
+ */
+#include "sched.h"
+
+#include <linux/slab.h>
+
+struct dl_bandwidth def_dl_bandwidth;
+
+static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
+{
+ return container_of(dl_se, struct task_struct, dl);
+}
+
+static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
+{
+ return container_of(dl_rq, struct rq, dl);
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->dl;
+}
+
+static inline int on_dl_rq(struct sched_dl_entity *dl_se)
+{
+ return !RB_EMPTY_NODE(&dl_se->rb_node);
+}
+
+static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ return dl_rq->rb_leftmost == &dl_se->rb_node;
+}
+
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+{
+ raw_spin_lock_init(&dl_b->dl_runtime_lock);
+ dl_b->dl_period = period;
+ dl_b->dl_runtime = runtime;
+}
+
+void init_dl_bw(struct dl_bw *dl_b)
+{
+ raw_spin_lock_init(&dl_b->lock);
+ raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
+ if (global_rt_runtime() == RUNTIME_INF)
+ dl_b->bw = -1;
+ else
+ dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
+ raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+ dl_b->total_bw = 0;
+}
+
+void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+{
+ dl_rq->rb_root = RB_ROOT;
+
+#ifdef CONFIG_SMP
+ /* zero means no -deadline tasks */
+ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
+
+ dl_rq->dl_nr_migratory = 0;
+ dl_rq->overloaded = 0;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT;
+#else
+ init_dl_bw(&dl_rq->dl_bw);
+#endif
+}
+
+#ifdef CONFIG_SMP
+
+static inline int dl_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->dlo_count);
+}
+
+static inline void dl_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
+ /*
+ * Must be visible before the overload count is
+ * set (as in sched_rt.c).
+ *
+ * Matched by the barrier in pull_dl_task().
+ */
+ smp_wmb();
+ atomic_inc(&rq->rd->dlo_count);
+}
+
+static inline void dl_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ atomic_dec(&rq->rd->dlo_count);
+ cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
+}
+
+static void update_dl_migration(struct dl_rq *dl_rq)
+{
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+ if (!dl_rq->overloaded) {
+ dl_set_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 1;
+ }
+ } else if (dl_rq->overloaded) {
+ dl_clear_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 0;
+ }
+}
+
+static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory++;
+
+ update_dl_migration(dl_rq);
+}
+
+static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory--;
+
+ update_dl_migration(dl_rq);
+}
+
+/*
+ * The list of pushable -deadline task is not a plist, like in
+ * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
+ */
+static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct task_struct *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct task_struct,
+ pushable_dl_tasks);
+ if (dl_entity_preempt(&p->dl, &entry->dl))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+
+ rb_link_node(&p->pushable_dl_tasks, parent, link);
+ rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+}
+
+static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
+ return;
+
+ if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&p->pushable_dl_tasks);
+ dl_rq->pushable_dl_tasks_leftmost = next_node;
+ }
+
+ rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
+}
+
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+}
+
+static int push_dl_task(struct rq *rq);
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return dl_task(prev);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+ rq->post_schedule = has_pushable_dl_tasks(rq);
+}
+
+#else
+
+static inline
+void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline
+void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_dl_task(struct rq *rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags);
+
+/*
+ * We are being explicitly informed that a new instance is starting,
+ * and this means that:
+ * - the absolute deadline of the entity has to be placed at
+ * current time + relative deadline;
+ * - the runtime of the entity has to be set to the maximum value.
+ *
+ * The capability of specifying such event is useful whenever a -deadline
+ * entity wants to (try to!) synchronize its behaviour with the scheduler's
+ * one, and to (try to!) reconcile itself with its own scheduling
+ * parameters.
+ */
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+
+ /*
+ * We use the regular wall clock time to set deadlines in the
+ * future; in fact, we must consider execution overheads (time
+ * spent on hardirq context, etc.).
+ */
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ dl_se->dl_new = 0;
+}
+
+/*
+ * Pure Earliest Deadline First (EDF) scheduling does not deal with the
+ * possibility of a entity lasting more than what it declared, and thus
+ * exhausting its runtime.
+ *
+ * Here we are interested in making runtime overrun possible, but we do
+ * not want a entity which is misbehaving to affect the scheduling of all
+ * other entities.
+ * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
+ * is used, in order to confine each entity within its own bandwidth.
+ *
+ * This function deals exactly with that, and ensures that when the runtime
+ * of a entity is replenished, its deadline is also postponed. That ensures
+ * the overrunning entity can't interfere with other entity in the system and
+ * can't make them miss their deadlines. Reasons why this kind of overruns
+ * could happen are, typically, a entity voluntarily trying to overcome its
+ * runtime, or it just underestimated it during sched_setscheduler_ex().
+ */
+static void replenish_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ BUG_ON(pi_se->dl_runtime <= 0);
+
+ /*
+ * This could be the case for a !-dl task that is boosted.
+ * Just go with full inherited parameters.
+ */
+ if (dl_se->dl_deadline == 0) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+
+ /*
+ * We keep moving the deadline away until we get some
+ * available runtime for the entity. This ensures correct
+ * handling of situations where the runtime overrun is
+ * arbitrary large.
+ */
+ while (dl_se->runtime <= 0) {
+ dl_se->deadline += pi_se->dl_period;
+ dl_se->runtime += pi_se->dl_runtime;
+ }
+
+ /*
+ * At this point, the deadline really should be "in
+ * the future" with respect to rq->clock. If it's
+ * not, we are, for some reason, lagging too much!
+ * Anyway, after having warn userspace abut that,
+ * we still try to keep the things running by
+ * resetting the deadline and the budget of the
+ * entity.
+ */
+ if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
+ printk_deferred_once("sched: DL replenish lagged to much\n");
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+}
+
+/*
+ * Here we check if --at time t-- an entity (which is probably being
+ * [re]activated or, in general, enqueued) can use its remaining runtime
+ * and its current deadline _without_ exceeding the bandwidth it is
+ * assigned (function returns true if it can't). We are in fact applying
+ * one of the CBS rules: when a task wakes up, if the residual runtime
+ * over residual deadline fits within the allocated bandwidth, then we
+ * can keep the current (absolute) deadline and residual budget without
+ * disrupting the schedulability of the system. Otherwise, we should
+ * refill the runtime and set the deadline a period in the future,
+ * because keeping the current (absolute) deadline of the task would
+ * result in breaking guarantees promised to other tasks (refer to
+ * Documentation/scheduler/sched-deadline.txt for more informations).
+ *
+ * This function returns true if:
+ *
+ * runtime / (deadline - t) > dl_runtime / dl_period ,
+ *
+ * IOW we can't recycle current parameters.
+ *
+ * Notice that the bandwidth check is done against the period. For
+ * task with deadline equal to period this is the same of using
+ * dl_deadline instead of dl_period in the equation above.
+ */
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, u64 t)
+{
+ u64 left, right;
+
+ /*
+ * left and right are the two sides of the equation above,
+ * after a bit of shuffling to use multiplications instead
+ * of divisions.
+ *
+ * Note that none of the time values involved in the two
+ * multiplications are absolute: dl_deadline and dl_runtime
+ * are the relative deadline and the maximum runtime of each
+ * instance, runtime is the runtime left for the last instance
+ * and (deadline - t), since t is rq->clock, is the time left
+ * to the (absolute) deadline. Even if overflowing the u64 type
+ * is very unlikely to occur in both cases, here we scale down
+ * as we want to avoid that risk at all. Scaling down by 10
+ * means that we reduce granularity to 1us. We are fine with it,
+ * since this is only a true/false check and, anyway, thinking
+ * of anything below microseconds resolution is actually fiction
+ * (but still we want to give the user that illusion >;).
+ */
+ left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ right = ((dl_se->deadline - t) >> DL_SCALE) *
+ (pi_se->dl_runtime >> DL_SCALE);
+
+ return dl_time_before(right, left);
+}
+
+/*
+ * When a -deadline entity is queued back on the runqueue, its runtime and
+ * deadline might need updating.
+ *
+ * The policy here is that we update the deadline of the entity only if:
+ * - the current deadline is in the past,
+ * - using the remaining runtime with the current deadline would make
+ * the entity exceed its bandwidth.
+ */
+static void update_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * The arrival of a new instance needs special treatment, i.e.,
+ * the actual scheduling parameters have to be "renewed".
+ */
+ if (dl_se->dl_new) {
+ setup_new_dl_entity(dl_se, pi_se);
+ return;
+ }
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
+ dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+}
+
+/*
+ * If the entity depleted all its runtime, and if we want it to sleep
+ * while waiting for some new execution time to become available, we
+ * set the bandwidth enforcement timer to the replenishment instant
+ * and try to activate it.
+ *
+ * Notice that it is important for the caller to know if the timer
+ * actually started or not (i.e., the replenishment instant is in
+ * the future or in the past).
+ */
+static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ ktime_t now, act;
+ ktime_t soft, hard;
+ unsigned long range;
+ s64 delta;
+
+ if (boosted)
+ return 0;
+ /*
+ * We want the timer to fire at the deadline, but considering
+ * that it is actually coming from rq->clock and not from
+ * hrtimer's time base reading.
+ */
+ act = ns_to_ktime(dl_se->deadline);
+ now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ delta = ktime_to_ns(now) - rq_clock(rq);
+ act = ktime_add_ns(act, delta);
+
+ /*
+ * If the expiry time already passed, e.g., because the value
+ * chosen as the deadline is too small, don't even try to
+ * start the timer in the past!
+ */
+ if (ktime_us_delta(act, now) < 0)
+ return 0;
+
+ hrtimer_set_expires(&dl_se->dl_timer, act);
+
+ soft = hrtimer_get_softexpires(&dl_se->dl_timer);
+ hard = hrtimer_get_expires(&dl_se->dl_timer);
+ range = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
+ range, HRTIMER_MODE_ABS, 0);
+
+ return hrtimer_active(&dl_se->dl_timer);
+}
+
+/*
+ * This is the bandwidth enforcement timer callback. If here, we know
+ * a task is not on its dl_rq, since the fact that the timer was running
+ * means the task is throttled and needs a runtime replenishment.
+ *
+ * However, what we actually do depends on the fact the task is active,
+ * (it is on its rq) or has been removed from there by a call to
+ * dequeue_task_dl(). In the former case we must issue the runtime
+ * replenishment and add the task back to the dl_rq; in the latter, we just
+ * do nothing but clearing dl_throttled, so that runtime and deadline
+ * updating (and the queueing back to dl_rq) will be done by the
+ * next call to enqueue_task_dl().
+ */
+static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ dl_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq;
+again:
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+
+ if (rq != task_rq(p)) {
+ /* Task was moved, retrying. */
+ raw_spin_unlock(&rq->lock);
+ goto again;
+ }
+
+ /*
+ * We need to take care of a possible races here. In fact, the
+ * task might have changed its scheduling policy to something
+ * different from SCHED_DEADLINE or changed its reservation
+ * parameters (through sched_setattr()).
+ */
+ if (!dl_task(p) || dl_se->dl_new)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ if (p->on_rq) {
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (task_has_dl_policy(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_task(rq->curr);
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
+#endif
+ }
+unlock:
+ raw_spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->dl_timer;
+
+ if (hrtimer_active(timer)) {
+ hrtimer_try_to_cancel(timer);
+ return;
+ }
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = dl_task_timer;
+}
+
+static
+int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
+ int rorun = dl_se->runtime <= 0;
+
+ if (!rorun && !dmiss)
+ return 0;
+
+ /*
+ * If we are beyond our current deadline and we are still
+ * executing, then we have already used some of the runtime of
+ * the next instance. Thus, if we do not account that, we are
+ * stealing bandwidth from the system at each deadline miss!
+ */
+ if (dmiss) {
+ dl_se->runtime = rorun ? dl_se->runtime : 0;
+ dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
+ }
+
+ return 1;
+}
+
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ u64 delta_exec;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
+
+ sched_rt_avg_update(rq, delta_exec);
+
+ dl_se->runtime -= delta_exec;
+ if (dl_runtime_exceeded(rq, dl_se)) {
+ __dequeue_task_dl(rq, curr, 0);
+ if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
+ dl_se->dl_throttled = 1;
+ else
+ enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+
+ if (!is_leftmost(curr, &rq->dl))
+ resched_task(curr);
+ }
+
+ /*
+ * Because -- for now -- we share the rt bandwidth, we need to
+ * account our runtime there too, otherwise actual rt tasks
+ * would be able to exceed the shared quota.
+ *
+ * Account to the root rt group for now.
+ *
+ * The solution we're working towards is having the RT groups scheduled
+ * using deadline servers -- however there's a few nasties to figure
+ * out before that can happen.
+ */
+ if (rt_bandwidth_enabled()) {
+ struct rt_rq *rt_rq = &rq->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We'll let actual RT tasks worry about the overflow here, we
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
+ */
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
+
+static inline u64 next_deadline(struct rq *rq)
+{
+ struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
+
+ if (next && dl_prio(next->prio))
+ return next->dl.deadline;
+ else
+ return 0;
+}
+
+static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ if (dl_rq->earliest_dl.curr == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ /*
+ * If the dl_rq had no -deadline tasks, or if the new task
+ * has shorter deadline than the current one on dl_rq, we
+ * know that the previous earliest becomes our next earliest,
+ * as the new task becomes the earliest itself.
+ */
+ dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
+ dl_rq->earliest_dl.curr = deadline;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+ } else if (dl_rq->earliest_dl.next == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.next)) {
+ /*
+ * On the other hand, if the new -deadline task has a
+ * a later deadline than the earliest one on dl_rq, but
+ * it is earlier than the next (if any), we must
+ * recompute the next-earliest.
+ */
+ dl_rq->earliest_dl.next = next_deadline(rq);
+ }
+}
+
+static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * Since we may have removed our earliest (and/or next earliest)
+ * task we must recompute them.
+ */
+ if (!dl_rq->dl_nr_running) {
+ dl_rq->earliest_dl.curr = 0;
+ dl_rq->earliest_dl.next = 0;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ } else {
+ struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct sched_dl_entity *entry;
+
+ entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
+ dl_rq->earliest_dl.curr = entry->deadline;
+ dl_rq->earliest_dl.next = next_deadline(rq);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ }
+}
+
+#else
+
+static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+
+#endif /* CONFIG_SMP */
+
+static inline
+void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+ u64 deadline = dl_se->deadline;
+
+ WARN_ON(!dl_prio(prio));
+ dl_rq->dl_nr_running++;
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ inc_dl_deadline(dl_rq, deadline);
+ inc_dl_migration(dl_se, dl_rq);
+}
+
+static inline
+void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+
+ WARN_ON(!dl_prio(prio));
+ WARN_ON(!dl_rq->dl_nr_running);
+ dl_rq->dl_nr_running--;
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ dec_dl_deadline(dl_rq, dl_se->deadline);
+ dec_dl_migration(dl_se, dl_rq);
+}
+
+static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_dl_entity *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_dl_entity, rb_node);
+ if (dl_time_before(dl_se->deadline, entry->deadline))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->rb_leftmost = &dl_se->rb_node;
+
+ rb_link_node(&dl_se->rb_node, parent, link);
+ rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+
+ inc_dl_tasks(dl_se, dl_rq);
+}
+
+static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ if (RB_EMPTY_NODE(&dl_se->rb_node))
+ return;
+
+ if (dl_rq->rb_leftmost == &dl_se->rb_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&dl_se->rb_node);
+ dl_rq->rb_leftmost = next_node;
+ }
+
+ rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ RB_CLEAR_NODE(&dl_se->rb_node);
+
+ dec_dl_tasks(dl_se, dl_rq);
+}
+
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, int flags)
+{
+ BUG_ON(on_dl_rq(dl_se));
+
+ /*
+ * If this is a wakeup or a new instance, the scheduling
+ * parameters of the task might need updating. Otherwise,
+ * we want a replenishment of its runtime.
+ */
+ if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
+ else
+ update_dl_entity(dl_se, pi_se);
+
+ __enqueue_dl_entity(dl_se);
+}
+
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ __dequeue_dl_entity(dl_se);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct sched_dl_entity *pi_se = &p->dl;
+
+ /*
+ * Use the scheduling parameters of the top pi-waiter
+ * task if we have one and its (relative) deadline is
+ * smaller than our one... OTW we keep our runtime and
+ * deadline.
+ */
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ pi_se = &pi_task->dl;
+
+ /*
+ * If p is throttled, we do nothing. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ */
+ if (p->dl.dl_throttled)
+ return;
+
+ enqueue_dl_entity(&p->dl, pi_se, flags);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ dequeue_dl_entity(&p->dl);
+ dequeue_pushable_dl_task(rq, p);
+}
+
+static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_curr_dl(rq);
+ __dequeue_task_dl(rq, p, flags);
+}
+
+/*
+ * Yield task semantic for -deadline tasks is:
+ *
+ * get off from the CPU until our next instance, with
+ * a new runtime. This is of little use now, since we
+ * don't have a bandwidth reclaiming mechanism. Anyway,
+ * bandwidth reclaiming is planned for the future, and
+ * yield_task_dl will indicate that some spare budget
+ * is available for other task instances to use it.
+ */
+static void yield_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ /*
+ * We make the task go to sleep until its current deadline by
+ * forcing its runtime to zero. This way, update_curr_dl() stops
+ * it and the bandwidth timer will wake it up and will give it
+ * new scheduling parameters (thanks to dl_yielded=1).
+ */
+ if (p->dl.runtime > 0) {
+ rq->curr->dl.dl_yielded = 1;
+ p->dl.runtime = 0;
+ }
+ update_curr_dl(rq);
+}
+
+#ifdef CONFIG_SMP
+
+static int find_later_rq(struct task_struct *task);
+
+static int
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ struct task_struct *curr;
+ struct rq *rq;
+
+ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
+ /*
+ * If we are dealing with a -deadline task, we must
+ * decide where to wake it up.
+ * If it has a later deadline and the current task
+ * on this rq can't move (provided the waking task
+ * can!) we prefer to send it somewhere else. On the
+ * other hand, if it has a shorter deadline, we
+ * try to make it stay here, it might be important.
+ */
+ if (unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ (p->nr_cpus_allowed > 1)) {
+ int target = find_later_rq(p);
+
+ if (target != -1)
+ cpu = target;
+ }
+ rcu_read_unlock();
+
+out:
+ return cpu;
+}
+
+static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ return;
+
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
+ if (p->nr_cpus_allowed != 1 &&
+ cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ return;
+
+ resched_task(rq->curr);
+}
+
+static int pull_dl_task(struct rq *this_rq);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Only called when both the current and waking task are -deadline
+ * tasks.
+ */
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ resched_task(rq->curr);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * In the unlikely case current and p have the same deadline
+ * let us try to decide what's the best thing to do...
+ */
+ if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ !test_tsk_need_resched(rq->curr))
+ check_preempt_equal_dl(rq, p);
+#endif /* CONFIG_SMP */
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+ s64 delta = p->dl.dl_runtime - p->dl.runtime;
+
+ if (delta > 10000)
+ hrtick_start(rq, p->dl.runtime);
+}
+#endif
+
+static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
+ struct dl_rq *dl_rq)
+{
+ struct rb_node *left = dl_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_dl_entity, rb_node);
+}
+
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p;
+ struct dl_rq *dl_rq;
+
+ dl_rq = &rq->dl;
+
+ if (need_pull_dl_task(rq, prev)) {
+ pull_dl_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && rq->stop->on_rq)
+ return RETRY_TASK;
+ }
+
+ /*
+ * When prev is DL, we may throttle it in put_prev_task().
+ * So, we update time before we check for dl_nr_running.
+ */
+ if (prev->sched_class == &dl_sched_class)
+ update_curr_dl(rq);
+
+ if (unlikely(!dl_rq->dl_nr_running))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ dl_se = pick_next_dl_entity(rq, dl_rq);
+ BUG_ON(!dl_se);
+
+ p = dl_task_of(dl_se);
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* Running task will never be pushed. */
+ dequeue_pushable_dl_task(rq, p);
+
+#ifdef CONFIG_SCHED_HRTICK
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+#endif
+
+ set_post_schedule(rq);
+
+ return p;
+}
+
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+{
+ update_curr_dl(rq);
+
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
+{
+ update_curr_dl(rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+ start_hrtick_dl(rq, p);
+#endif
+}
+
+static void task_fork_dl(struct task_struct *p)
+{
+ /*
+ * SCHED_DEADLINE tasks cannot fork and this is achieved through
+ * sched_fork()
+ */
+}
+
+static void task_dead_dl(struct task_struct *p)
+{
+ struct hrtimer *timer = &p->dl.dl_timer;
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ /*
+ * Since we are TASK_DEAD we won't slip out of the domain!
+ */
+ raw_spin_lock_irq(&dl_b->lock);
+ dl_b->total_bw -= p->dl.dl_bw;
+ raw_spin_unlock_irq(&dl_b->lock);
+
+ hrtimer_cancel(timer);
+}
+
+static void set_curr_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* You can't push away the running task */
+ dequeue_pushable_dl_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define DL_MAX_TRIES 3
+
+static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
+ (p->nr_cpus_allowed > 1))
+ return 1;
+
+ return 0;
+}
+
+/* Returns the second earliest -deadline task, NULL otherwise */
+static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.rb_leftmost;
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p = NULL;
+
+next_node:
+ next_node = rb_next(next_node);
+ if (next_node) {
+ dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
+ p = dl_task_of(dl_se);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ goto next_node;
+ }
+
+ return NULL;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
+
+static int find_later_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
+ int this_cpu = smp_processor_id();
+ int best_cpu, cpu = task_cpu(task);
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!later_mask))
+ return -1;
+
+ if (task->nr_cpus_allowed == 1)
+ return -1;
+
+ best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
+ task, later_mask);
+ if (best_cpu == -1)
+ return -1;
+
+ /*
+ * If we are here, some target has been found,
+ * the most suitable of which is cached in best_cpu.
+ * This is, among the runqueues where the current tasks
+ * have later deadlines than the task's one, the rq
+ * with the latest possible one.
+ *
+ * Now we check how well this matches with task's
+ * affinity and system topology.
+ *
+ * The last cpu where the task run is our first
+ * guess, since it is most likely cache-hot there.
+ */
+ if (cpumask_test_cpu(cpu, later_mask))
+ return cpu;
+ /*
+ * Check if this_cpu is to be skipped (i.e., it is
+ * not in the mask) or not.
+ */
+ if (!cpumask_test_cpu(this_cpu, later_mask))
+ this_cpu = -1;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+
+ /*
+ * If possible, preempting this_cpu is
+ * cheaper than migrating.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ /*
+ * Last chance: if best_cpu is valid and is
+ * in the mask, that becomes our choice.
+ */
+ if (best_cpu < nr_cpu_ids &&
+ cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * At this point, all our guesses failed, we just return
+ * 'something', and let the caller sort the things out.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any(later_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ return -1;
+}
+
+/* Locks the rq it finds */
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *later_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < DL_MAX_TRIES; tries++) {
+ cpu = find_later_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ later_rq = cpu_rq(cpu);
+
+ /* Retry if something changed. */
+ if (double_lock_balance(rq, later_rq)) {
+ if (unlikely(task_rq(task) != rq ||
+ !cpumask_test_cpu(later_rq->cpu,
+ &task->cpus_allowed) ||
+ task_running(rq, task) || !task->on_rq)) {
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ break;
+ }
+ }
+
+ /*
+ * If the rq we found has no -deadline task, or
+ * its earliest one has a later deadline than our
+ * task, the rq is a good one.
+ */
+ if (!later_rq->dl.dl_nr_running ||
+ dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr))
+ break;
+
+ /* Otherwise we try again. */
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ }
+
+ return later_rq;
+}
+
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ struct task_struct, pushable_dl_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!p->on_rq);
+ BUG_ON(!dl_task(p));
+
+ return p;
+}
+
+/*
+ * See if the non running -deadline tasks on this rq
+ * can be sent to some other CPU where they can preempt
+ * and start executing.
+ */
+static int push_dl_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *later_rq;
+
+ if (!rq->dl.overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_dl_task(rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * If next_task preempts rq->curr, and rq->curr
+ * can move away, it makes sense to just reschedule
+ * without going further in pushing next_task.
+ */
+ if (dl_task(rq->curr) &&
+ dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ rq->curr->nr_cpus_allowed > 1) {
+ resched_task(rq->curr);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* Will lock the rq it'll find */
+ later_rq = find_lock_later_rq(next_task, rq);
+ if (!later_rq) {
+ struct task_struct *task;
+
+ /*
+ * We must check all this again, since
+ * find_lock_later_rq releases rq->lock and it is
+ * then possible that next_task has migrated.
+ */
+ task = pick_next_pushable_dl_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * The task is still there. We don't try
+ * again, some other cpu will pull it when ready.
+ */
+ dequeue_pushable_dl_task(rq, next_task);
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks */
+ goto out;
+
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, later_rq->cpu);
+ activate_task(later_rq, next_task, 0);
+
+ resched_task(later_rq->curr);
+
+ double_unlock_balance(rq, later_rq);
+
+out:
+ put_task_struct(next_task);
+
+ return 1;
+}
+
+static void push_dl_tasks(struct rq *rq)
+{
+ /* Terminates as it moves a -deadline task */
+ while (push_dl_task(rq))
+ ;
+}
+
+static int pull_dl_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct task_struct *p;
+ struct rq *src_rq;
+ u64 dmin = LONG_MAX;
+
+ if (likely(!dl_overloaded(this_rq)))
+ return 0;
+
+ /*
+ * Match the barrier from dl_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the dlo_mask bit.
+ */
+ smp_rmb();
+
+ for_each_cpu(cpu, this_rq->rd->dlo_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+
+ /*
+ * It looks racy, abd it is! However, as in sched_rt.c,
+ * we are fine with this.
+ */
+ if (this_rq->dl.dl_nr_running &&
+ dl_time_before(this_rq->dl.earliest_dl.curr,
+ src_rq->dl.earliest_dl.next))
+ continue;
+
+ /* Might drop this_rq->lock */
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * If there are no more pullable tasks on the
+ * rq, we're done with it.
+ */
+ if (src_rq->dl.dl_nr_running <= 1)
+ goto skip;
+
+ p = pick_next_earliest_dl_task(src_rq, this_cpu);
+
+ /*
+ * We found a task to be pulled if:
+ * - it preempts our current (if there's one),
+ * - it will preempt the last one we pulled (if any).
+ */
+ if (p && dl_time_before(p->dl.deadline, dmin) &&
+ (!this_rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ this_rq->dl.earliest_dl.curr))) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!p->on_rq);
+
+ /*
+ * Then we pull iff p has actually an earlier
+ * deadline than the current task of its runqueue.
+ */
+ if (dl_time_before(p->dl.deadline,
+ src_rq->curr->dl.deadline))
+ goto skip;
+
+ ret = 1;
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+
+ /* Is there any other task even earlier? */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ return ret;
+}
+
+static void post_schedule_dl(struct rq *rq)
+{
+ push_dl_tasks(rq);
+}
+
+/*
+ * Since the task is not running and a reschedule is not going to happen
+ * anytime soon on its runqueue, we try pushing it away now.
+ */
+static void task_woken_dl(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ has_pushable_dl_tasks(rq) &&
+ p->nr_cpus_allowed > 1 &&
+ dl_task(rq->curr) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ push_dl_tasks(rq);
+ }
+}
+
+static void set_cpus_allowed_dl(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ struct rq *rq;
+ int weight;
+
+ BUG_ON(!dl_task(p));
+
+ /*
+ * Update only if the task is actually running (i.e.,
+ * it is on the rq AND it is not throttled).
+ */
+ if (!on_dl_rq(&p->dl))
+ return;
+
+ weight = cpumask_weight(new_mask);
+
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
+ return;
+
+ rq = task_rq(p);
+
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_dl_task(rq, p);
+ BUG_ON(!rq->dl.dl_nr_migratory);
+ rq->dl.dl_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_dl_task(rq, p);
+ rq->dl.dl_nr_migratory++;
+ }
+
+ update_dl_migration(&rq->dl);
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_set_overload(rq);
+
+ if (rq->dl.dl_nr_running > 0)
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_clear_overload(rq);
+
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+}
+
+void init_sched_dl_class(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
+ GFP_KERNEL, cpu_to_node(i));
+}
+
+#endif /* CONFIG_SMP */
+
+static void switched_from_dl(struct rq *rq, struct task_struct *p)
+{
+ if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
+ hrtimer_try_to_cancel(&p->dl.dl_timer);
+
+#ifdef CONFIG_SMP
+ /*
+ * Since this might be the only -deadline task on the rq,
+ * this is the right place to try to pull some other one
+ * from an overloaded cpu, if any.
+ */
+ if (!rq->dl.dl_nr_running)
+ pull_dl_task(rq);
+#endif
+}
+
+/*
+ * When switching to -deadline, we may overload the rq, then
+ * we try to push someone off, if possible.
+ */
+static void switched_to_dl(struct rq *rq, struct task_struct *p)
+{
+ int check_resched = 1;
+
+ /*
+ * If p is throttled, don't consider the possibility
+ * of preempting rq->curr, the check will be done right
+ * after its runtime will get replenished.
+ */
+ if (unlikely(p->dl.dl_throttled))
+ return;
+
+ if (p->on_rq && rq->curr != p) {
+#ifdef CONFIG_SMP
+ if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ /* Only reschedule if pushing failed */
+ check_resched = 0;
+#endif /* CONFIG_SMP */
+ if (check_resched && task_has_dl_policy(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ }
+}
+
+/*
+ * If the scheduling parameters of a -deadline task changed,
+ * a push or pull operation might be needed.
+ */
+static void prio_changed_dl(struct rq *rq, struct task_struct *p,
+ int oldprio)
+{
+ if (p->on_rq || rq->curr == p) {
+#ifdef CONFIG_SMP
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ pull_dl_task(rq);
+
+ /*
+ * If we now have a earlier deadline task than p,
+ * then reschedule, provided p is still on this
+ * runqueue.
+ */
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
+ rq->curr == p)
+ resched_task(p);
+#else
+ /*
+ * Again, we don't know if p has a earlier
+ * or later deadline, so let's blindly set a
+ * (maybe not needed) rescheduling point.
+ */
+ resched_task(p);
+#endif /* CONFIG_SMP */
+ } else
+ switched_to_dl(rq, p);
+}
+
+const struct sched_class dl_sched_class = {
+ .next = &rt_sched_class,
+ .enqueue_task = enqueue_task_dl,
+ .dequeue_task = dequeue_task_dl,
+ .yield_task = yield_task_dl,
+
+ .check_preempt_curr = check_preempt_curr_dl,
+
+ .pick_next_task = pick_next_task_dl,
+ .put_prev_task = put_prev_task_dl,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_dl,
+ .set_cpus_allowed = set_cpus_allowed_dl,
+ .rq_online = rq_online_dl,
+ .rq_offline = rq_offline_dl,
+ .post_schedule = post_schedule_dl,
+ .task_woken = task_woken_dl,
+#endif
+
+ .set_curr_task = set_curr_task_dl,
+ .task_tick = task_tick_dl,
+ .task_fork = task_fork_dl,
+ .task_dead = task_dead_dl,
+
+ .prio_changed = prio_changed_dl,
+ .switched_from = switched_from_dl,
+ .switched_to = switched_to_dl,
+};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004..627b3c34b82 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+#include <linux/mempolicy.h>
#include "sched.h"
@@ -61,14 +62,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
@@ -85,6 +92,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.decay_count);
+#endif
#undef PN
#undef P
}
@@ -98,15 +111,7 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
- return group_path;
+ return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
}
#endif
@@ -119,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, " ");
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
- p->comm, p->pid,
+ p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -132,6 +137,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ SEQ_printf(m, " %d", task_node(p));
+#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif
@@ -154,7 +162,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
- if (!p->on_rq || task_cpu(p) != rq_cpu)
+ if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -202,20 +210,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
- SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
- SPLIT_NS(cfs_rq->load_avg));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
- SPLIT_NS(cfs_rq->load_period));
- SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
- cfs_rq->load_contribution);
- SEQ_printf(m, " .%-30s: %d\n", "load_tg",
- atomic_read(&cfs_rq->tg->load_weight));
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic_long_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+#endif
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
+ cfs_rq->tg->cfs_bandwidth.timer_active);
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
@@ -253,15 +275,21 @@ static void print_cpu(struct seq_file *m, int cpu)
{
unsigned int freq = cpu_khz ? : 1;
- SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
#else
- SEQ_printf(m, "\ncpu#%d\n", cpu);
+ SEQ_printf(m, "cpu#%d\n", cpu);
#endif
-#define P(x) \
- SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define P(x) \
+do { \
+ if (sizeof(rq->x) == 4) \
+ SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ else \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
+} while (0)
+
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
@@ -272,7 +300,7 @@ static void print_cpu(struct seq_file *m, int cpu)
P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
- P(curr->pid);
+ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
P(cpu_load[0]);
P(cpu_load[1]);
@@ -288,11 +316,11 @@ static void print_cpu(struct seq_file *m, int cpu)
P(yld_count);
- P(sched_switch);
P(sched_count);
P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
+ P64(max_idle_balance_cost);
#endif
P(ttwu_count);
@@ -309,6 +337,7 @@ static void print_cpu(struct seq_file *m, int cpu)
print_rq(m, rq, cpu);
rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
@@ -317,11 +346,10 @@ static const char *sched_tunable_scaling_names[] = {
"linear"
};
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
{
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
- int cpu;
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
@@ -329,7 +357,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
cpu_clk = local_clock();
local_irq_restore(flags);
- SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -343,7 +371,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
PN(cpu_clk);
P(jiffies);
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- P(sched_clock_stable);
+ P(sched_clock_stable());
#endif
#undef PN
#undef P
@@ -363,33 +391,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P
- SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
sysctl_sched_tunable_scaling,
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
- for_each_online_cpu(cpu)
- print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
- SEQ_printf(m, "\n");
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
return 0;
}
void sysrq_sched_debug_show(void)
{
- sched_debug_show(NULL, NULL);
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
}
static int sched_debug_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_debug_show, NULL);
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
}
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = sched_debug_release,
};
static int __init init_sched_debug_procfs(void)
@@ -404,22 +500,73 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
+#define __P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+
+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ struct mempolicy *pol;
+ int node, i;
+
+ if (p->mm)
+ P(mm->numa_scan_seq);
+
+ task_lock(p);
+ pol = p->mempolicy;
+ if (pol && !(pol->flags & MPOL_F_MORON))
+ pol = NULL;
+ mpol_get(pol);
+ task_unlock(p);
+
+ SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
+
+ for_each_online_node(node) {
+ for (i = 0; i < 2; i++) {
+ unsigned long nr_faults = -1;
+ int cpu_current, home_node;
+
+ if (p->numa_faults_memory)
+ nr_faults = p->numa_faults_memory[2*node + i];
+
+ cpu_current = !i ? (task_node(p) == node) :
+ (pol && node_isset(node, pol->v.nodes));
+
+ home_node = (p->numa_preferred_nid == node);
+
+ SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
+ i, node, cpu_current, home_node, nr_faults);
+ }
+ }
+
+ mpol_put(pol);
+#endif
+}
+
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
SEQ_printf(m,
- "---------------------------------------------------------\n");
+ "---------------------------------------------------------"
+ "----------\n");
#define __P(F) \
- SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
- SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
- SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
- SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
PN(se.exec_start);
PN(se.vruntime);
@@ -461,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
- do_div(avg_atom, nr_switches);
+ avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;
@@ -478,12 +625,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
}
#endif
__P(nr_switches);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+#ifdef CONFIG_SMP
+ P(se.avg.runnable_avg_sum);
+ P(se.avg.runnable_avg_period);
+ P(se.avg.load_avg_contrib);
+ P(se.avg.decay_count);
+#endif
P(policy);
P(prio);
#undef PN
@@ -497,9 +650,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
+
+ sched_show_numa(p, m);
}
void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669..fea7d3335e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -110,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
@@ -157,59 +178,61 @@ void sched_init_granularity(void)
update_sysctl();
}
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
+#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
/*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
- u64 tmp;
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
- /*
- * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- * 2^SCHED_LOAD_RESOLUTION.
- */
- if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- tmp = (u64)delta_exec * scale_load_down(weight);
- else
- tmp = (u64)delta_exec;
+ __update_inv_weight(lw);
- if (!lw->inv_weight) {
- unsigned long w = scale_load_down(lw->weight);
-
- if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- lw->inv_weight = 1;
- else if (unlikely(!w))
- lw->inv_weight = WMULT_CONST;
- else
- lw->inv_weight = WMULT_CONST / w;
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
}
- /*
- * Check whether we'd overflow the 64-bit multiplication:
- */
- if (unlikely(tmp > WMULT_CONST))
- tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- WMULT_SHIFT/2);
- else
- tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
- return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+
+ return mul_u64_u32_shr(delta_exec, fact, shift);
}
@@ -259,6 +282,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update);
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -278,6 +304,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
+ /* We should have no load, but we need to update last_decay. */
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -294,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;
- return 0;
+ return NULL;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -332,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/
/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
@@ -398,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
@@ -416,20 +427,20 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec);
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
+ s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
- min_vruntime = vruntime;
+ max_vruntime = vruntime;
- return min_vruntime;
+ return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -465,6 +476,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
vruntime = min_vruntime(vruntime, se->vruntime);
}
+ /* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -585,11 +597,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
/*
* delta /= w
*/
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
@@ -597,7 +608,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
/*
* The idea is to set a period in which each task runs once.
*
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
+ * When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
@@ -638,13 +649,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&lw, se->load.weight);
load = &lw;
}
- slice = calc_delta_mine(slice, se->load.weight, load);
+ slice = __calc_delta(slice, se->load.weight, load);
}
return slice;
}
/*
- * We calculate the vruntime slice of a to be inserted task
+ * We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
@@ -653,55 +664,55 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
-/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
- unsigned long delta_exec)
-{
- unsigned long delta_exec_weighted;
-
- schedstat_set(curr->statistics.exec_max,
- max((u64)delta_exec, curr->statistics.exec_max));
+#ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+static inline void __update_task_entity_contrib(struct sched_entity *se);
- curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
+/* Give new task start runnable values to heavy its load in infant time */
+void init_task_runnable_average(struct task_struct *p)
+{
+ u32 slice;
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
+ p->se.avg.decay_count = 0;
+ slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
+ p->se.avg.runnable_avg_sum = slice;
+ p->se.avg.runnable_avg_period = slice;
+ __update_task_entity_contrib(&p->se);
+}
+#else
+void init_task_runnable_average(struct task_struct *p)
+{
}
+#endif
+/*
+ * Update the current task's runtime statistics.
+ */
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock_task;
- unsigned long delta_exec;
+ u64 now = rq_clock_task(rq_of(cfs_rq));
+ u64 delta_exec;
if (unlikely(!curr))
return;
- /*
- * Get the amount of time the current task was running
- * since the last time we changed load (this cannot
- * overflow on 32 bits):
- */
- delta_exec = (unsigned long)(now - curr->exec_start);
- if (!delta_exec)
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
return;
- __update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
+ schedstat_set(curr->statistics.exec_max,
+ max(delta_exec, curr->statistics.exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
@@ -716,7 +727,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
/*
@@ -736,14 +747,14 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
@@ -769,119 +780,1292 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock_task;
+ se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
* Scheduling class queueing methods:
*/
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
{
- cfs_rq->task_weight += weight;
+ unsigned long rss = 0;
+ unsigned long nr_scan_pages;
+
+ /*
+ * Calculations based on RSS as non-present and empty pages are skipped
+ * by the PTE scanner and NUMA hinting faults should be trapped based
+ * on resident pages
+ */
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ rss = get_mm_rss(p->mm);
+ if (!rss)
+ rss = nr_scan_pages;
+
+ rss = round_up(rss, nr_scan_pages);
+ return rss / nr_scan_pages;
}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan, floor;
+ unsigned int windows = 1;
+
+ if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ floor = 1000 / windows;
+
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+ return max_t(unsigned int, floor, scan);
}
-#endif
-static void
-account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static unsigned int task_scan_max(struct task_struct *p)
{
- update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- add_cfs_task_weight(cfs_rq, se->load.weight);
- list_add(&se->group_node, &cfs_rq->tasks);
+ unsigned int smin = task_scan_min(p);
+ unsigned int smax;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+ return max(smin, smax);
+}
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ struct list_head task_list;
+
+ struct rcu_head rcu;
+ nodemask_t active_nodes;
+ unsigned long total_faults;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+ return p->numa_group ? p->numa_group->gid : 0;
+}
+
+static inline int task_faults_idx(int nid, int priv)
+{
+ return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults_memory)
+ return 0;
+
+ return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+ p->numa_faults_memory[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_group)
+ return 0;
+
+ return p->numa_group->faults[task_faults_idx(nid, 0)] +
+ p->numa_group->faults[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(nid, 0)] +
+ group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+ unsigned long total_faults;
+
+ if (!p->numa_faults_memory)
+ return 0;
+
+ total_faults = p->total_numa_faults;
+
+ if (!total_faults)
+ return 0;
+
+ return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+ if (!p->numa_group || !p->numa_group->total_faults)
+ return 0;
+
+ return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = p->numa_group;
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and getting the
+ * same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely we
+ * act on an unlikely task<->page relation.
+ */
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Do not migrate if the destination is not a node that
+ * is actively used by this numa group.
+ */
+ if (!node_isset(dst_nid, ng->active_nodes))
+ return false;
+
+ /*
+ * Source is a node that is not actively used by this
+ * numa group, while the destination is. Migrate.
+ */
+ if (!node_isset(src_nid, ng->active_nodes))
+ return true;
+
+ /*
+ * Both source and destination are nodes in active
+ * use by this numa group. Maximize memory bandwidth
+ * by migrating from more heavily used groups, to less
+ * heavily used ones, spreading the load around.
+ * Use a 1/4 hysteresis to avoid spurious page movement.
+ */
+ return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long capacity_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+ unsigned long nr_running;
+ unsigned long load;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long compute_capacity;
+
+ /* Approximate capacity in terms of runnable tasks on a node */
+ unsigned long task_capacity;
+ int has_free_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int cpu, cpus = 0;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(cpu);
+ ns->compute_capacity += capacity_of(cpu);
+
+ cpus++;
}
- cfs_rq->nr_running++;
+
+ /*
+ * If we raced with hotplug and there are no CPUs left in our mask
+ * the @ns structure is NULL'ed and task_numa_compare() will
+ * not find this node attractive.
+ *
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
+ */
+ if (!cpus)
+ return;
+
+ ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
+ ns->task_capacity =
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
-static void
-account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+
+ struct numa_stats src_stats, dst_stats;
+
+ int imbalance_pct;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
{
- update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- add_cfs_task_weight(cfs_rq, -se->load.weight);
- list_del_init(&se->group_node);
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
+
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+ long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long imb, old_imb;
+
+ /* We care about the slope of the imbalance, not the direction. */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ /* Is the difference below the threshold? */
+ imb = dst_load * 100 - src_load * env->imbalance_pct;
+ if (imb <= 0)
+ return false;
+
+ /*
+ * The imbalance is above the allowed threshold.
+ * Compare it with the old imbalance.
+ */
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
+
+ old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ struct task_struct *cur;
+ long orig_src_load, src_load;
+ long orig_dst_load, dst_load;
+ long load;
+ long imp = (groupimp > 0) ? groupimp : taskimp;
+
+ rcu_read_lock();
+ cur = ACCESS_ONCE(dst_rq->curr);
+ if (cur->pid == 0) /* idle */
+ cur = NULL;
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more rmeote accesses that would be expected to
+ * be incurred if the tasks were swapped.
+ */
+ if (cur) {
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ goto unlock;
+
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur->numa_group)
+ imp -= imp/16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by
+ * itself (not part of a group), use the task weight
+ * instead.
+ */
+ if (env->p->numa_group)
+ imp = groupimp;
+ else
+ imp = taskimp;
+
+ if (cur->numa_group)
+ imp += group_weight(cur, env->src_nid) -
+ group_weight(cur, env->dst_nid);
+ else
+ imp += task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ }
}
- cfs_rq->nr_running--;
+
+ if (imp < env->best_imp)
+ goto unlock;
+
+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.has_free_capacity &&
+ !env->dst_stats.has_free_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+balance:
+ orig_dst_load = env->dst_stats.load;
+ orig_src_load = env->src_stats.load;
+
+ /* XXX missing capacity terms */
+ load = task_h_load(env->p);
+ dst_load = orig_dst_load + load;
+ src_load = orig_src_load - load;
+
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ if (load_too_imbalanced(orig_src_load, orig_dst_load,
+ src_load, dst_load, env))
+ goto unlock;
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
+ };
+ struct sched_domain *sd;
+ unsigned long taskweight, groupweight;
+ int nid, ret;
+ long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- counter the numa conditions we're trying
+ * to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
+ /*
+ * Cpusets can break the scheduler domain tree into smaller
+ * balance domains, some of which do not cross NUMA boundaries.
+ * Tasks that are "trapped" in such domains cannot be migrated
+ * elsewhere, so there is no point in (re)trying.
+ */
+ if (unlikely(!sd)) {
+ p->numa_preferred_nid = task_node(p);
+ return -EINVAL;
+ }
+
+ taskweight = task_weight(p, env.src_nid);
+ groupweight = group_weight(p, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ env.dst_nid = p->numa_preferred_nid;
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* If the preferred nid has free capacity, try to use it. */
+ if (env.dst_stats.has_free_capacity)
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /* No space available on the preferred nid. Look elsewhere. */
+ if (env.best_cpu == -1) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ /* Only consider nodes where both task and groups benefit */
+ taskimp = task_weight(p, nid) - taskweight;
+ groupimp = group_weight(p, nid) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
+
+ /*
+ * Reset the scan period if the task is being rescheduled on an
+ * alternative node to recheck if the tasks is now properly placed.
+ */
+ p->numa_scan_period = task_scan_min(p);
+
+ if (env.best_task == NULL) {
+ ret = migrate_task_to(p, env.best_cpu);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ unsigned long interval = HZ;
+
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ return;
+
+ /* Periodically retry migrating the task to the preferred node */
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
+
+ /* Success if task is already running on preferred CPU */
+ if (task_node(p) == p->numa_preferred_nid)
+ return;
+
+ /* Otherwise, try migrate to a CPU on the preferred node */
+ task_numa_migrate(p);
+}
+
+/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (!node_isset(nid, numa_group->active_nodes)) {
+ if (faults > max_faults * 6 / 16)
+ node_set(nid, numa_group->active_nodes);
+ } else if (faults < max_faults * 3 / 16)
+ node_clear(nid, numa_group->active_nodes);
+ }
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Scan slower
+ */
+ if (local + shared == 0) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
+
+ return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+ /*
+ * Scale scan rate increases based on sharing. There is an
+ * inverse relationship between the degree of sharing and
+ * the adjustment made to the scanning period. Broadly
+ * speaking the intent is that there is little point
+ * scanning faster if shared accesses dominate as it may
+ * simply bounce migrations uselessly
+ */
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+ } else {
+ delta = p->se.avg.runnable_avg_sum;
+ *period = p->se.avg.runnable_avg_period;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq, nid, max_nid = -1, max_group_nid = -1;
+ unsigned long max_faults = 0, max_group_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
+ spinlock_t *group_lock = NULL;
+
+ seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+
+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
+ /* If the task is part of a group prevent parallel updates to group stats */
+ if (p->numa_group) {
+ group_lock = &p->numa_group->lock;
+ spin_lock_irq(group_lock);
+ }
+
+ /* Find the node with the highest number of faults */
+ for_each_online_node(nid) {
+ unsigned long faults = 0, group_faults = 0;
+ int priv, i;
+
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;
+
+ i = task_faults_idx(nid, priv);
+
+ /* Decay existing window, copy faults since last scan */
+ diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+ fault_types[priv] += p->numa_faults_buffer_memory[i];
+ p->numa_faults_buffer_memory[i] = 0;
+
+ /*
+ * Normalize the faults_from, so all tasks in a group
+ * count according to CPU use, instead of by the raw
+ * number of faults. Tasks with little runtime have
+ * little over-all impact on throughput, and thus their
+ * faults are less important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+ p->numa_faults_buffer_cpu[i] = 0;
+
+ p->numa_faults_memory[i] += diff;
+ p->numa_faults_cpu[i] += f_diff;
+ faults += p->numa_faults_memory[i];
+ p->total_numa_faults += diff;
+ if (p->numa_group) {
+ /* safe because we can only change our own group */
+ p->numa_group->faults[i] += diff;
+ p->numa_group->faults_cpu[i] += f_diff;
+ p->numa_group->total_faults += diff;
+ group_faults += p->numa_group->faults[i];
+ }
+ }
+
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+
+ if (group_faults > max_group_faults) {
+ max_group_faults = group_faults;
+ max_group_nid = nid;
+ }
+ }
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+
+ if (p->numa_group) {
+ update_numa_active_node_mask(p->numa_group);
+ /*
+ * If the preferred task and group nids are different,
+ * iterate over the nodes again to find the best place.
+ */
+ if (max_nid != max_group_nid) {
+ unsigned long weight, max_weight = 0;
+
+ for_each_online_node(nid) {
+ weight = task_weight(p, nid) + group_weight(p, nid);
+ if (weight > max_weight) {
+ max_weight = weight;
+ max_nid = nid;
+ }
+ }
+ }
+
+ spin_unlock_irq(group_lock);
+ }
+
+ /* Preferred node as the node with the most faults */
+ if (max_faults && max_nid != p->numa_preferred_nid) {
+ /* Update the preferred nid and migrate task if possible */
+ sched_setnuma(p, max_nid);
+ numa_migrate_preferred(p);
+ }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+ return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+ if (atomic_dec_and_test(&grp->refcount))
+ kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
+{
+ struct numa_group *grp, *my_grp;
+ struct task_struct *tsk;
+ bool join = false;
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+ if (unlikely(!p->numa_group)) {
+ unsigned int size = sizeof(struct numa_group) +
+ 4*nr_node_ids*sizeof(unsigned long);
+
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!grp)
+ return;
+
+ atomic_set(&grp->refcount, 1);
+ spin_lock_init(&grp->lock);
+ INIT_LIST_HEAD(&grp->task_list);
+ grp->gid = p->pid;
+ /* Second half of the array tracks nids where faults happen */
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+ nr_node_ids;
+
+ node_set(task_node(current), grp->active_nodes);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults_memory[i];
+
+ grp->total_faults = p->total_numa_faults;
+
+ list_add(&p->numa_entry, &grp->task_list);
+ grp->nr_tasks++;
+ rcu_assign_pointer(p->numa_group, grp);
+ }
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
+ rcu_read_lock();
+ tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+ if (!cpupid_match_pid(tsk, cpupid))
+ goto no_join;
+
+ grp = rcu_dereference(tsk->numa_group);
+ if (!grp)
+ goto no_join;
+
+ my_grp = p->numa_group;
+ if (grp == my_grp)
+ goto no_join;
+
+ /*
+ * Only join the other group if its bigger; if we're the bigger group,
+ * the other task will join us.
+ */
+ if (my_grp->nr_tasks > grp->nr_tasks)
+ goto no_join;
+
+ /*
+ * Tie-break on the grp address.
+ */
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+ goto no_join;
+
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
+
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ goto no_join;
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
+ rcu_read_unlock();
+
+ if (!join)
+ return;
+
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] += p->numa_faults_memory[i];
}
+ my_grp->total_faults -= p->total_numa_faults;
+ grp->total_faults += p->total_numa_faults;
+
+ list_move(&p->numa_entry, &grp->task_list);
+ my_grp->nr_tasks--;
+ grp->nr_tasks++;
+
+ spin_unlock(&my_grp->lock);
+ spin_unlock_irq(&grp->lock);
+
+ rcu_assign_pointer(p->numa_group, grp);
+
+ put_numa_group(my_grp);
+ return;
+
+no_join:
+ rcu_read_unlock();
+ return;
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+void task_numa_free(struct task_struct *p)
{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
+ struct numa_group *grp = p->numa_group;
+ void *numa_faults = p->numa_faults_memory;
+ unsigned long flags;
+ int i;
+
+ if (grp) {
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults_memory[i];
+ grp->total_faults -= p->total_numa_faults;
+
+ list_del(&p->numa_entry);
+ grp->nr_tasks--;
+ spin_unlock_irqrestore(&grp->lock, flags);
+ rcu_assign_pointer(p->numa_group, NULL);
+ put_numa_group(grp);
+ }
+
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->numa_faults_cpu= NULL;
+ p->numa_faults_buffer_cpu = NULL;
+ kfree(numa_faults);
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
+{
+ struct task_struct *p = current;
+ bool migrated = flags & TNF_MIGRATED;
+ int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
+ int priv;
+
+ if (!numabalancing_enabled)
+ return;
+
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
+ /* Do not worry about placement if exiting */
+ if (p->state == TASK_DEAD)
return;
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
+ /* Allocate buffer to track faults on a per-node basis */
+ if (unlikely(!p->numa_faults_memory)) {
+ int size = sizeof(*p->numa_faults_memory) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
+
+ p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults_memory)
+ return;
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
+ BUG_ON(p->numa_faults_buffer_memory);
+ /*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+ p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+ p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
+ p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
+ /*
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
+ */
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+ priv = 1;
+ } else {
+ priv = cpupid_match_pid(p, last_cpupid);
+ if (!priv && !(flags & TNF_NO_GROUP))
+ task_numa_group(p, last_cpupid, flags, &priv);
}
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ if (!priv && !local && p->numa_group &&
+ node_isset(cpu_node, p->numa_group->active_nodes) &&
+ node_isset(mem_node, p->numa_group->active_nodes))
+ local = 1;
+
+ task_numa_placement(p);
+
+ /*
+ * Retry task to preferred node migration periodically, in case it
+ * case it previously failed, or the scheduler moved us.
+ */
+ if (time_after(jiffies, p->numa_migrate_retry))
+ numa_migrate_preferred(p);
+
+ if (migrated)
+ p->numa_pages_migrated += pages;
+
+ p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+ p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults_locality[local] += pages;
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
+ long pages;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!mm->numa_next_scan) {
+ mm->numa_next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_min(p);
+ }
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Delay this task enough that another task of this mm will likely win
+ * the next time around.
+ */
+ p->node_stamp += 2 * TICK_NSEC;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ if (!pages)
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ continue;
+
+ /*
+ * Shared library pages mapped by multiple processes are not
+ * migrated as it is expected they are cache replicated. Avoid
+ * hinting faults in read-only file-backed mappings or the vdso
+ * as migrating the pages will be of marginal benefit.
+ */
+ if (!vma->vm_mm ||
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ continue;
- while (cfs_rq->load_period > period) {
/*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
*/
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ nr_pte_updates += change_prot_numa(vma, start, end);
+
+ /*
+ * Scan sysctl_numa_balancing_scan_size but ensure that
+ * at least one PTE is updated so that unused virtual
+ * address space is quickly skipped.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
+
+ start = end;
+ if (pages <= 0)
+ goto out;
+
+ cond_resched();
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few
+ * VMAs are not guaranteed to the vma_migratable. If they are not, we
+ * would find the !migratable VMA on the next scan but not reset the
+ * scanner to the start so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = task_scan_min(curr);
+ curr->node_stamp += period;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
}
+#endif /* CONFIG_NUMA_BALANCING */
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
+ cfs_rq->nr_running++;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ list_del_init(&se->group_node);
+ }
+ cfs_rq->nr_running--;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -891,8 +2075,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic_long_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -916,27 +2100,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -954,6 +2122,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -973,19 +2143,513 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+#ifdef CONFIG_SMP
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
+{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable)
{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
}
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count = 0;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic_long_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib;
+
+ /* The fraction of a cpu used by this cfs_rq */
+ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+ }
}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div_u64(contrib,
+ atomic_long_read(&tg->load_avg) + 1);
+
+ /*
+ * For group entities we need to compute a correction term in the case
+ * that they are consuming <1 cpu so that we would contribute the same
+ * load as a task of equal weight.
+ *
+ * Explicitly co-ordinating this measurement would be expensive, but
+ * fortunately the sum of each cpus contribution forms a usable
+ * lower-bound on the true value.
+ *
+ * Consider the aggregate of 2 contributions. Either they are disjoint
+ * (and the sum represents true value) or they are disjoint and we are
+ * understating by the aggregate of their overlap.
+ *
+ * Extending this to N cpus, for a given overlap, the maximum amount we
+ * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
+ * cpus that overlap for this interval and w_i is the interval width.
+ *
+ * On a small machine; the first term is well-bounded which bounds the
+ * total error since w_i is a subset of the period. Whereas on a
+ * larger machine, while this first term can be larger, if w_i is the
+ * of consequential size guaranteed to see n_i*w_i quickly converge to
+ * our upper bound of 1-cpu.
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < NICE_0_LOAD) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ }
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic_long_read(&cfs_rq->removed_load)) {
+ unsigned long removed_load;
+ removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ *
+ * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
+ * are seen by enqueue_entity_load_avg() as a migration with an already
+ * constructed load_avg_contrib.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ /* Indicate that we're now synchronized and on-rq */
+ se->avg.decay_count = 0;
+ }
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
+static int idle_balance(struct rq *this_rq);
+
+#else /* CONFIG_SMP */
+
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+
+static inline int idle_balance(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -995,7 +2659,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
tsk = task_of(se);
if (se->statistics.sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
@@ -1003,6 +2667,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (unlikely(delta > se->statistics.sleep_max))
se->statistics.sleep_max = delta;
+ se->statistics.sleep_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
@@ -1011,7 +2676,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
}
if (se->statistics.block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
@@ -1019,6 +2684,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (unlikely(delta > se->statistics.block_max))
se->statistics.block_max = delta;
+ se->statistics.block_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
@@ -1088,9 +2754,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}
/* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1100,7 +2764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
- * through callig update_curr().
+ * through calling update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
@@ -1109,7 +2773,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -1134,10 +2798,10 @@ static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}
@@ -1145,10 +2809,10 @@ static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}
@@ -1156,10 +2820,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}
@@ -1175,7 +2839,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
__clear_buddies_skip(se);
}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1184,6 +2848,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1192,9 +2857,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
@@ -1204,7 +2869,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
/*
@@ -1302,17 +2966,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -1334,7 +3017,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -1353,6 +3036,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1366,9 +3051,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1399,20 +3086,21 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef HAVE_JUMP_LABEL
-static struct jump_label_key __cfs_bandwidth_used;
+static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
{
- return static_branch(&__cfs_bandwidth_used);
+ return static_key_false(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_inc(void)
+{
+ static_key_slow_inc(&__cfs_bandwidth_used);
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_dec(void)
{
- /* only need to count groups transitioning between enabled/!enabled */
- if (enabled && !was_enabled)
- jump_label_inc(&__cfs_bandwidth_used);
- else if (!enabled && was_enabled)
- jump_label_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -1420,7 +3108,8 @@ static bool cfs_bandwidth_used(void)
return true;
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
#endif /* HAVE_JUMP_LABEL */
/*
@@ -1461,6 +3150,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1483,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
*/
if (!cfs_b->timer_active) {
__refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, false);
}
if (cfs_b->runtime > 0) {
@@ -1514,10 +3212,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct rq *rq = rq_of(cfs_rq);
/* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
return;
if (cfs_rq->runtime_remaining < 0)
@@ -1529,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced.
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
*/
- if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -1541,8 +3240,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
}
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
@@ -1559,8 +3257,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
resched_task(rq_of(cfs_rq)->curr);
}
-static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
@@ -1605,14 +3303,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1624,9 +3317,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq_clock_task(rq);
cfs_rq->throttle_count++;
return 0;
@@ -1641,7 +3334,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1662,12 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running -= task_delta;
+ sub_nr_running(rq, task_delta);
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
}
@@ -1679,16 +3374,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
int enqueue = 1;
long task_delta;
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ se = cfs_rq->tg->se[cpu_of(rq)];
cfs_rq->throttled = 0;
+
+ update_rq_clock(rq);
+
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
- update_rq_clock(rq);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
@@ -1710,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running += task_delta;
+ add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -1764,28 +3460,35 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int idle = 1, throttled;
+ int throttled;
- raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
- goto out_unlock;
+ goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- /* idle depends on !throttled (for the case of a large deficit) */
- idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
- /* if we're going inactive then everything else can be deferred */
- if (idle)
- goto out_unlock;
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
+
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- goto out_unlock;
+ return 0;
}
/* account preceding periods in which throttling occurred */
@@ -1825,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
-out_unlock:
- if (idle)
- cfs_b->timer_active = 0;
- raw_spin_unlock(&cfs_b->lock);
- return idle;
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -1840,7 +3543,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -1916,10 +3625,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
u64 expires;
/* confirm we're still not at a refresh boundary */
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
return;
+ }
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
@@ -1963,28 +3674,25 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;
throttle_cfs_rq(cfs_rq);
+ return true;
}
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -2002,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
+ raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -2011,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -2036,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
{
/*
* The timer may be active because we're trying to set a new bandwidth
@@ -2044,14 +3754,14 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
+ cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
- if (cfs_b->timer_active)
+ if (!force && cfs_b->timer_active)
return;
}
@@ -2065,13 +3775,11 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
if (!cfs_rq->runtime_enabled)
continue;
@@ -2079,18 +3787,22 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
- cfs_rq->runtime_remaining = cfs_b->quota;
+ cfs_rq->runtime_remaining = 1;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
}
#else /* CONFIG_CFS_BANDWIDTH */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_task(rq_of(cfs_rq));
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -2119,7 +3831,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -2220,12 +3932,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
}
- if (!se)
- inc_nr_running(rq);
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
+ add_nr_running(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2279,12 +3993,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
}
- if (!se)
- dec_nr_running(rq);
+ if (!se) {
+ sub_nr_running(rq, 1);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2292,7 +4008,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->load.weight;
+ return cpu_rq(cpu)->cfs.runnable_load_avg;
}
/*
@@ -2328,22 +4044,40 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long power_of(int cpu)
+static unsigned long capacity_of(int cpu)
{
- return cpu_rq(cpu)->cpu_power;
+ return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+ unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
- return rq->load.weight / nr_running;
+ return load_avg / nr_running;
return 0;
}
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Rough decay (wiping) for cost saving, don't worry
+ * about the boundary, really active task won't care
+ * about the loss.
+ */
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
static void task_waking_fair(struct task_struct *p)
{
@@ -2364,6 +4098,7 @@ static void task_waking_fair(struct task_struct *p)
#endif
se->vruntime -= min_vruntime;
+ record_wakee(p);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2474,14 +4209,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
}
#else
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
- unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
return wl;
}
#endif
+static int wake_wide(struct task_struct *p)
+{
+ int factor = this_cpu_read(sd_llc_size);
+
+ /*
+ * Yeah, it's the switching-frequency, could means many wakee or
+ * rapidly switch, use factor here will just help to automatically
+ * adjust the loose-degree, so bigger node will lead to more pull.
+ */
+ if (p->wakee_flips > factor) {
+ /*
+ * wakee is somewhat hot, it needs certain amount of cpu
+ * resource, so if waker is far more hot, prefer to leave
+ * it alone.
+ */
+ if (current->wakee_flips > (factor * p->wakee_flips))
+ return 1;
+ }
+
+ return 0;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
@@ -2491,6 +4247,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
+ /*
+ * If we wake multiple tasks be careful to not bounce
+ * ourselves around too much.
+ */
+ if (wake_wide(p))
+ return 0;
+
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -2526,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
- this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= capacity_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
@@ -2571,12 +4334,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int load_idx)
+ int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ if (sd_flag & SD_BALANCE_WAKE)
+ load_idx = sd->wake_idx;
+
do {
unsigned long load, avg_load;
int local_group;
@@ -2603,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
}
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
if (local_group) {
this_load = avg_load;
@@ -2647,31 +4414,22 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
- int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
struct sched_domain *sd;
struct sched_group *sg;
- int i;
+ int i = task_cpu(p);
- /*
- * If the task is going to be woken-up on this cpu and if it is
- * already idle, then it is the right target.
- */
- if (target == cpu && idle_cpu(cpu))
- return cpu;
+ if (idle_cpu(target))
+ return target;
/*
- * If the task is going to be woken-up on the cpu where it previously
- * ran and if it is currently idle, then it the right target.
+ * If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (target == prev_cpu && idle_cpu(prev_cpu))
- return prev_cpu;
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
- rcu_read_lock();
-
sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) {
sg = sd->groups;
@@ -2681,7 +4439,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;
for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
+ if (i == target || !idle_cpu(i))
goto next;
}
@@ -2693,34 +4451,31 @@ next:
} while (sg != sd->groups);
}
done:
- rcu_read_unlock();
-
return target;
}
/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
- * Balance, ie. select the least loaded group.
+ * Balances load by selecting the idlest cpu in the idlest group, or under
+ * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
*
- * Returns the target CPU number, or the same CPU if no balancing is needed.
+ * Returns the target cpu number.
*
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
- int want_sd = 1;
int sync = wake_flags & WF_SYNC;
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE) {
@@ -2735,59 +4490,28 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
continue;
/*
- * If power savings logic is enabled for a domain, see if we
- * are not overloaded, if so, don't balance wider.
- */
- if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
- unsigned long power = 0;
- unsigned long nr_running = 0;
- unsigned long capacity;
- int i;
-
- for_each_cpu(i, sched_domain_span(tmp)) {
- power += power_of(i);
- nr_running += cpu_rq(i)->cfs.nr_running;
- }
-
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
-
- if (tmp->flags & SD_POWERSAVINGS_BALANCE)
- nr_running /= 2;
-
- if (nr_running < capacity)
- want_sd = 0;
- }
-
- /*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
- want_affine = 0;
- }
-
- if (!want_sd && !want_affine)
break;
+ }
- if (!(tmp->flags & sd_flag))
- continue;
-
- if (want_sd)
+ if (tmp->flags & sd_flag)
sd = tmp;
}
- if (affine_sd) {
- if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+ if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
while (sd) {
- int load_idx = sd->forkexec_idx;
struct sched_group *group;
int weight;
@@ -2796,10 +4520,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
continue;
}
- if (sd_flag & SD_BALANCE_WAKE)
- load_idx = sd->wake_idx;
-
- group = find_idlest_group(sd, p, cpu, load_idx);
+ group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
@@ -2829,6 +4550,34 @@ unlock:
return new_cpu;
}
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a positive decay-count. It can never
+ * be negative here since on-rq tasks have decay-count == 0.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic_long_add(se->avg.load_avg_contrib,
+ &cfs_rq->removed_load);
+ }
+
+ /* We have migrated, no longer consider this task hot */
+ se->exec_start = 0;
+}
#endif /* CONFIG_SMP */
static unsigned long
@@ -2920,7 +4669,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/*
- * This is possible from callers such as pull_task(), in which we
+ * This is possible from callers such as move_task(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -2955,7 +4704,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
- if (unlikely(p->policy != SCHED_NORMAL))
+ if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
@@ -2991,26 +4740,124 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+again:
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
- return NULL;
+ goto idle;
+
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr && curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the throttle and
+ * dequeue its entity in the parent(s). Therefore the 'simple'
+ * nr_running test will indeed be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+simple:
+ cfs_rq = &rq->cfs;
+#endif
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ put_prev_task(rq, prev);
do {
- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
+
+idle:
+ new_tasks = idle_balance(rq);
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
+
+ if (new_tasks > 0)
+ goto again;
+
+ return NULL;
}
/*
@@ -3081,27 +4928,174 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
#ifdef CONFIG_SMP
/**************************************************
- * Fair scheduling class load-balancing methods:
- */
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * C_i is the compute capacity of cpu i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of cpus in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of cpus doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
+
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+enum fbq_type { regular, remote, all };
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02
+#define LBF_DST_PINNED 0x04
+#define LBF_SOME_PINNED 0x08
+
+struct lb_env {
+ struct sched_domain *sd;
+
+ struct rq *src_rq;
+ int src_cpu;
+
+ int dst_cpu;
+ struct rq *dst_rq;
+
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
+ enum cpu_idle_type idle;
+ long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
+ unsigned int flags;
+
+ unsigned int loop;
+ unsigned int loop_break;
+ unsigned int loop_max;
+
+ enum fbq_type fbq_type;
+};
/*
- * pull_task - move a task from a remote runqueue to the local runqueue.
+ * move_task - move a task from one runqueue to another runqueue.
* Both runqueues must be locked.
*/
-static void pull_task(struct rq *src_rq, struct task_struct *p,
- struct rq *this_rq, int this_cpu)
+static void move_task(struct task_struct *p, struct lb_env *env)
{
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- check_preempt_curr(this_rq, p, 0);
+ deactivate_task(env->src_rq, p, 0);
+ set_task_cpu(p, env->dst_cpu);
+ activate_task(env->dst_rq, p, 0);
+ check_preempt_curr(env->dst_rq, p, 0);
}
/*
* Is this task likely cache-hot:
*/
static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+task_hot(struct task_struct *p, u64 now)
{
s64 delta;
@@ -3129,61 +5123,182 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}
-#define LBF_ALL_PINNED 0x01
-#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
-#define LBF_HAD_BREAK 0x04
-#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT 0x10
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ !(env->sd->flags & SD_NUMA)) {
+ return false;
+ }
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ if (numa_group) {
+ /* Task is already in the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return true;
+
+ return task_faults(p, dst_nid) > task_faults(p, src_nid);
+}
+
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+ return false;
+
+ if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ return false;
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ if (numa_group) {
+ /* Task is moving within/into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving out of the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ }
+
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
+
+ return task_faults(p, dst_nid) < task_faults(p, src_nid);
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+#endif
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) running (obviously), or
+ * 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
*/
- if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ int cpu;
+
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+ env->flags |= LBF_SOME_PINNED;
+
+ /*
+ * Remember if this task can be migrated to any other cpu in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Also avoid computing new_dst_cpu if we have already computed
+ * one in current iteration.
+ */
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ return 0;
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
+ }
+
return 0;
}
- *lb_flags &= ~LBF_ALL_PINNED;
- if (task_running(rq, p)) {
+ /* Record that we found atleast one task that could run on dst_cpu */
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
- * 1) task is cache cold, or
- * 2) too many balance attempts have failed.
+ * 1) destination numa is preferred
+ * 2) task is cache cold, or
+ * 3) too many balance attempts have failed.
*/
+ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+ if (!tsk_cache_hot)
+ tsk_cache_hot = migrate_degrades_locality(p, env);
- tsk_cache_hot = task_hot(p, rq->clock_task, sd);
- if (!tsk_cache_hot ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ if (migrate_improves_locality(p, env)) {
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
- schedstat_inc(sd, lb_hot_gained[idle]);
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
- if (tsk_cache_hot) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- return 0;
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+
+ return 1;
}
- return 1;
+
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ return 0;
}
/*
@@ -3193,65 +5308,74 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
*
* Called with both runqueues locked.
*/
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle)
+static int move_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
- struct cfs_rq *cfs_rq;
- int pinned = 0;
- for_each_leaf_cfs_rq(busiest, cfs_rq) {
- list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
- if (throttled_lb_pair(task_group(p),
- busiest->cpu, this_cpu))
- break;
-
- if (!can_migrate_task(p, busiest, this_cpu,
- sd, idle, &pinned))
- continue;
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (!can_migrate_task(p, env))
+ continue;
- pull_task(busiest, p, this_rq, this_cpu);
- /*
- * Right now, this is only the second place pull_task()
- * is called, so we can safely collect pull_task()
- * stats here rather than inside pull_task().
- */
- schedstat_inc(sd, lb_gained[idle]);
- return 1;
- }
+ move_task(p, env);
+ /*
+ * Right now, this is only the second place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
}
-
return 0;
}
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *lb_flags,
- struct cfs_rq *busiest_cfs_rq)
+static const unsigned int sched_nr_migrate_break = 32;
+
+/*
+ * move_tasks tries to move up to imbalance weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
{
- int loops = 0, pulled = 0;
- long rem_load_move = max_load_move;
- struct task_struct *p, *n;
+ struct list_head *tasks = &env->src_rq->cfs_tasks;
+ struct task_struct *p;
+ unsigned long load;
+ int pulled = 0;
- if (max_load_move == 0)
- goto out;
+ if (env->imbalance <= 0)
+ return 0;
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
- list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
- if (loops++ > sysctl_sched_nr_migrate) {
- *lb_flags |= LBF_NEED_BREAK;
+ env->loop++;
+ /* We've more or less seen every task there is, call it quits */
+ if (env->loop > env->loop_max)
+ break;
+
+ /* take a breather every nr_migrate tasks */
+ if (env->loop > env->loop_break) {
+ env->loop_break += sched_nr_migrate_break;
+ env->flags |= LBF_NEED_BREAK;
break;
}
- if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle,
- lb_flags))
- continue;
+ if (!can_migrate_task(p, env))
+ goto next;
+
+ load = task_h_load(p);
+
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- pull_task(busiest, p, this_rq, this_cpu);
+ if ((load / 2) > env->imbalance)
+ goto next;
+
+ move_task(p, env);
pulled++;
- rem_load_move -= p->se.load.weight;
+ env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
@@ -3259,273 +5383,209 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
- if (idle == CPU_NEWLY_IDLE) {
- *lb_flags |= LBF_ABORT;
+ if (env->idle == CPU_NEWLY_IDLE)
break;
- }
#endif
/*
* We only want to steal up to the prescribed amount of
* weighted load.
*/
- if (rem_load_move <= 0)
+ if (env->imbalance <= 0)
break;
+
+ continue;
+next:
+ list_move_tail(&p->se.group_node, tasks);
}
-out:
+
/*
- * Right now, this is one of only two places pull_task() is called,
- * so we can safely collect pull_task() stats here rather than
- * inside pull_task().
+ * Right now, this is one of only two places move_task() is called,
+ * so we can safely collect move_task() stats here rather than
+ * inside move_task().
*/
- schedstat_add(sd, lb_gained[idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], pulled);
- return max_load_move - rem_load_move;
+ return pulled;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
- return 0;
+ if (se) {
+ update_entity_load_avg(se, 1);
+ /*
+ * We pivot on our runnable average having decayed to zero for
+ * list removal. This generally implies that all our children
+ * have also been removed (modulo rounding error or bandwidth
+ * control); however, such cases are rare and we can fix these
+ * at enqueue.
+ *
+ * TODO: fix up out-of-order children on enqueue.
+ */
+ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+ list_del_leaf_cfs_rq(cfs_rq);
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+ update_rq_runnable_avg(rq, rq->nr_running);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
- rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Note: We may want to consider periodically releasing
+ * rq->lock about these updates so that creating many task
+ * groups does not result in continually extending hold time.
+ */
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
}
- rcu_read_unlock();
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-static int tg_load_down(struct task_group *tg, void *data)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
+ struct rq *rq = rq_of(cfs_rq);
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ unsigned long now = jiffies;
unsigned long load;
- long cpu = (long)data;
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
+ if (cfs_rq->last_h_load_update == now)
+ return;
- tg->cfs_rq[cpu]->h_load = load;
+ cfs_rq->h_load_next = NULL;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_load_next = se;
+ if (cfs_rq->last_h_load_update == now)
+ break;
+ }
- return 0;
-}
+ if (!se) {
+ cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->last_h_load_update = now;
+ }
-static void update_h_load(long cpu)
-{
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+ while ((se = cfs_rq->h_load_next) != NULL) {
+ load = cfs_rq->h_load;
+ load = div64_ul(load * se->avg.load_avg_contrib,
+ cfs_rq->runnable_load_avg + 1);
+ cfs_rq = group_cfs_rq(se);
+ cfs_rq->h_load = load;
+ cfs_rq->last_h_load_update = now;
+ }
}
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
{
- long rem_load_move = max_load_move;
- struct cfs_rq *busiest_cfs_rq;
-
- rcu_read_lock();
- update_h_load(cpu_of(busiest));
-
- for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
- unsigned long busiest_h_load = busiest_cfs_rq->h_load;
- unsigned long busiest_weight = busiest_cfs_rq->load.weight;
- u64 rem_load, moved_load;
-
- if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
- break;
-
- /*
- * empty group or part of a throttled hierarchy
- */
- if (!busiest_cfs_rq->task_weight ||
- throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
- continue;
-
- rem_load = (u64)rem_load_move * busiest_weight;
- rem_load = div_u64(rem_load, busiest_h_load + 1);
-
- moved_load = balance_tasks(this_rq, this_cpu, busiest,
- rem_load, sd, idle, lb_flags,
- busiest_cfs_rq);
-
- if (!moved_load)
- continue;
-
- moved_load *= busiest_h_load;
- moved_load = div_u64(moved_load, busiest_weight + 1);
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
- rem_load_move -= moved_load;
- if (rem_load_move < 0)
- break;
- }
- rcu_read_unlock();
-
- return max_load_move - rem_load_move;
+ update_cfs_rq_h_load(cfs_rq);
+ return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
+ cfs_rq->runnable_load_avg + 1);
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
{
- return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, lb_flags,
- &busiest->cfs);
+ return p->se.avg.load_avg_contrib;
}
#endif
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
-{
- unsigned long total_load_moved = 0, load_moved;
-
- do {
- load_moved = load_balance_fair(this_rq, this_cpu, busiest,
- max_load_move - total_load_moved,
- sd, idle, lb_flags);
-
- total_load_moved += load_moved;
-
- if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
- break;
-
-#ifdef CONFIG_PREEMPT
- /*
- * NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
- * the critical section.
- */
- if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
- *lb_flags |= LBF_ABORT;
- break;
- }
-#endif
- } while (load_moved && max_load_move > total_load_moved);
-
- return total_load_moved > 0;
-}
-
/********** Helpers for find_busiest_group ************************/
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
- */
-struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *this; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
-
- /** Statistics of this group */
- unsigned long this_load;
- unsigned long this_load_per_task;
- unsigned long this_nr_running;
- unsigned long this_has_capacity;
- unsigned int this_idle_cpus;
-
- /* Statistics of the busiest group */
- unsigned int busiest_idle_cpus;
- unsigned long max_load;
- unsigned long busiest_load_per_task;
- unsigned long busiest_nr_running;
- unsigned long busiest_group_capacity;
- unsigned long busiest_has_capacity;
- unsigned int busiest_group_weight;
-
- int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance; /* Is powersave balance needed for this sd */
- struct sched_group *group_min; /* Least loaded group in sd */
- struct sched_group *group_leader; /* Group which relieves group_min */
- unsigned long min_load_per_task; /* load_per_task in group_min */
- unsigned long leader_nr_running; /* Nr running of group_leader */
- unsigned long min_nr_running; /* Nr running of group_min */
-#endif
-};
-
-/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long load_per_task;
unsigned long group_capacity;
- unsigned long idle_cpus;
- unsigned long group_weight;
+ unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int group_capacity_factor;
+ unsigned int idle_cpus;
+ unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
- int group_has_capacity; /* Is there extra capacity in the group? */
+ int group_has_free_capacity;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
};
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+ /*
+ * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+ * local_stat because update_sg_lb_stats() does a full clear/assignment.
+ * We must however clear busiest_stat::avg_load because
+ * update_sd_pick_busiest() reads this before assignment.
+ */
+ *sds = (struct sd_lb_stats){
+ .busiest = NULL,
+ .local = NULL,
+ .total_load = 0UL,
+ .total_capacity = 0UL,
+ .busiest_stat = {
+ .avg_load = 0UL,
+ },
+ };
+}
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ *
+ * Return: The load index.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
@@ -3548,159 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
+static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
{
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- sds->power_savings_balance = 0;
- else {
- sds->power_savings_balance = 1;
- sds->min_nr_running = ULONG_MAX;
- sds->leader_nr_running = 0;
- }
+ return SCHED_CAPACITY_SCALE;
}
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
-
- if (!sds->power_savings_balance)
- return;
-
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
- !sds->this_nr_running))
- sds->power_savings_balance = 0;
-
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!sds->power_savings_balance ||
- sgs->sum_nr_running >= sgs->group_capacity ||
- !sgs->sum_nr_running)
- return;
-
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sgs->sum_nr_running < sds->min_nr_running) ||
- (sgs->sum_nr_running == sds->min_nr_running &&
- group_first_cpu(group) > group_first_cpu(sds->group_min))) {
- sds->group_min = group;
- sds->min_nr_running = sgs->sum_nr_running;
- sds->min_load_per_task = sgs->sum_weighted_load /
- sgs->sum_nr_running;
- }
-
- /*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sgs->sum_nr_running + 1 > sgs->group_capacity)
- return;
-
- if (sgs->sum_nr_running > sds->leader_nr_running ||
- (sgs->sum_nr_running == sds->leader_nr_running &&
- group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
- sds->group_leader = group;
- sds->leader_nr_running = sgs->sum_nr_running;
- }
+ return default_scale_capacity(sd, cpu);
}
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- * under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- if (!sds->power_savings_balance)
- return 0;
-
- if (sds->this != sds->group_leader ||
- sds->group_leader == sds->group_min)
- return 0;
-
- *imbalance = sds->min_load_per_task;
- sds->busiest = sds->group_min;
-
- return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
- return;
-}
-
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- return SCHED_POWER_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -3710,87 +5628,146 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_power(sd, cpu);
+ return default_scale_smt_capacity(sd, cpu);
}
-unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
+ s64 delta;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
- if (unlikely(total < rq->rt_avg)) {
- /* Ensures that power won't end up being negative */
+ delta = rq_clock(rq) - age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
+
+ if (unlikely(total < avg)) {
+ /* Ensures that capacity won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
- if (unlikely((s64)total < SCHED_POWER_SCALE))
- total = SCHED_POWER_SCALE;
+ if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
+ total = SCHED_CAPACITY_SCALE;
- total >>= SCHED_POWER_SHIFT;
+ total >>= SCHED_CAPACITY_SHIFT;
return div_u64(available, total);
}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_POWER_SCALE;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_smt_power(sd, cpu);
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_smt_capacity(sd, cpu);
else
- power *= default_scale_smt_power(sd, cpu);
+ capacity *= default_scale_smt_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
}
- sdg->sgp->power_orig = power;
+ sdg->sgc->capacity_orig = capacity;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_freq_capacity(sd, cpu);
else
- power *= default_scale_freq_power(sd, cpu);
+ capacity *= default_scale_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
- power *= scale_rt_power(cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity *= scale_rt_capacity(cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
- if (!power)
- power = 1;
+ if (!capacity)
+ capacity = 1;
- cpu_rq(cpu)->cpu_power = power;
- sdg->sgp->power = power;
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
}
-void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power;
+ unsigned long capacity, capacity_orig;
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+ sdg->sgc->next_update = jiffies + interval;
if (!child) {
- update_cpu_power(sd, cpu);
+ update_cpu_capacity(sd, cpu);
return;
}
- power = 0;
+ capacity_orig = capacity = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ struct sched_group_capacity *sgc;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * build_sched_domains() -> init_sched_groups_capacity()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
+ *
+ * This avoids capacity/capacity_orig from being 0 and
+ * causing divide-by-zero issues on boot.
+ *
+ * Runtime updates will correct capacity_orig.
+ */
+ if (unlikely(!rq->sd)) {
+ capacity_orig += capacity_of(cpu);
+ capacity += capacity_of(cpu);
+ continue;
+ }
+
+ sgc = rq->sd->groups->sgc;
+ capacity_orig += sgc->capacity_orig;
+ capacity += sgc->capacity;
+ }
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
- sdg->sgp->power = power;
+ group = child->groups;
+ do {
+ capacity_orig += group->sgc->capacity_orig;
+ capacity += group->sgc->capacity;
+ group = group->next;
+ } while (group != child->groups);
+ }
+
+ sdg->sgc->capacity_orig = capacity_orig;
+ sdg->sgc->capacity = capacity;
}
/*
@@ -3804,142 +5781,157 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_POWER_SCALE
+ * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
*/
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ if (!(sd->flags & SD_SHARE_CPUCAPACITY))
return 0;
/*
- * If ~90% of the cpu_power is still there, we're good.
+ * If ~90% of the cpu_capacity is still there, we're good.
*/
- if (group->sgp->power * 32 > group->sgp->power_orig * 29)
+ if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
return 1;
return 0;
}
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+static inline int sg_imbalanced(struct sched_group *group)
+{
+ return group->sgc->imbalance;
+}
+
+/*
+ * Compute the group capacity factor.
+ *
+ * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
+ * first dividing out the smt factor and computing the actual number of cores
+ * and limit unit capacity with that.
+ */
+static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+{
+ unsigned int capacity_factor, smt, cpus;
+ unsigned int capacity, capacity_orig;
+
+ capacity = group->sgc->capacity;
+ capacity_orig = group->sgc->capacity_orig;
+ cpus = group->group_weight;
+
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
+ capacity_factor = cpus / smt; /* cores */
+
+ capacity_factor = min_t(unsigned,
+ capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
+
+ return capacity_factor;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
+ * @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
-static inline void update_sg_lb_stats(struct sched_domain *sd,
- struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx,
- int local_group, const struct cpumask *cpus,
- int *balance, struct sg_lb_stats *sgs)
+static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sched_group *group, int load_idx,
+ int local_group, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
+ unsigned long load;
int i;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long avg_load_per_task = 0;
- if (local_group)
- balance_cpu = group_first_cpu(group);
+ memset(sgs, 0, sizeof(*sgs));
- /* Tally up the load of all CPUs in the group */
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
- max_nr_running = 0;
-
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
-
+ if (local_group)
load = target_load(i, load_idx);
- } else {
+ else
load = source_load(i, load_idx);
- if (load > max_cpu_load) {
- max_cpu_load = load;
- max_nr_running = rq->nr_running;
- }
- if (min_cpu_load > load)
- min_cpu_load = load;
- }
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
}
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (idle != CPU_NEWLY_IDLE && local_group) {
- if (balance_cpu != this_cpu) {
- *balance = 0;
- return;
- }
- update_group_power(sd, this_cpu);
- }
-
- /* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of a task.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
if (sgs->sum_nr_running)
- avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
- sgs->group_imb = 1;
-
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
- SCHED_POWER_SCALE);
- if (!sgs->group_capacity)
- sgs->group_capacity = fix_small_capacity(sd, group);
sgs->group_weight = group->group_weight;
- if (sgs->group_capacity > sgs->sum_nr_running)
- sgs->group_has_capacity = 1;
+ sgs->group_imb = sg_imbalanced(group);
+ sgs->group_capacity_factor = sg_capacity_factor(env, group);
+
+ if (sgs->group_capacity_factor > sgs->sum_nr_running)
+ sgs->group_has_free_capacity = 1;
}
/**
* update_sd_pick_busiest - return 1 on busiest group
- * @sd: sched_domain whose statistics are to be checked
+ * @env: The load balancing environment.
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
- * @this_cpu: the current cpu
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
*/
-static bool update_sd_pick_busiest(struct sched_domain *sd,
+static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
- struct sg_lb_stats *sgs,
- int this_cpu)
+ struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->max_load)
+ if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
- if (sgs->sum_nr_running > sgs->group_capacity)
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
return true;
if (sgs->group_imb)
@@ -3950,8 +5942,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- this_cpu < group_first_cpu(sg)) {
+ if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -3962,79 +5954,101 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
return false;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
+ * @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = sd->child;
- struct sched_group *sg = sd->groups;
- struct sg_lb_stats sgs;
+ struct sched_domain *child = env->sd->child;
+ struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
- init_sd_power_savings_stats(sd, sds, idle);
- load_idx = get_sd_load_idx(sd, idle);
+ load_idx = get_sd_load_idx(env->sd, env->idle);
do {
+ struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
- memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
- local_group, cpus, balance, &sgs);
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ if (local_group) {
+ sds->local = sg;
+ sgs = &sds->local_stat;
- if (local_group && !(*balance))
- return;
+ if (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
+ }
+
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
- sds->total_load += sgs.group_load;
- sds->total_pwr += sg->sgp->power;
+ if (local_group)
+ goto next_group;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity to one so that we'll try
+ * first, lower the sg capacity factor to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity. The
+ * these excess tasks, i.e. nr_running < group_capacity_factor. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling && !local_group && sds->this_has_capacity)
- sgs.group_capacity = min(sgs.group_capacity, 1UL);
+ if (prefer_sibling && sds->local &&
+ sds->local_stat.group_has_free_capacity)
+ sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
- if (local_group) {
- sds->this_load = sgs.avg_load;
- sds->this = sg;
- sds->this_nr_running = sgs.sum_nr_running;
- sds->this_load_per_task = sgs.sum_weighted_load;
- sds->this_has_capacity = sgs.group_has_capacity;
- sds->this_idle_cpus = sgs.idle_cpus;
- } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
- sds->max_load = sgs.avg_load;
+ if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
- sds->busiest_nr_running = sgs.sum_nr_running;
- sds->busiest_idle_cpus = sgs.idle_cpus;
- sds->busiest_group_capacity = sgs.group_capacity;
- sds->busiest_load_per_task = sgs.sum_weighted_load;
- sds->busiest_has_capacity = sgs.group_has_capacity;
- sds->busiest_group_weight = sgs.group_weight;
- sds->group_imb = sgs.group_imb;
+ sds->busiest_stat = *sgs;
}
- update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+next_group:
+ /* Now, start updating sd_lb_stats */
+ sds->total_load += sgs->group_load;
+ sds->total_capacity += sgs->group_capacity;
+
sg = sg->next;
- } while (sg != sd->groups);
+ } while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
@@ -4054,32 +6068,30 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
- * @sd: The sched_domain whose packing is to be checked.
+ * @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: returns amount of imbalanced due to packing.
*/
-static int check_asym_packing(struct sched_domain *sd,
- struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
int busiest_cpu;
- if (!(sd->flags & SD_ASYM_PACKING))
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
if (!sds->busiest)
return 0;
busiest_cpu = group_first_cpu(sds->busiest);
- if (this_cpu > busiest_cpu)
+ if (env->dst_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
- SCHED_POWER_SCALE);
+ env->imbalance = DIV_ROUND_CLOSEST(
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
+
return 1;
}
@@ -4087,110 +6099,117 @@ static int check_asym_packing(struct sched_domain *sd,
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
+ * @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
*/
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
+ struct sg_lb_stats *local, *busiest;
- if (sds->this_nr_running) {
- sds->this_load_per_task /= sds->this_nr_running;
- if (sds->busiest_load_per_task >
- sds->this_load_per_task)
- imbn = 1;
- } else
- sds->this_load_per_task =
- cpu_avg_load_per_task(this_cpu);
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (!local->sum_nr_running)
+ local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+ else if (busiest->load_per_task > local->load_per_task)
+ imbn = 1;
- scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->sgp->power;
+ scaled_busy_load_per_task =
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
- if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
- (scaled_busy_load_per_task * imbn)) {
- *imbalance = sds->busiest_load_per_task;
+ if (busiest->avg_load + scaled_busy_load_per_task >=
+ local->avg_load + (scaled_busy_load_per_task * imbn)) {
+ env->imbalance = busiest->load_per_task;
return;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
+ * however we may be able to increase total CPU capacity used by
* moving them.
*/
- pwr_now += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load);
- pwr_now /= SCHED_POWER_SCALE;
+ capa_now += busiest->group_capacity *
+ min(busiest->load_per_task, busiest->avg_load);
+ capa_now += local->group_capacity *
+ min(local->load_per_task, local->avg_load);
+ capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->sgp->power;
- if (sds->max_load > tmp)
- pwr_move += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load - tmp);
+ if (busiest->avg_load > scaled_busy_load_per_task) {
+ capa_move += busiest->group_capacity *
+ min(busiest->load_per_task,
+ busiest->avg_load - scaled_busy_load_per_task);
+ }
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->sgp->power <
- sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
- else
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->sgp->power;
- pwr_move += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load + tmp);
- pwr_move /= SCHED_POWER_SCALE;
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
+ } else {
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+ }
+ capa_move += local->group_capacity *
+ min(local->load_per_task, local->avg_load + tmp);
+ capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
- if (pwr_move > pwr_now)
- *imbalance = sds->busiest_load_per_task;
+ if (capa_move > capa_now)
+ env->imbalance = busiest->load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
+ * @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
*/
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
- unsigned long *imbalance)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
+ struct sg_lb_stats *local, *busiest;
- sds->busiest_load_per_task /= sds->busiest_nr_running;
- if (sds->group_imb) {
- sds->busiest_load_per_task =
- min(sds->busiest_load_per_task, sds->avg_load);
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (busiest->group_imb) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure cpu-load equilibrium, look at wider averages. XXX
+ */
+ busiest->load_per_task =
+ min(busiest->load_per_task, sds->avg_load);
}
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
- * its cpu_power, while calculating max_load..)
+ * its cpu_capacity, while calculating max_load..)
*/
- if (sds->max_load < sds->avg_load) {
- *imbalance = 0;
- return fix_small_imbalance(sds, this_cpu, imbalance);
+ if (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return fix_small_imbalance(env, sds);
}
- if (!sds->group_imb) {
+ if (!busiest->group_imb) {
/*
* Don't want to pull so many tasks that a group would go idle.
+ * Except of course for the group_imb case, since then we might
+ * have to drop below capacity to reach cpu-load equilibrium.
*/
- load_above_capacity = (sds->busiest_nr_running -
- sds->busiest_group_capacity);
+ load_above_capacity =
+ (busiest->sum_nr_running - busiest->group_capacity_factor);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
- load_above_capacity /= sds->busiest->sgp->power;
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
+ load_above_capacity /= busiest->group_capacity;
}
/*
@@ -4200,15 +6219,14 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* we also don't want to reduce the group load below the group capacity
* (so that we can implement power-savings policies etc). Thus we look
* for the minimum possible imbalance.
- * Be careful of negative numbers as they'll appear as very large values
- * with unsigned longs.
*/
- max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->sgp->power,
- (sds->avg_load - sds->this_load) * sds->this->sgp->power)
- / SCHED_POWER_SCALE;
+ env->imbalance = min(
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -4216,9 +6234,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < sds->busiest_load_per_task)
- return fix_small_imbalance(sds, this_cpu, imbalance);
-
+ if (env->imbalance < busiest->load_per_task)
+ return fix_small_imbalance(env, sds);
}
/******* find_busiest_group() helpers end here *********************/
@@ -4233,159 +6250,163 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- * be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
- * @cpus: The set of CPUs under consideration for load-balancing.
- * @balance: Pointer to a variable indicating if this_cpu
- * is the appropriate cpu to perform load balancing at this_level.
+ * @env: The load balancing environment.
*
- * Returns: - the busiest group if imbalance exists.
+ * Return: - The busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
*/
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- const struct cpumask *cpus, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
{
+ struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
- memset(&sds, 0, sizeof(sds));
+ init_sd_lb_stats(&sds);
/*
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+ update_sd_lb_stats(env, &sds);
+ local = &sds.local_stat;
+ busiest = &sds.busiest_stat;
- /*
- * this_cpu is not the appropriate cpu to perform load balancing at
- * this level.
- */
- if (!(*balance))
- goto ret;
-
- if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(sd, &sds, this_cpu, imbalance))
+ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || sds.busiest_nr_running == 0)
+ if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
- * work because they assumes all things are equal, which typically
+ * work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (sds.group_imb)
+ if (busiest->group_imb)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
- !sds.busiest_has_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
+ !busiest->group_has_free_capacity)
goto force_balance;
/*
* If the local group is more busy than the selected busiest group
* don't try and pull any tasks.
*/
- if (sds.this_load >= sds.max_load)
+ if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
- if (sds.this_load >= sds.avg_load)
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
- if (idle == CPU_IDLE) {
+ if (env->idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
- if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
- sds.busiest_nr_running <= sds.busiest_group_weight)
+ if ((local->idle_cpus < busiest->idle_cpus) &&
+ busiest->sum_nr_running <= busiest->group_weight)
goto out_balanced;
} else {
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ if (100 * busiest->avg_load <=
+ env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
- calculate_imbalance(&sds, this_cpu, imbalance);
+ calculate_imbalance(env, &sds);
return sds.busiest;
out_balanced:
- /*
- * There is no obvious imbalance. But check if we can do some balancing
- * to save power.
- */
- if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
- return sds.busiest;
-ret:
- *imbalance = 0;
+ env->imbalance = 0;
return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static struct rq *
-find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
- enum cpu_idle_type idle, unsigned long imbalance,
- const struct cpumask *cpus)
+static struct rq *find_busiest_queue(struct lb_env *env,
+ struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long max_load = 0;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
- for_each_cpu(i, sched_group_cpus(group)) {
- unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power,
- SCHED_POWER_SCALE);
- unsigned long wl;
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ unsigned long capacity, capacity_factor, wl;
+ enum fbq_type rt;
- if (!capacity)
- capacity = fix_small_capacity(sd, group);
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
- if (!cpumask_test_cpu(i, cpus))
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
continue;
- rq = cpu_rq(i);
+ capacity = capacity_of(i);
+ capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
+
wl = weighted_cpuload(i);
/*
* When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu power.
+ * which is not scaled with the cpu capacity.
*/
- if (capacity && rq->nr_running == 1 && wl > imbalance)
+ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu power, so that
- * the load can be moved away from the cpu that is potentially
- * running at a lower capacity.
+ * the weighted_cpuload() scaled with the cpu capacity, so
+ * that the load can be moved away from the cpu that is
+ * potentially running at a lower capacity.
+ *
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * multiplication to rid ourselves of the division works out
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
*/
- wl = (wl * SCHED_POWER_SCALE) / power;
-
- if (wl > max_load) {
- max_load = wl;
+ if (wl * busiest_capacity > busiest_load * capacity) {
+ busiest_load = wl;
+ busiest_capacity = capacity;
busiest = rq;
}
}
@@ -4400,42 +6421,21 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-static int need_active_balance(struct sched_domain *sd, int idle,
- int busiest_cpu, int this_cpu)
+static int need_active_balance(struct lb_env *env)
{
- if (idle == CPU_NEWLY_IDLE) {
+ struct sched_domain *sd = env->sd;
+
+ if (env->idle == CPU_NEWLY_IDLE) {
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* higher numbered CPUs in order to pack all tasks in the
* lowest numbered CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
return 1;
-
- /*
- * The only task running in a non-idle cpu can be moved to this
- * cpu in an attempt to completely freeup the other CPU
- * package.
- *
- * The package power saving logic comes from
- * find_busiest_group(). If there are no imbalance, then
- * f_b_g() will return NULL. However when sched_mc={1,2} then
- * f_b_g() will select a group from which a running task may be
- * pulled to this cpu in order to make the other package idle.
- * If there is no opportunity to make a package idle and if
- * there are no imbalance, then f_b_g() will return NULL and no
- * action will be taken in load_balance_newidle().
- *
- * Under normal task pull operation due to imbalance, there
- * will be more than one task in the source run queue and
- * move_tasks() will succeed. ld_moved will be true and this
- * active balance code will not be triggered.
- */
- if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
- return 0;
}
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4443,46 +6443,98 @@ static int need_active_balance(struct sched_domain *sd, int idle,
static int active_load_balance_cpu_stop(void *data);
+static int should_we_balance(struct lb_env *env)
+{
+ struct sched_group *sg = env->sd->groups;
+ struct cpumask *sg_cpus, *sg_mask;
+ int cpu, balance_cpu = -1;
+
+ /*
+ * In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ return 1;
+
+ sg_cpus = sched_group_cpus(sg);
+ sg_mask = sched_group_mask(sg);
+ /* Try to find first idle cpu */
+ for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ continue;
+
+ balance_cpu = cpu;
+ break;
+ }
+
+ if (balance_cpu == -1)
+ balance_cpu = group_balance_cpu(sg);
+
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above domains.
+ */
+ return balance_cpu == env->dst_cpu;
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance)
+ int *continue_balancing)
{
- int ld_moved, lb_flags = 0, active_balance = 0;
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
- unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+ struct cpumask *cpus = __get_cpu_var(load_balance_mask);
+
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ };
+
+ /*
+ * For NEWLY_IDLE load_balancing, we don't need to consider
+ * other cpus in our group
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ env.dst_grpmask = NULL;
cpumask_copy(cpus, cpu_active_mask);
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, idle,
- cpus, balance);
-
- if (*balance == 0)
+ if (!should_we_balance(&env)) {
+ *continue_balancing = 0;
goto out_balanced;
+ }
+ group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
- BUG_ON(busiest == this_rq);
+ BUG_ON(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], imbalance);
+ schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
if (busiest->nr_running > 1) {
@@ -4492,35 +6544,92 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- lb_flags |= LBF_ALL_PINNED;
+ env.flags |= LBF_ALL_PINNED;
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+
+more_balance:
local_irq_save(flags);
- double_rq_lock(this_rq, busiest);
- ld_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle, &lb_flags);
- double_rq_unlock(this_rq, busiest);
+ double_rq_lock(env.dst_rq, busiest);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = move_tasks(&env);
+ ld_moved += cur_ld_moved;
+ double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
/*
* some other cpu did the load balance for us.
*/
- if (ld_moved && this_cpu != smp_processor_id())
- resched_cpu(this_cpu);
+ if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+ resched_cpu(env.dst_cpu);
- if (lb_flags & LBF_ABORT)
- goto out_balanced;
+ if (env.flags & LBF_NEED_BREAK) {
+ env.flags &= ~LBF_NEED_BREAK;
+ goto more_balance;
+ }
- if (lb_flags & LBF_NEED_BREAK) {
- lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
- if (lb_flags & LBF_ABORT)
- goto out_balanced;
- goto redo;
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of cpus in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or a ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing exceess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
+ env.dst_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_DST_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
+
+ /*
+ * We failed to reach balance because of affinity.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ *group_imbalance = 1;
+ } else if (*group_imbalance)
+ *group_imbalance = 0;
}
/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+ if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus))
+ if (!cpumask_empty(cpus)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
goto redo;
+ }
goto out_balanced;
}
}
@@ -4536,7 +6645,7 @@ redo:
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
- if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+ if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -4547,7 +6656,7 @@ redo:
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
- lb_flags |= LBF_ALL_PINNED;
+ env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
@@ -4563,10 +6672,11 @@ redo:
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
- if (active_balance)
+ if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ }
/*
* We've kicked active balancing, reset the failure
@@ -4600,7 +6710,7 @@ out_balanced:
out_one_pinned:
/* tune up the balancing interval */
- if (((lb_flags & LBF_ALL_PINNED) &&
+ if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
@@ -4610,60 +6720,135 @@ out:
return ld_moved;
}
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ interval = get_sd_balance_interval(sd, cpu_busy);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-void idle_balance(int this_cpu, struct rq *this_rq)
+static int idle_balance(struct rq *this_rq)
{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
- unsigned long next_balance = jiffies + HZ;
+ u64 curr_cost = 0;
- this_rq->idle_stamp = this_rq->clock;
+ idle_enter_fair(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
- return;
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, 0, &next_balance);
+ rcu_read_unlock();
+
+ goto out;
+ }
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- unsigned long interval;
- int balance = 1;
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, 0, &next_balance);
+ break;
+ }
+
if (sd->flags & SD_BALANCE_NEWIDLE) {
- /* If we've pulled tasks over stop searching: */
+ t0 = sched_clock_cpu(this_cpu);
+
pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE, &balance);
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
}
- interval = msecs_to_jiffies(sd->balance_interval);
- if (time_after(next_balance, sd->last_balance + interval))
- next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ update_next_balance(sd, 0, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
break;
- }
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
- if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
- /*
- * We are going idle. next_balance may be set based on
- * a busy processor. So reset next_balance.
- */
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task) {
+ idle_exit_fair(this_rq);
+ this_rq->idle_stamp = 0;
}
+
+ return pulled_task;
}
/*
@@ -4710,10 +6895,18 @@ static int active_load_balance_cpu_stop(void *data)
}
if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
schedstat_inc(sd, alb_count);
- if (move_one_task(target_rq, target_cpu, busiest_rq,
- sd, CPU_IDLE))
+ if (move_one_task(&env))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -4726,7 +6919,12 @@ out_unlock:
return 0;
}
-#ifdef CONFIG_NO_HZ
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -4739,117 +6937,28 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu: The cpu whose lowest level of sched domain is to
- * be returned.
- * @flag: The flag to check for the lowest sched_domain
- * for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
- struct sched_domain *sd;
-
- for_each_domain(cpu, sd)
- if (sd->flags & flag)
- break;
-
- return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu: The cpu whose domains we're iterating over.
- * @sd: variable holding the value of the power_savings_sd
- * for cpu.
- * @flag: The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu: The cpu which is nominating a new idle_load_balancer.
- *
- * Returns: Returns the id of the idle load balancer if it exists,
- * Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
- struct sched_group *ilbg;
- struct sched_domain *sd;
-
- /*
- * Have idle load balancer selection from semi-idle packages only
- * when power-aware load balancing is enabled
- */
- if (!(sched_smt_power_savings || sched_mc_power_savings))
- goto out_done;
-
- /*
- * Optimize for the case when we have no idle CPUs or only one
- * idle CPU. Don't walk the sched_domain hierarchy in such cases
- */
- if (cpumask_weight(nohz.idle_cpus_mask) < 2)
- goto out_done;
-
- rcu_read_lock();
- for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilbg = sd->groups;
-
- do {
- if (ilbg->group_weight !=
- atomic_read(&ilbg->sgp->nr_busy_cpus)) {
- ilb = cpumask_first_and(nohz.idle_cpus_mask,
- sched_group_cpus(ilbg));
- goto unlock;
- }
-
- ilbg = ilbg->next;
- } while (ilbg != sd->groups);
- }
-unlock:
- rcu_read_unlock();
-
-out_done:
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
return nr_cpu_ids;
}
-#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
- return nr_cpu_ids;
-}
-#endif
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(int cpu)
+static void nohz_balancer_kick(void)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb(cpu);
+ ilb_cpu = find_new_ilb();
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -4866,11 +6975,16 @@ static void nohz_balancer_kick(int cpu)
return;
}
-static inline void clear_nohz_tick_stopped(int cpu)
+static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
+ /*
+ * Completely isolated CPUs don't ever set, so we must test.
+ */
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -4880,13 +6994,15 @@ static inline void set_cpu_sd_state_busy(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -4895,47 +7011,50 @@ void set_cpu_sd_state_idle(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
/*
- * This routine will record that this cpu is going idle with tick stopped.
+ * This routine will record that the cpu is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
-void select_nohz_load_balancer(int stop_tick)
+void nohz_balance_enter_idle(int cpu)
{
- int cpu = smp_processor_id();
-
/*
* If this cpu is going down, then nothing needs to be done.
*/
if (!cpu_active(cpu))
return;
- if (stop_tick) {
- if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
- return;
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+ return;
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
- set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- }
- return;
+ /*
+ * If we're a completely isolated CPU, we don't play.
+ */
+ if (on_null_domain(cpu_rq(cpu)))
+ return;
+
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
-static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+static int sched_ilb_notifier(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DYING:
- clear_nohz_tick_stopped(smp_processor_id());
+ nohz_balance_exit_idle(smp_processor_id());
return NOTIFY_OK;
default:
return NOTIFY_DONE;
@@ -4945,8 +7064,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
static DEFINE_SPINLOCK(balancing);
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
@@ -4960,50 +7077,69 @@ void update_max_interval(void)
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
- * Balancing parameters are set up in arch_init_sched_domains.
+ * Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
- int balance = 1;
- struct rq *rq = cpu_rq(cpu);
+ int continue_balancing = 1;
+ int cpu = rq->cpu;
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize;
+ int need_serialize, need_decay = 0;
+ u64 max_cost = 0;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
+ /*
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains. Decay ~1% per second.
+ */
+ if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+ sd->max_newidle_lb_cost =
+ (sd->max_newidle_lb_cost * 253) / 256;
+ sd->next_decay_max_lb_cost = jiffies + HZ;
+ need_decay = 1;
+ }
+ max_cost += sd->max_newidle_lb_cost;
+
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- interval = sd->balance_interval;
- if (idle != CPU_IDLE)
- interval *= sd->busy_factor;
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!continue_balancing) {
+ if (need_decay)
+ continue;
+ break;
+ }
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- interval = clamp(interval, 1UL, max_load_balance_interval);
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
need_serialize = sd->flags & SD_SERIALIZE;
-
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
- * We've pulled tasks over so either we're no
- * longer idle.
+ * The LBF_DST_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
*/
- idle = CPU_NOT_IDLE;
+ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -5012,14 +7148,14 @@ out:
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
-
+ }
+ if (need_decay) {
/*
- * Stop the load balance at this level. There is another
- * CPU in our sched group which is doing load balancing more
- * actively.
+ * Ensure the rq-wide value also decays but keep it at a
+ * reasonable floor to avoid funnies with rq->avg_idle.
*/
- if (!balance)
- break;
+ rq->max_idle_balance_cost =
+ max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
@@ -5032,14 +7168,14 @@ out:
rq->next_balance = next_balance;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- struct rq *this_rq = cpu_rq(this_cpu);
+ int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
@@ -5059,14 +7195,20 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
if (need_resched())
break;
- raw_spin_lock_irq(&this_rq->lock);
- update_rq_clock(this_rq);
- update_cpu_load(this_rq);
- raw_spin_unlock_irq(&this_rq->lock);
+ rq = cpu_rq(balance_cpu);
- rebalance_domains(balance_cpu, CPU_IDLE);
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ rebalance_domains(rq, CPU_IDLE);
+ }
- rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
@@ -5080,16 +7222,18 @@ end:
* of an idle cpu is the system.
* - This rq has more than one task.
* - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's power.
+ * busy cpu's exceeding the group's capacity.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
+static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
+ struct sched_group_capacity *sgc;
+ int nr_busy, cpu = rq->cpu;
- if (unlikely(idle_cpu(cpu)))
+ if (unlikely(rq->idle_balance))
return 0;
/*
@@ -5097,7 +7241,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
- clear_nohz_tick_stopped(cpu);
+ nohz_balance_exit_idle(cpu);
/*
* None are in tickless mode and hence no need for NOHZ idle load
@@ -5113,22 +7257,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- for_each_domain(cpu, sd) {
- struct sched_group *sg = sd->groups;
- struct sched_group_power *sgp = sg->sgp;
- int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
- goto need_kick_unlock;
+ if (sd) {
+ sgc = sd->groups->sgc;
+ nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
- && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+ if (nr_busy > 1)
goto need_kick_unlock;
-
- if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
- break;
}
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
rcu_read_unlock();
return 0;
@@ -5138,7 +7282,7 @@ need_kick:
return 1;
}
#else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
#endif
/*
@@ -5147,38 +7291,34 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
*/
static void run_rebalance_domains(struct softirq_action *h)
{
- int this_cpu = smp_processor_id();
- struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_cpu, idle);
+ rebalance_domains(this_rq, idle);
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- nohz_idle_balance(this_cpu, idle);
-}
-
-static inline int on_null_domain(int cpu)
-{
- return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+ nohz_idle_balance(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq)
{
/* Don't need to rebalance while attached to NULL domain */
- if (time_after_eq(jiffies, rq->next_balance) &&
- likely(!on_null_domain(cpu)))
+ if (unlikely(on_null_domain(rq)))
+ return;
+
+ if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ
- if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
- nohz_balancer_kick(cpu);
+#ifdef CONFIG_NO_HZ_COMMON
+ if (nohz_kick_needed(rq))
+ nohz_balancer_kick();
#endif
}
@@ -5190,6 +7330,9 @@ static void rq_online_fair(struct rq *rq)
static void rq_offline_fair(struct rq *rq)
{
update_sysctl();
+
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
@@ -5206,6 +7349,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ if (numabalancing_enabled)
+ task_tick_numa(rq, curr);
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -5228,11 +7376,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5282,15 +7434,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when its
+ * Ensure the task's vruntime is normalized, so that when it's
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it was on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it was !on_rq, then only when
+ * If it's on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !on_rq, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!se->on_rq && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -5298,6 +7450,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#ifdef CONFIG_SMP
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (se->avg.decay_count) {
+ __synchronize_entity_decay(se);
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ }
+#endif
}
/*
@@ -5305,7 +7469,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- if (!p->se.on_rq)
+ struct sched_entity *se = &p->se;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+ if (!se->on_rq)
return;
/*
@@ -5340,16 +7512,22 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
- INIT_LIST_HEAD(&cfs_rq->tasks);
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#ifdef CONFIG_SMP
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic_long_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5375,14 +7553,26 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;
if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(se);
+ se->vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * migrate_task_rq_fair() will have removed our previous
+ * contribution, but we must synchronize for ongoing future
+ * decay.
+ */
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5467,10 +7657,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5480,13 +7666,17 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
if (!se)
return;
- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }
se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}
@@ -5517,6 +7707,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /* Possible calls to update_curr() need rq clock */
+ update_rq_clock(rq);
for_each_sched_entity(se)
update_cfs_shares(group_cfs_rq(se));
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5550,7 +7743,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
@@ -5572,6 +7765,7 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
+ .migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
@@ -5611,7 +7805,8 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
+ nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d..90284d117fe 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
SCHED_FEAT(START_DEBIT, true)
/*
- * Based on load and program behaviour, see if it makes sense to place
- * a newly woken task on the same cpu as the task that woke it --
- * improve cache locality. Typically used with SYNC wakeups as
- * generated by pipes and the like, see also SYNC_WAKEUPS.
- */
-SCHED_FEAT(AFFINE_WAKEUPS, true)
-
-/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
@@ -40,25 +32,23 @@ SCHED_FEAT(LAST_BUDDY, true)
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
- * Use arch dependent cpu power functions
+ * Allow wakeup-time preemption of the current task:
*/
-SCHED_FEAT(ARCH_POWER, false)
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
+/*
+ * Use arch dependent cpu capacity functions
+ */
+SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
*/
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them
@@ -68,3 +58,28 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy. Enabled automatically
+ * at runtime if running on a NUMA machine. Can be controlled via
+ * numa_balancing=
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, false)
+
+/*
+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
+ * higher number of hinting faults are recorded during active load
+ * balancing.
+ */
+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
+
+/*
+ * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
+ * lower number of hinting faults have been recorded. As this has
+ * the potential to prevent a task ever migrating to a new node
+ * due to CPU overload it is disabled by default.
+ */
+SCHED_FEAT(NUMA_RESIST_LOWER, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 00000000000..cf009fb0bc2
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,273 @@
+/*
+ * Generic entry point for the idle threads
+ */
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/stackprotector.h>
+
+#include <asm/tlb.h>
+
+#include <trace/events/power.h>
+
+#include "sched.h"
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static inline int cpu_idle_poll(void)
+{
+ rcu_idle_enter();
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ while (!tif_need_resched())
+ cpu_relax();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ rcu_idle_exit();
+ return 1;
+}
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+ local_irq_enable();
+}
+
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
+ */
+static void cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state;
+ bool broadcast;
+
+ /*
+ * Check if the idle task must be rescheduled. If it is the
+ * case, exit the function after re-enabling the local irq.
+ */
+ if (need_resched()) {
+ local_irq_enable();
+ return;
+ }
+
+ /*
+ * During the idle period, stop measuring the disabled irqs
+ * critical sections latencies
+ */
+ stop_critical_timings();
+
+ /*
+ * Tell the RCU framework we are entering an idle section,
+ * so no more rcu read side critical sections and one more
+ * step to the grace period
+ */
+ rcu_idle_enter();
+
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch idle method on errors.
+ */
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0) {
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
+ }
+
+
+ /*
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+
+ /*
+ * Tell the time framework to switch to a broadcast timer
+ * because our local timer will be shutdown. If a local timer
+ * is used from another cpu as a broadcast timer, this call may
+ * fail if it is not available
+ */
+ if (broadcast &&
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ goto use_default;
+
+ trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+
+exit_idle:
+ __current_set_polling();
+
+ /*
+ * It is up to the idle functions to reenable local interrupts
+ */
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
+
+ rcu_idle_exit();
+ start_critical_timings();
+}
+
+/*
+ * Generic idle loop implementation
+ *
+ * Called with polling cleared.
+ */
+static void cpu_idle_loop(void)
+{
+ while (1) {
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if
+ * rq->curr != rq->idle). This means that, if rq->idle has
+ * the polling bit set, then setting need_resched is
+ * guaranteed to cause the cpu to reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id()))
+ arch_cpu_idle_dead();
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin.
+ *
+ * Also if we detected in the wakeup from idle
+ * path that the tick broadcast device expired
+ * for us, we don't want to go deep idle as we
+ * know that the IPI is going to arrive right
+ * away
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+
+ arch_cpu_idle_exit();
+ }
+
+ /*
+ * Since we fell out of the loop above, we know
+ * TIF_NEED_RESCHED must be set, propagate it into
+ * PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will
+ * not have had an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending and reschedule
+ * if need_resched is set while polling is set. That
+ * means that clearing polling needs to be visible
+ * before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
+ }
+}
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ /*
+ * This #ifdef needs to die, but it's too late in the cycle to
+ * make this generic (arm and sh have never invoked the canary
+ * init for the non boot cpus!). Will be fixed in 3.11
+ */
+#ifdef CONFIG_X86
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. The boot CPU already has it initialized but no harm
+ * in doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+#endif
+ arch_cpu_idle_prepare();
+ cpu_idle_loop();
+}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f28..879f2b75266 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,16 +4,17 @@
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
- * handled in sched_fair.c)
+ * handled in sched/fair.c)
*/
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif /* CONFIG_SMP */
+
/*
* Idle tasks are unconditionally rescheduled:
*/
@@ -22,10 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_task(rq->idle);
}
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
@@ -44,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 00000000000..16f5a30f9c8
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
+/*
+ * kernel/sched/proc.c
+ *
+ * Kernel load calculations, forked from sched/core.c
+ */
+
+#include <linux/export.h>
+
+#include "sched.h"
+
+unsigned long this_cpu_load(void)
+{
+ struct rq *this = this_rq();
+ return this->cpu_load[0];
+}
+
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
+ long delta;
+
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
+ delta = calc_load_fold_active(this_rq);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
+}
+
+void calc_load_exit_idle(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the sample window, we're done.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+
+ return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+ long delta, active, n;
+
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ long active, delta;
+
+ if (time_before(jiffies, calc_load_update + 10))
+ return;
+
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+
+ /*
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+ */
+ calc_global_nohz();
+}
+
+/*
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long delta;
+
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ delta = calc_load_fold_active(this_rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * End of global load-average stuff
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_SMP
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->cfs.runnable_load_avg;
+}
+#else
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->load.weight;
+}
+#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long load = get_rq_runnable_load(this_rq);
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = get_rq_runnable_load(this_rq);
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
+
+ calc_load_account_active(this_rq);
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7fb5ec..a49083192c6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
#include <linux/slab.h>
+int sched_rr_timeslice = RR_TIMESLICE;
+
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
@@ -77,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -110,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
void free_rt_sched_group(struct task_group *tg)
{
int i;
@@ -209,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
}
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
return &rq->rt;
}
@@ -227,6 +244,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
+static int pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ return rq->rt.highest_prio.curr > prev->prio;
+}
+
static inline int rt_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->rto_count);
@@ -244,8 +269,10 @@ static inline void rt_set_overload(struct rq *rq)
* if we should look at the mask. It would be a shame
* if we looked at the mask, but the mask was not
* updated yet.
+ *
+ * Matched by the barrier in pull_rt_task().
*/
- wmb();
+ smp_wmb();
atomic_inc(&rq->rd->rto_count);
}
@@ -274,13 +301,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -288,13 +318,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -305,6 +338,15 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
+static inline void set_post_schedule(struct rq *rq)
+{
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+}
+
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -349,8 +391,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_rt_task(struct rq *this_rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -391,20 +449,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
(iter = next_task_group(iter)) && \
(rt_rq = iter->rt_rq[cpu_of(rq)]);)
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
- list_add_rcu(&rt_rq->leaf_rt_rq_list,
- &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
- list_del_rcu(&rt_rq->leaf_rt_rq_list);
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
- list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -426,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
- if (rt_se && !on_rt_rq(rt_se))
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
+
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
@@ -440,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (rt_se && on_rt_rq(rt_se))
+ if (!rt_se)
+ dequeue_top_rt_rq(rt_rq);
+ else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
}
@@ -464,7 +513,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
- return cpu_rq(smp_processor_id())->rd->span;
+ return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
@@ -501,17 +550,6 @@ typedef struct rt_rq *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
- for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
@@ -522,12 +560,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_running)
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_task(rq->curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
+ dequeue_top_rt_rq(rt_rq);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -553,6 +597,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
@@ -560,7 +612,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
@@ -685,20 +737,12 @@ balanced:
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
}
-static void disable_runtime(struct rq *rq)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __disable_runtime(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static void __enable_runtime(struct rq *rq)
{
rt_rq_iter_t iter;
@@ -723,37 +767,6 @@ static void __enable_runtime(struct rq *rq)
}
}
-static void enable_runtime(struct rq *rq)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __enable_runtime(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
- switch (action) {
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- disable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- enable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- default:
- return NOTIFY_DONE;
- }
-}
-
static int balance_runtime(struct rt_rq *rt_rq)
{
int more = 0;
@@ -778,13 +791,23 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
- int i, idle = 1;
+ int i, idle = 1, throttled = 0;
const struct cpumask *span;
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return 1;
-
span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
@@ -818,12 +841,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (!rt_rq_throttled(rt_rq))
enqueue = 1;
}
+ if (rt_rq->rt_throttled)
+ throttled = 1;
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
raw_spin_unlock(&rq->lock);
}
+ if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+ return 1;
+
return idle;
}
@@ -855,8 +883,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
if (rt_rq->rt_time > runtime) {
- rt_rq->rt_throttled = 1;
- printk_once(KERN_WARNING "sched: RT throttling activated\n");
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ /*
+ * Don't actually throttle groups that have no runtime assigned
+ * but accrue some time due to boosting.
+ */
+ if (likely(rt_b->rt_runtime)) {
+ rt_rq->rt_throttled = 1;
+ printk_deferred_once("sched: RT throttling activated\n");
+ } else {
+ /*
+ * In case we did anyway, make it go away,
+ * replenishment is a joke, since it will replenish us
+ * with exactly 0 ns.
+ */
+ rt_rq->rt_time = 0;
+ }
+
if (rt_rq_throttled(rt_rq)) {
sched_rt_rq_dequeue(rt_rq);
return 1;
@@ -874,22 +918,22 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- delta_exec = rq->clock_task - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
- schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock_task;
+ curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -898,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
@@ -910,6 +954,38 @@ static void update_curr_rt(struct rq *rq)
}
}
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+ if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+ return;
+
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+}
+
#if defined CONFIG_SMP
static void
@@ -917,6 +993,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -926,6 +1009,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
@@ -1019,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running++;
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1036,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
@@ -1059,9 +1160,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
- if (!rt_rq->rt_nr_running)
- list_add_leaf_rt_rq(rt_rq);
-
if (head)
list_add(&rt_se->run_list, queue);
else
@@ -1081,8 +1179,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
- if (!rt_rq->rt_nr_running)
- list_del_leaf_rt_rq(rt_rq);
}
/*
@@ -1098,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
back = rt_se;
}
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1106,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
+ enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
@@ -1121,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
+ enqueue_top_rt_rq(&rq->rt);
}
/*
@@ -1136,10 +1240,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
-
- inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1150,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
-
- dec_nr_running(rq);
}
/*
@@ -1192,15 +1292,12 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
- int cpu;
-
- cpu = task_cpu(p);
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
goto out;
/* For anything but wake ups, just return the task_cpu */
@@ -1235,9 +1332,8 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (curr->rt.nr_cpus_allowed < 2 ||
- curr->prio <= p->prio) &&
- (p->rt.nr_cpus_allowed > 1)) {
+ (curr->nr_cpus_allowed < 2 ||
+ curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1251,10 +1347,10 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->rt.nr_cpus_allowed == 1)
+ if (rq->curr->nr_cpus_allowed == 1)
return;
- if (p->rt.nr_cpus_allowed != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1321,15 +1417,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
- struct rt_rq *rt_rq;
-
- rt_rq = &rq->rt;
-
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
- return NULL;
+ struct rt_rq *rt_rq = &rq->rt;
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1338,26 +1426,48 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
} while (rt_rq);
p = rt_task_of(rt_se);
- p->se.exec_start = rq->clock_task;
+ p->se.exec_start = rq_clock_task(rq);
return p;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (need_pull_rt_task(rq, prev)) {
+ pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
+ */
+ if (unlikely((rq->stop && rq->stop->on_rq) ||
+ rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }
+
+ /*
+ * We may dequeue prev's rt_rq in put_prev_task().
+ * So, we update time before rt_nr_running check.
+ */
+ if (prev->sched_class == &rt_sched_class)
+ update_curr_rt(rq);
+
+ if (!rt_rq->rt_queued)
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
-#ifdef CONFIG_SMP
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
-#endif
+ set_post_schedule(rq);
return p;
}
@@ -1370,7 +1480,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1382,48 +1492,29 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->rt.nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
return 0;
}
-/* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+/*
+ * Return the highest pushable rq's task, which is suitable to be executed
+ * on the cpu, NULL otherwise
+ */
+static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
- struct task_struct *next = NULL;
- struct sched_rt_entity *rt_se;
- struct rt_prio_array *array;
- struct rt_rq *rt_rq;
- int idx;
-
- for_each_leaf_rt_rq(rt_rq, rq) {
- array = &rt_rq->active;
- idx = sched_find_first_bit(array->bitmap);
-next_idx:
- if (idx >= MAX_RT_PRIO)
- continue;
- if (next && next->prio < idx)
- continue;
- list_for_each_entry(rt_se, array->queue + idx, run_list) {
- struct task_struct *p;
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *p;
- if (!rt_entity_is_task(rt_se))
- continue;
+ if (!has_pushable_tasks(rq))
+ return NULL;
- p = rt_task_of(rt_se);
- if (pick_rt_task(rq, p, cpu)) {
- next = p;
- break;
- }
- }
- if (!next) {
- idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
- goto next_idx;
- }
+ plist_for_each_entry(p, head, pushable_tasks) {
+ if (pick_rt_task(rq, p, cpu))
+ return p;
}
- return next;
+ return NULL;
}
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1439,7 +1530,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (task->rt.nr_cpus_allowed == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1531,7 +1622,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
task_running(rq, task) ||
!task->on_rq)) {
- raw_spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
break;
}
@@ -1561,7 +1652,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
@@ -1587,11 +1678,6 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- if (unlikely(task_running(rq, next_task)))
- return 0;
-#endif
-
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
@@ -1677,6 +1763,12 @@ static int pull_rt_task(struct rq *this_rq)
if (likely(!rt_overloaded(this_rq)))
return 0;
+ /*
+ * Match the barrier from rt_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the rto_mask bit.
+ */
+ smp_rmb();
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1702,12 +1794,10 @@ static int pull_rt_task(struct rq *this_rq)
double_lock_balance(this_rq, src_rq);
/*
- * Are there still pullable RT tasks?
+ * We can pull only a task, which is pushable
+ * on its rq, and no others.
*/
- if (src_rq->rt.rt_nr_running <= 1)
- goto skip;
-
- p = pick_next_highest_task_rt(src_rq, this_cpu);
+ p = pick_highest_pushable_task(src_rq, this_cpu);
/*
* Do we have an RT task that preempts
@@ -1747,13 +1837,6 @@ skip:
return ret;
}
-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull RT tasks here if we lower this rq's prio */
- if (rq->rt.highest_prio.curr > prev->prio)
- pull_rt_task(rq);
-}
-
static void post_schedule_rt(struct rq *rq)
{
push_rt_tasks(rq);
@@ -1768,9 +1851,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1 &&
- rt_task(rq->curr) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -1778,44 +1861,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_rt(struct task_struct *p,
const struct cpumask *new_mask)
{
- int weight = cpumask_weight(new_mask);
+ struct rq *rq;
+ int weight;
BUG_ON(!rt_task(p));
- /*
- * Update the migration status of the RQ if we have an RT task
- * which is running AND changing its weight value.
- */
- if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
- struct rq *rq = task_rq(p);
-
- if (!task_current(rq, p)) {
- /*
- * Make sure we dequeue this task from the pushable list
- * before going further. It will either remain off of
- * the list because we are no longer pushable, or it
- * will be requeued.
- */
- if (p->rt.nr_cpus_allowed > 1)
- dequeue_pushable_task(rq, p);
+ if (!p->on_rq)
+ return;
- /*
- * Requeue if our weight is changing and still > 1
- */
- if (weight > 1)
- enqueue_pushable_task(rq, p);
+ weight = cpumask_weight(new_mask);
- }
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
+ return;
- if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
- rq->rt.rt_nr_migratory++;
- } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- }
+ rq = task_rq(p);
- update_rt_migration(&rq->rt);
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_task(rq, p);
+ BUG_ON(!rq->rt.rt_nr_migratory);
+ rq->rt.rt_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_task(rq, p);
+ rq->rt.rt_nr_migratory++;
}
+
+ update_rt_migration(&rq->rt);
}
/* Assumes rq->lock is held */
@@ -1853,11 +1932,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (p->on_rq && !rq->rt.rt_nr_running)
- pull_rt_task(rq);
+ if (!p->on_rq || rq->rt.rt_nr_running)
+ return;
+
+ if (pull_rt_task(rq))
+ resched_task(rq->curr);
}
-void init_sched_rt_class(void)
+void __init init_sched_rt_class(void)
{
unsigned int i;
@@ -1886,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->rt.overloaded && push_rt_task(rq) &&
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
- rq != task_rq(p))
+ push_rt_task(rq) && rq != task_rq(p))
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
@@ -1949,7 +2031,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
if (soft != RLIM_INFINITY) {
unsigned long next;
- p->rt.timeout++;
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -1958,6 +2044,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+
update_curr_rt(rq);
watchdog(rq, p);
@@ -1972,15 +2060,18 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
if (--p->rt.time_slice)
return;
- p->rt.time_slice = DEF_TIMESLICE;
+ p->rt.time_slice = sched_rr_timeslice;
/*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
+ * Requeue to the end of queue if we (and all of our ancestors) are not
+ * the only element on the queue
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
+ }
}
}
@@ -1988,7 +2079,7 @@ static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
- p->se.exec_start = rq->clock_task;
+ p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
@@ -2000,7 +2091,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
* Time slice is 0 for SCHED_FIFO tasks
*/
if (task->policy == SCHED_RR)
- return DEF_TIMESLICE;
+ return sched_rr_timeslice;
else
return 0;
}
@@ -2022,7 +2113,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db..31cc02ebc54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,57 +1,90 @@
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/tick.h>
+#include <linux/slab.h>
#include "cpupri.h"
+#include "cpudeadline.h"
+#include "cpuacct.h"
+
+struct rq;
extern __read_mostly int scheduler_running;
-/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
+extern unsigned long calc_load_update;
+extern atomic_long_t calc_load_tasks;
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+extern long calc_load_fold_active(struct rq *this_rq);
+extern void update_cpu_load_active(struct rq *this_rq);
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
+# define SCHED_LOAD_RESOLUTION 10
+# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION 0
+# define scale_load(w) (w)
+# define scale_load_down(w) (w)
+#endif
+
+#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
/*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9 -> just above 0.5us
+ */
+#define DL_SCALE (10)
+
+/*
* These are the 'tuning knobs' of the scheduler:
- *
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
*/
-#define DEF_TIMESLICE (100 * HZ / 1000)
/*
* single value that denotes runtime == period, ie unlimited time.
*/
#define RUNTIME_INF ((u64)~0ULL)
+static inline int fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
static inline int rt_policy(int policy)
{
- if (policy == SCHED_FIFO || policy == SCHED_RR)
- return 1;
- return 0;
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
+
+static inline int dl_policy(int policy)
+{
+ return policy == SCHED_DEADLINE;
}
static inline int task_has_rt_policy(struct task_struct *p)
@@ -59,6 +92,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
return rt_policy(p->policy);
}
+static inline int task_has_dl_policy(struct task_struct *p)
+{
+ return dl_policy(p->policy);
+}
+
+static inline bool dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+/*
+ * Tells if entity @a should preempt entity @b.
+ */
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+{
+ return dl_time_before(a->deadline, b->deadline);
+}
+
/*
* This is the priority-queue data structure of the RT scheduling class:
*/
@@ -74,6 +126,47 @@ struct rt_bandwidth {
u64 rt_runtime;
struct hrtimer rt_period_timer;
};
+/*
+ * To keep the bandwidth of -deadline tasks and groups under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of the system (the group);
+ * - cache the fraction of that bandwidth that is currently allocated.
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * meaning that:
+ * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ * - dl_total_bw array contains, in the i-eth element, the currently
+ * allocated bandwidth on the i-eth CPU.
+ * Moreover, groups consume bandwidth on each CPU, while tasks only
+ * consume bandwidth on the CPU they're running on.
+ * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
+ * that will be shown the next time the proc or cgroup controls will
+ * be red. It on its turn can be changed by writing on its own
+ * control.
+ */
+struct dl_bandwidth {
+ raw_spinlock_t dl_runtime_lock;
+ u64 dl_runtime;
+ u64 dl_period;
+};
+
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+extern struct dl_bw *dl_bw_of(int i);
+
+struct dl_bw {
+ raw_spinlock_t lock;
+ u64 bw, total_bw;
+};
extern struct mutex sched_domains_mutex;
@@ -84,7 +177,7 @@ extern struct mutex sched_domains_mutex;
struct cfs_rq;
struct rt_rq;
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -115,7 +208,10 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
- atomic_t load_weight;
+#ifdef CONFIG_SMP
+ atomic_long_t load_avg;
+ atomic_t runnable_avg;
+#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -154,11 +250,6 @@ struct task_group {
#define MAX_SHARES (1UL << 18)
#endif
-/* Default task group.
- * Every task in system belong to this group at bootup.
- */
-extern struct task_group root_task_group;
-
typedef int (*tg_visitor)(struct task_group *, void *);
extern int walk_tg_tree_from(struct task_group *from,
@@ -187,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void free_rt_sched_group(struct task_group *tg);
@@ -196,6 +287,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+#endif
+
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
@@ -205,7 +308,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running, h_nr_running;
+ unsigned int nr_running, h_nr_running;
u64 exec_clock;
u64 min_vruntime;
@@ -216,9 +319,6 @@ struct cfs_rq {
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
- struct list_head tasks;
- struct list_head *balance_iterator;
-
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
@@ -229,26 +329,22 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
- /*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
- */
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
-
#ifdef CONFIG_SMP
/*
- * the part of load.weight contributed by tasks
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
*/
- unsigned long task_weight;
+ unsigned long runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter;
+ u64 last_decay;
+ atomic_long_t removed_load;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* Required to track per-cpu representation of a task_group */
+ u32 tg_runnable_contrib;
+ unsigned long tg_load_contrib;
/*
* h_load = weight * f(tg)
@@ -257,26 +353,33 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
- * Maintaining per-cpu shares distribution for group scheduling
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
*
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
*/
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
- unsigned long load_contribution;
-#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
- u64 throttled_timestamp;
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -291,7 +394,7 @@ static inline int rt_bandwidth_enabled(void)
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
- unsigned long rt_nr_running;
+ unsigned int rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
@@ -306,6 +409,8 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ int rt_queued;
+
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -316,11 +421,45 @@ struct rt_rq {
unsigned long rt_nr_boosted;
struct rq *rq;
- struct list_head leaf_rt_rq_list;
struct task_group *tg;
#endif
};
+/* Deadline class' related fields in a runqueue */
+struct dl_rq {
+ /* runqueue is an rbtree, ordered by deadline */
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+
+ unsigned long dl_nr_running;
+
+#ifdef CONFIG_SMP
+ /*
+ * Deadline values of the currently executing and the
+ * earliest ready task on this rq. Caching these facilitates
+ * the decision wether or not a ready but not running task
+ * should migrate somewhere else.
+ */
+ struct {
+ u64 curr;
+ u64 next;
+ } earliest_dl;
+
+ unsigned long dl_nr_migratory;
+ int overloaded;
+
+ /*
+ * Tasks on this rq that can be pushed away. They are kept in
+ * an rb-tree, ordered by tasks' deadlines, with caching
+ * of the leftmost (earliest deadline) element.
+ */
+ struct rb_root pushable_dl_tasks_root;
+ struct rb_node *pushable_dl_tasks_leftmost;
+#else
+ struct dl_bw dl_bw;
+#endif
+};
+
#ifdef CONFIG_SMP
/*
@@ -339,6 +478,15 @@ struct root_domain {
cpumask_var_t online;
/*
+ * The bit corresponding to a CPU gets set here if such CPU has more
+ * than one runnable -deadline task (as it is below for RT tasks).
+ */
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+ /*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
@@ -365,14 +513,21 @@ struct rq {
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
- unsigned long nr_running;
+ unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ unsigned long last_sched_tick;
+#endif
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
@@ -382,14 +537,14 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
+ struct dl_rq dl;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- struct list_head leaf_rt_rq_list;
-#endif
+
+ struct sched_avg avg;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* This is part of a global counter where only the total sum
@@ -412,7 +567,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
- unsigned long cpu_power;
+ unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
@@ -424,10 +579,15 @@ struct rq {
int cpu;
int online;
+ struct list_head cfs_tasks;
+
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
+
+ /* This is used to determine avg_idle's max value */
+ u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -462,7 +622,6 @@ struct rq {
unsigned int yld_count;
/* schedule() stats */
- unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
@@ -493,8 +652,26 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+static inline u64 rq_clock(struct rq *rq)
+{
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ return rq->clock_task;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
+extern int migrate_task_to(struct task_struct *p, int cpu);
+extern int migrate_swap(struct task_struct *, struct task_struct *);
+#endif /* CONFIG_NUMA_BALANCING */
+
#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -534,8 +711,87 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
return hsd;
}
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+
+struct sched_group_capacity {
+ atomic_t ref;
+ /*
+ * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * for a single CPU.
+ */
+ unsigned int capacity, capacity_orig;
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
+ /*
+ * Number of busy cpus in this group.
+ */
+ atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_capacity *sgc;
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgc->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_cpus(group));
+}
+
+extern int group_balance_cpu(struct sched_group *sg);
+
+#else
+
+static inline void sched_ttwu_pending(void) { }
#endif /* CONFIG_SMP */
@@ -547,22 +803,19 @@ DECLARE_PER_CPU(int, sd_llc_id);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -604,6 +857,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
+ p->wake_cpu = cpu;
#endif
}
@@ -611,7 +865,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
-# include <linux/jump_label.h>
+# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
@@ -630,18 +884,18 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct jump_label_key *key)
+static __always_inline bool static_branch__true(struct static_key *key)
{
- return likely(static_branch(key)); /* Not out of line branch. */
+ return static_key_true(key); /* Not out of line branch. */
}
-static __always_inline bool static_branch__false(struct jump_label_key *key)
+static __always_inline bool static_branch__false(struct static_key *key)
{
- return unlikely(static_branch(key)); /* Out of line branch. */
+ return static_key_false(key); /* Out of line branch. */
}
#define SCHED_FEAT(name, enabled) \
-static __always_inline bool static_branch_##name(struct jump_label_key *key) \
+static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
return static_branch__##enabled(key); \
}
@@ -650,12 +904,24 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
#undef SCHED_FEAT
-extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
+extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#ifdef CONFIG_SCHED_DEBUG
+#define numabalancing_enabled sched_feat_numa(NUMA)
+#else
+extern bool numabalancing_enabled;
+#endif /* CONFIG_SCHED_DEBUG */
+#else
+#define sched_feat_numa(x) (0)
+#define numabalancing_enabled (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -669,8 +935,6 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
-
-
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
@@ -692,6 +956,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
@@ -742,11 +1009,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
*/
next->on_cpu = 1;
#endif
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- raw_spin_unlock_irq(&rq->lock);
-#else
raw_spin_unlock(&rq->lock);
-#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -760,30 +1023,16 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
smp_wmb();
prev->on_cpu = 0;
#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
-#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
-static inline void update_load_add(struct load_weight *lw, unsigned long inc)
-{
- lw->weight += inc;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
-{
- lw->weight -= dec;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_set(struct load_weight *lw, unsigned long w)
-{
- lw->weight = w;
- lw->inv_weight = 0;
-}
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x4 /* internal use, task got migrated */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -838,20 +1087,85 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
- CPUACCT_STAT_USER, /* ... user mode */
- CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+#define ENQUEUE_WAKEUP 1
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
+#define ENQUEUE_REPLENISH 8
+
+#define DEQUEUE_SLEEP 1
+
+#define RETRY_TASK ((void *)-1UL)
+
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
+ */
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+
+ void (*post_schedule) (struct rq *this_rq);
+ void (*task_waking) (struct task_struct *task);
+ void (*task_woken) (struct rq *this_rq, struct task_struct *task);
- CPUACCT_STAT_NSTATS,
+ void (*set_cpus_allowed)(struct task_struct *p,
+ const struct cpumask *newmask);
+
+ void (*rq_online)(struct rq *rq);
+ void (*rq_offline)(struct rq *rq);
+#endif
+
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+ void (*task_fork) (struct task_struct *p);
+ void (*task_dead) (struct task_struct *p);
+
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ int oldprio);
+
+ unsigned int (*get_rr_interval) (struct rq *rq,
+ struct task_struct *task);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ void (*task_move_group) (struct task_struct *p, int on_rq);
+#endif
};
+static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ prev->sched_class->put_prev_task(rq, prev);
+}
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
extern const struct sched_class stop_sched_class;
+extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
@@ -859,24 +1173,28 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
-extern void trigger_load_balance(struct rq *rq, int cpu);
-extern void idle_balance(int this_cpu, struct rq *this_rq);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
-#else /* CONFIG_SMP */
+extern void trigger_load_balance(struct rq *rq);
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+
+#else
+
+static inline void idle_enter_fair(struct rq *rq) { }
+static inline void idle_exit_fair(struct rq *rq) { }
#endif
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
-extern void update_group_power(struct sched_domain *sd, int cpu);
-extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
+
+extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
+extern void init_sched_dl_class(void);
extern void resched_task(struct task_struct *p);
extern void resched_cpu(int cpu);
@@ -884,52 +1202,43 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-extern void update_cpu_load(struct rq *this_rq);
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
+extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-#ifdef CONFIG_CGROUP_CPUACCT
-#include <linux/cgroup.h>
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct kernel_cpustat __percpu *cpustat;
-};
+unsigned long to_ratio(u64 period, u64 runtime);
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
-}
+extern void update_idle_cpu_load(struct rq *this_rq);
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
+extern void init_task_runnable_average(struct task_struct *p);
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+static inline void add_nr_running(struct rq *rq, unsigned count)
{
- if (!ca || !ca->css.cgroup->parent)
- return NULL;
- return cgroup_ca(ca->css.cgroup->parent);
-}
+ unsigned prev_nr = rq->nr_running;
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+ rq->nr_running = prev_nr + count;
+
+#ifdef CONFIG_NO_HZ_FULL
+ if (prev_nr < 2 && rq->nr_running >= 2) {
+ if (tick_nohz_full_cpu(rq->cpu)) {
+ /* Order rq->nr_running write against the IPI */
+ smp_wmb();
+ smp_send_reschedule(rq->cpu);
+ }
+ }
#endif
+}
-static inline void inc_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running++;
+ rq->nr_running -= count;
}
-static inline void dec_nr_running(struct rq *rq)
+static inline void rq_last_tick_reset(struct rq *rq)
{
- rq->nr_running--;
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = jiffies;
+#endif
}
extern void update_rq_clock(struct rq *rq);
@@ -948,8 +1257,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1067,6 +1374,33 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ raw_spin_lock(l1);
+ raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
/*
* double_rq_lock - safely lock two runqueues
*
@@ -1151,16 +1485,65 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
+extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
NOHZ_TICK_STOPPED,
NOHZ_BALANCE_KICK,
- NOHZ_IDLE,
};
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+DECLARE_PER_CPU(u64, cpu_hardirq_time);
+DECLARE_PER_CPU(u64, cpu_softirq_time);
+
+#ifndef CONFIG_64BIT
+DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e19..a476bea17fb 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,20 +21,23 @@ static int show_schedstat(struct seq_file *seq, void *v)
if (mask_str == NULL)
return -ENOMEM;
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+ "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
cpu, rq->yld_count,
- rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
@@ -77,30 +80,61 @@ static int show_schedstat(struct seq_file *seq, void *v)
return 0;
}
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
+ unsigned long n = *offset;
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &schedstat_sops);
}
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release,
};
static int __init proc_schedstat_init(void)
@@ -108,4 +142,4 @@ static int __init proc_schedstat_init(void)
proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0;
}
-module_init(proc_schedstat_init);
+subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5..4ab70433965 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
-static inline void sched_info_dequeued(struct task_struct *t)
+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = task_rq(t)->clock, delta = 0;
+ unsigned long long now = rq_clock(rq), delta = 0;
if (unlikely(sched_info_on()))
if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(task_rq(t), delta);
+ rq_sched_info_dequeued(rq, delta);
}
/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
-static void sched_info_arrive(struct task_struct *t)
+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = task_rq(t)->clock, delta = 0;
+ unsigned long long now = rq_clock(rq), delta = 0;
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- rq_sched_info_arrive(task_rq(t), delta);
+ rq_sched_info_arrive(rq, delta);
}
/*
@@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct task_struct *t)
+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
- t->sched_info.last_queued = task_rq(t)->clock;
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
*/
-static inline void sched_info_depart(struct task_struct *t)
+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
- unsigned long long delta = task_rq(t)->clock -
+ unsigned long long delta = rq_clock(rq) -
t->sched_info.last_arrival;
- rq_sched_info_depart(task_rq(t), delta);
+ rq_sched_info_depart(rq, delta);
if (t->state == TASK_RUNNING)
- sched_info_queued(t);
+ sched_info_queued(rq, t);
}
/*
@@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
{
- struct rq *rq = task_rq(prev);
-
/*
* prev now departs the cpu. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
- sched_info_depart(prev);
+ sched_info_depart(rq, prev);
if (next != rq->idle)
- sched_info_arrive(next);
+ sched_info_arrive(rq, next);
}
static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
- __sched_info_switch(prev, next);
+ __sched_info_switch(rq, prev, next);
}
#else
-#define sched_info_queued(t) do { } while (0)
+#define sched_info_queued(rq, t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
-#define sched_info_dequeued(t) do { } while (0)
-#define sched_info_switch(t, next) do { } while (0)
+#define sched_info_dequeued(rq, t) do { } while (0)
+#define sched_info_depart(rq, t) do { } while (0)
+#define sched_info_arrive(rq, next) do { } while (0)
+#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
@@ -162,6 +165,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
*/
/**
+ * cputimer_running - return true if cputimer is running
+ *
+ * @tsk: Pointer to target task.
+ */
+static inline bool cputimer_running(struct task_struct *tsk)
+
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return false;
+
+ /*
+ * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
+ * in __exit_signal(), we won't account to the signal struct further
+ * cputime consumed by that task, even though the task can still be
+ * ticking after __exit_signal().
+ *
+ * In order to keep a consistent behaviour between thread group cputime
+ * and thread group cputimer accounting, lets also ignore the cputime
+ * elapsing after __exit_signal() in any thread group timer running.
+ *
+ * This makes sure that POSIX CPU clocks and timers are synchronized, so
+ * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
+ * clock delta is behind the expiring timer value.
+ */
+ if (unlikely(!tsk->sighand))
+ return false;
+
+ return true;
+}
+
+/**
* account_group_user_time - Maintain utime for a thread group.
*
* @tsk: Pointer to task structure.
@@ -176,7 +212,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
@@ -199,7 +235,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
@@ -222,7 +258,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd2..bfe0edadbfb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -23,26 +23,31 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
- return stop;
+ if (!stop || !stop->on_rq)
+ return NULL;
- return NULL;
+ put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
}
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)
@@ -52,6 +57,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +80,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq_clock_task(rq);
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
@@ -83,7 +106,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
- .next = &rt_sched_class,
+ .next = &dl_sched_class,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index 7fdd9eaca2c..0ffa20ae657 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -1,7 +1,7 @@
/*
* Generic waiting primitives.
*
- * (C) 2004 William Irwin, Oracle
+ * (C) 2004 Nadia Yvette Chambers, Oracle
*/
#include <linux/init.h>
#include <linux/export.h>
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = 1; /* XXX WF_SYNC */
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
* wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ wait->private = current;
+ wait->func = autoremove_wake_function;
+
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
/**
* finish_wait - clean up after waiting in a queue
* @q: waitqueue waited on
@@ -267,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit);
*
* In order for this to function properly, as it uses waitqueue_active()
* internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_clear_bit(), but in some
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
* cases where bitflags are manipulated non-atomically under a lock, one
* may need to use a less regular barrier, such fs/inode.c's smp_mb(),
* because spin_unlock() does not guarantee a memory barrier.
@@ -287,3 +414,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)
return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
}
EXPORT_SYMBOL(bit_waitqueue);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+ if (BITS_PER_LONG == 64) {
+ unsigned long q = (unsigned long)p;
+ return bit_waitqueue((void *)(q & ~1), q & 1);
+ }
+ return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+ void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 0)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
+ * return codes halt waiting and return.
+ */
+static __sched
+int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(atomic_t *), unsigned mode)
+{
+ atomic_t *val;
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ val = q->key.flags;
+ if (atomic_read(val) == 0)
+ break;
+ ret = (*action)(val);
+ } while (!ret && atomic_read(val) != 0);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p) \
+ struct wait_bit_queue name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wait = { \
+ .private = current, \
+ .func = wake_atomic_t_function, \
+ .task_list = \
+ LIST_HEAD_INIT((name).wait.task_list), \
+ }, \
+ }
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ wait_queue_head_t *wq = atomic_t_waitqueue(p);
+ DEFINE_WAIT_ATOMIC_T(wait, p);
+
+ return __wait_on_atomic_t(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on a atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not the by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895e..301bbc24739 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,365 @@
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
- * This defines a simple but solid secure-computing mode.
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ * of Berkeley Packet Filters/Linux Socket Filters.
*/
+#include <linux/atomic.h>
#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <asm/syscall.h>
+#include <linux/filter.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ * get/put helpers should be used when accessing an instance
+ * outside of a lifetime-guarded section. In general, this
+ * is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insnsi: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer. For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory. This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+ atomic_t usage;
+ struct seccomp_filter *prev;
+ struct sk_filter *prog;
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+/*
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static void populate_seccomp_data(struct seccomp_data *sd)
+{
+ struct task_struct *task = current;
+ struct pt_regs *regs = task_pt_regs(task);
+ unsigned long args[6];
+
+ sd->nr = syscall_get_nr(task, regs);
+ sd->arch = syscall_get_arch();
+ syscall_get_arguments(task, regs, 0, 6, args);
+ sd->args[0] = args[0];
+ sd->args[1] = args[1];
+ sd->args[2] = args[2];
+ sd->args[3] = args[3];
+ sd->args[4] = args[4];
+ sd->args[5] = args[5];
+ sd->instruction_pointer = KSTK_EIP(task);
+}
+
+/**
+ * seccomp_check_filter - verify seccomp filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Takes a previously checked filter (by sk_chk_filter) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load. It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+ int pc;
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ ftest->code = BPF_LDX | BPF_W | BPF_ABS;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct seccomp_data) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_LD | BPF_W | BPF_LEN:
+ ftest->code = BPF_LD | BPF_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ case BPF_LDX | BPF_W | BPF_LEN:
+ ftest->code = BPF_LDX | BPF_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ case BPF_MISC | BPF_TAX:
+ case BPF_MISC | BPF_TXA:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ case BPF_JMP | BPF_JA:
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(int syscall)
+{
+ struct seccomp_filter *f;
+ struct seccomp_data sd;
+ u32 ret = SECCOMP_RET_ALLOW;
+
+ /* Ensure unexpected behavior doesn't result in failing open. */
+ if (WARN_ON(current->seccomp.filter == NULL))
+ return SECCOMP_RET_KILL;
+
+ populate_seccomp_data(&sd);
+
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+ * value always takes priority (ignoring the DATA).
+ */
+ for (f = current->seccomp.filter; f; f = f->prev) {
+ u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
+
+ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ ret = cur_ret;
+ }
+ return ret;
+}
+
+/**
+ * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * @fprog: BPF program to install
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+static long seccomp_attach_filter(struct sock_fprog *fprog)
+{
+ struct seccomp_filter *filter;
+ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
+ unsigned long total_insns = fprog->len;
+ struct sock_filter *fp;
+ int new_len;
+ long ret;
+
+ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ for (filter = current->seccomp.filter; filter; filter = filter->prev)
+ total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /*
+ * Installing a seccomp filter requires that the task has
+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+ * This avoids scenarios where unprivileged tasks can affect the
+ * behavior of privileged children.
+ */
+ if (!current->no_new_privs &&
+ security_capable_noaudit(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN) != 0)
+ return -EACCES;
+
+ fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
+ if (!fp)
+ return -ENOMEM;
+
+ /* Copy the instructions from fprog. */
+ ret = -EFAULT;
+ if (copy_from_user(fp, fprog->filter, fp_size))
+ goto free_prog;
+
+ /* Check and rewrite the fprog via the skb checker */
+ ret = sk_chk_filter(fp, fprog->len);
+ if (ret)
+ goto free_prog;
+
+ /* Check and rewrite the fprog for seccomp use */
+ ret = seccomp_check_filter(fp, fprog->len);
+ if (ret)
+ goto free_prog;
+
+ /* Convert 'sock_filter' insns to 'sock_filter_int' insns */
+ ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
+ if (ret)
+ goto free_prog;
+
+ /* Allocate a new seccomp_filter */
+ ret = -ENOMEM;
+ filter = kzalloc(sizeof(struct seccomp_filter),
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ goto free_prog;
+
+ filter->prog = kzalloc(sk_filter_size(new_len),
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter->prog)
+ goto free_filter;
+
+ ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
+ if (ret)
+ goto free_filter_prog;
+ kfree(fp);
+
+ atomic_set(&filter->usage, 1);
+ filter->prog->len = new_len;
+
+ sk_filter_select_runtime(filter->prog);
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+ return 0;
+
+free_filter_prog:
+ kfree(filter->prog);
+free_filter:
+ kfree(filter);
+free_prog:
+ kfree(fp);
+ return ret;
+}
+
+/**
+ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+static long seccomp_attach_user_filter(char __user *user_filter)
+{
+ struct sock_fprog fprog;
+ long ret = -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+ goto out;
+ ret = seccomp_attach_filter(&fprog);
+out:
+ return ret;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ if (!orig)
+ return;
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&orig->usage);
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* Clean up single-reference branches iteratively. */
+ while (orig && atomic_dec_and_test(&orig->usage)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ sk_filter_free(freeme->prog);
+ kfree(freeme);
+ }
+}
+
+/**
+ * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+static void seccomp_send_sigsys(int syscall, int reason)
+{
+ struct siginfo info;
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch();
+ info.si_syscall = syscall;
+ force_sig_info(SIGSYS, &info, current);
+}
+#endif /* CONFIG_SECCOMP_FILTER */
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +380,15 @@ static int mode1_syscalls_32[] = {
};
#endif
-void __secure_computing(int this_syscall)
+int __secure_computing(int this_syscall)
{
int mode = current->seccomp.mode;
- int * syscall;
+ int exit_sig = 0;
+ int *syscall;
+ u32 ret;
switch (mode) {
- case 1:
+ case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (is_compat_task())
@@ -45,9 +396,61 @@ void __secure_computing(int this_syscall)
#endif
do {
if (*syscall == this_syscall)
- return;
+ return 0;
} while (*++syscall);
+ exit_sig = SIGKILL;
+ ret = SECCOMP_RET_KILL;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER: {
+ int data;
+ struct pt_regs *regs = task_pt_regs(current);
+ ret = seccomp_run_filters(this_syscall);
+ data = ret & SECCOMP_RET_DATA;
+ ret &= SECCOMP_RET_ACTION;
+ switch (ret) {
+ case SECCOMP_RET_ERRNO:
+ /* Set the low-order 16-bits as a errno. */
+ syscall_set_return_value(current, regs,
+ -data, 0);
+ goto skip;
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, regs);
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+ case SECCOMP_RET_TRACE:
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
+ goto skip;
+ }
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ break;
+ if (syscall_get_nr(current, regs) < 0)
+ goto skip; /* Explicit request to skip. */
+
+ return 0;
+ case SECCOMP_RET_ALLOW:
+ return 0;
+ case SECCOMP_RET_KILL:
+ default:
+ break;
+ }
+ exit_sig = SIGSYS;
break;
+ }
+#endif
default:
BUG();
}
@@ -55,8 +458,13 @@ void __secure_computing(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall);
- do_exit(SIGKILL);
+ audit_seccomp(this_syscall, exit_sig, ret);
+ do_exit(exit_sig);
+#ifdef CONFIG_SECCOMP_FILTER
+skip:
+ audit_seccomp(this_syscall, exit_sig, ret);
+#endif
+ return -1;
}
long prctl_get_seccomp(void)
@@ -64,25 +472,48 @@ long prctl_get_seccomp(void)
return current->seccomp.mode;
}
-long prctl_set_seccomp(unsigned long seccomp_mode)
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * This function may be called repeatedly with a @seccomp_mode of
+ * SECCOMP_MODE_FILTER to install additional filters. Every filter
+ * successfully installed will be evaluated (in reverse order) for each system
+ * call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
- long ret;
+ long ret = -EINVAL;
- /* can set it only once to be even more secure */
- ret = -EPERM;
- if (unlikely(current->seccomp.mode))
+ if (current->seccomp.mode &&
+ current->seccomp.mode != seccomp_mode)
goto out;
- ret = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ ret = 0;
#ifdef TIF_NOTSC
disable_TSC();
#endif
- ret = 0;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ ret = seccomp_attach_user_filter(filter);
+ if (ret)
+ goto out;
+ break;
+#endif
+ default:
+ goto out;
}
- out:
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+out:
return ret;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index c73c4284160..a4077e90f19 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
+#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
@@ -29,6 +30,11 @@
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
+#include <linux/uprobes.h>
+#include <linux/compat.h>
+#include <linux/cn_proc.h>
+#include <linux/compiler.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -36,6 +42,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
+#include <asm/cacheflush.h>
#include "audit.h" /* audit_signal_info() */
/*
@@ -58,21 +65,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
-static int sig_task_ignored(struct task_struct *t, int sig,
- int from_ancestor_ns)
+static int sig_task_ignored(struct task_struct *t, int sig, bool force)
{
void __user *handler;
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !from_ancestor_ns)
+ handler == SIG_DFL && !force)
return 1;
return sig_handler_ignored(handler, sig);
}
-static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
+static int sig_ignored(struct task_struct *t, int sig, bool force)
{
/*
* Blocked signals are never ignored, since the
@@ -82,7 +88,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, from_ancestor_ns))
+ if (!sig_task_ignored(t, sig, force))
return 0;
/*
@@ -160,7 +166,7 @@ void recalc_sigpending(void)
#define SYNCHRONOUS_MASK \
(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
- sigmask(SIGTRAP) | sigmask(SIGFPE))
+ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
int next_signal(struct sigpending *pending, sigset_t *mask)
{
@@ -271,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
{
if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
task->jobctl &= ~JOBCTL_TRAPPING;
+ smp_mb(); /* advised by wake_up_bit() */
wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}
}
@@ -482,6 +489,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
+#ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
@@ -677,23 +687,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
-void signal_wake_up(struct task_struct *t, int resume)
+void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
- unsigned int mask;
-
set_tsk_thread_flag(t, TIF_SIGPENDING);
-
/*
- * For SIGKILL, we want to wake it up in the stopped/traced/killable
+ * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
- mask = TASK_INTERRUPTIBLE;
- if (resume)
- mask |= TASK_WAKEKILL;
- if (!wake_up_state(t, mask))
+ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
@@ -702,11 +706,8 @@ void signal_wake_up(struct task_struct *t, int resume)
* Returns 1 if any signals were found.
*
* All callers must be holding the siglock.
- *
- * This version takes a sigset mask and looks at all signals,
- * not just those in the first mask word.
*/
-static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
+static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
@@ -724,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
}
return 1;
}
-/*
- * Remove signals in mask from the pending set and queue.
- * Returns 1 if any signals were found.
- *
- * All callers must be holding the siglock.
- */
-static int rm_from_queue(unsigned long mask, struct sigpending *s)
-{
- struct sigqueue *q, *n;
-
- if (!sigtestsetmask(&s->signal, mask))
- return 0;
-
- sigdelsetmask(&s->signal, mask);
- list_for_each_entry_safe(q, n, &s->list, list) {
- if (q->info.si_signo < SIGRTMIN &&
- (mask & sigmask(q->info.si_signo))) {
- list_del_init(&q->list);
- __sigqueue_free(q);
- }
- }
- return 1;
-}
static inline int is_si_special(const struct siginfo *info)
{
@@ -767,14 +745,13 @@ static int kill_ok_by_cred(struct task_struct *t)
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
- if (cred->user->user_ns == tcred->user->user_ns &&
- (cred->euid == tcred->suid ||
- cred->euid == tcred->uid ||
- cred->uid == tcred->suid ||
- cred->uid == tcred->uid))
+ if (uid_eq(cred->euid, tcred->suid) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->uid, tcred->suid) ||
+ uid_eq(cred->uid, tcred->uid))
return 1;
- if (ns_capable(tcred->user->user_ns, CAP_KILL))
+ if (ns_capable(tcred->user_ns, CAP_KILL))
return 1;
return 0;
@@ -842,7 +819,7 @@ static void ptrace_trap_notify(struct task_struct *t)
assert_spin_locked(&t->sighand->siglock);
task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
- signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+ ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}
/*
@@ -855,12 +832,15 @@ static void ptrace_trap_notify(struct task_struct *t)
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
-static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
+static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
+ sigset_t flush;
- if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
+ if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
+ if (signal->flags & SIGNAL_GROUP_COREDUMP)
+ return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
*/
@@ -868,26 +848,25 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
/*
* This is a stop signal. Remove SIGCONT from all queues.
*/
- rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
- t = p;
- do {
- rm_from_queue(sigmask(SIGCONT), &t->pending);
- } while_each_thread(p, t);
+ siginitset(&flush, sigmask(SIGCONT));
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&flush, &t->pending);
} else if (sig == SIGCONT) {
unsigned int why;
/*
* Remove all stop signals from all queues, wake all threads.
*/
- rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
- t = p;
- do {
+ siginitset(&flush, SIG_KERNEL_STOP_MASK);
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t) {
+ flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
if (likely(!(t->ptrace & PT_SEIZED)))
wake_up_state(t, __TASK_STOPPED);
else
ptrace_trap_notify(t);
- } while_each_thread(p, t);
+ }
/*
* Notify the parent with CLD_CONTINUED if we were stopped.
@@ -915,7 +894,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
}
}
- return !sig_ignored(p, sig, from_ancestor_ns);
+ return !sig_ignored(p, sig, force);
}
/*
@@ -1020,15 +999,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
-/*
- * map the uid in struct cred into user namespace *ns
- */
-static inline uid_t map_cred_ns(const struct cred *cred,
- struct user_namespace *ns)
-{
- return user_ns_map_uid(ns, cred, cred->uid);
-}
-
#ifdef CONFIG_USER_NS
static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
{
@@ -1038,8 +1008,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
if (SI_FROMKERNEL(info))
return;
- info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
- current_cred(), info->si_uid);
+ rcu_read_lock();
+ info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
+ make_kuid(current_user_ns(), info->si_uid));
+ rcu_read_unlock();
}
#else
static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@ -1054,13 +1026,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
struct sigpending *pending;
struct sigqueue *q;
int override_rlimit;
-
- trace_signal_generate(sig, info, t);
+ int ret = 0, result;
assert_spin_locked(&t->sighand->siglock);
- if (!prepare_signal(sig, t, from_ancestor_ns))
- return 0;
+ result = TRACE_SIGNAL_IGNORED;
+ if (!prepare_signal(sig, t,
+ from_ancestor_ns || (info == SEND_SIG_FORCED)))
+ goto ret;
pending = group ? &t->signal->shared_pending : &t->pending;
/*
@@ -1068,8 +1041,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
* exactly one non-rt signal, so that we can get more
* detailed information about the cause of the signal.
*/
+ result = TRACE_SIGNAL_ALREADY_PENDING;
if (legacy_queue(pending, sig))
- return 0;
+ goto ret;
+
+ result = TRACE_SIGNAL_DELIVERED;
/*
* fast-pathed signals for kernel-internal things like SIGSTOP
* or SIGKILL.
@@ -1102,7 +1078,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
q->info.si_code = SI_USER;
q->info.si_pid = task_tgid_nr_ns(current,
task_active_pid_ns(t));
- q->info.si_uid = current_uid();
+ q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
break;
case (unsigned long) SEND_SIG_PRIV:
q->info.si_signo = sig;
@@ -1127,14 +1103,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
* signal was rt and sent by user using something
* other than kill().
*/
- trace_signal_overflow_fail(sig, group, info);
- return -EAGAIN;
+ result = TRACE_SIGNAL_OVERFLOW_FAIL;
+ ret = -EAGAIN;
+ goto ret;
} else {
/*
* This is a silent loss of information. We still
* send the signal, but the *info bits are lost.
*/
- trace_signal_lose_info(sig, group, info);
+ result = TRACE_SIGNAL_LOSE_INFO;
}
}
@@ -1142,7 +1119,9 @@ out_set:
signalfd_notify(t, sig);
sigaddset(&pending->signal, sig);
complete_signal(sig, t, group);
- return 0;
+ret:
+ trace_signal_generate(sig, info, t, group, result);
+ return ret;
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1158,13 +1137,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
return __send_signal(sig, info, t, group, from_ancestor_ns);
}
-static void print_fatal_signal(struct pt_regs *regs, int signr)
+static void print_fatal_signal(int signr)
{
- printk("%s/%d: potentially unexpected fatal signal %d.\n",
- current->comm, task_pid_nr(current), signr);
+ struct pt_regs *regs = signal_pt_regs();
+ printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
- printk("code at %08lx: ", regs->ip);
+ printk(KERN_INFO "code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
@@ -1172,11 +1151,11 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
- printk("%02x ", insn);
+ printk(KERN_CONT "%02x ", insn);
}
}
+ printk(KERN_CONT "\n");
#endif
- printk("\n");
preempt_disable();
show_regs(regs);
preempt_enable();
@@ -1380,10 +1359,8 @@ static int kill_as_cred_perm(const struct cred *cred,
struct task_struct *target)
{
const struct cred *pcred = __task_cred(target);
- if (cred->user_ns != pcred->user_ns)
- return 0;
- if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
- cred->uid != pcred->suid && cred->uid != pcred->uid)
+ if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
+ !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
return 0;
return 1;
}
@@ -1585,7 +1562,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
int sig = q->info.si_signo;
struct sigpending *pending;
unsigned long flags;
- int ret;
+ int ret, result;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
@@ -1594,7 +1571,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
goto ret;
ret = 1; /* the signal is ignored */
- if (!prepare_signal(sig, t, 0))
+ result = TRACE_SIGNAL_IGNORED;
+ if (!prepare_signal(sig, t, false))
goto out;
ret = 0;
@@ -1605,6 +1583,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
*/
BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
+ result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
}
q->info.si_overrun = 0;
@@ -1614,7 +1593,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
list_add_tail(&q->list, &pending->list);
sigaddset(&pending->signal, sig);
complete_signal(sig, t, group);
+ result = TRACE_SIGNAL_DELIVERED;
out:
+ trace_signal_generate(sig, &q->info, t, group, result);
unlock_task_sighand(t, &flags);
ret:
return ret;
@@ -1633,6 +1614,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
unsigned long flags;
struct sighand_struct *psig;
bool autoreap = false;
+ cputime_t utime, stime;
BUG_ON(sig == -1);
@@ -1642,28 +1624,37 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ if (sig != SIGCHLD) {
+ /*
+ * This is only possible if parent == real_parent.
+ * Check if it has changed security domain.
+ */
+ if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+ sig = SIGCHLD;
+ }
+
info.si_signo = sig;
info.si_errno = 0;
/*
- * we are under tasklist_lock here so our parent is tied to
- * us and cannot exit and release its namespace.
+ * We are under tasklist_lock here so our parent is tied to
+ * us and cannot change.
*
- * the only it can is to switch its nsproxy with sys_unshare,
- * bu uncharing pid namespaces is not allowed, so we'll always
- * see relevant namespace
+ * task_active_pid_ns will always return the same pid namespace
+ * until a task passes through release_task.
*
* write_lock() currently calls preempt_disable() which is the
* same as rcu_read_lock(), but according to Oleg, this is not
* correct to rely on this
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
- info.si_uid = map_cred_ns(__task_cred(tsk),
- task_cred_xxx(tsk->parent, user_ns));
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
+ info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
+ task_uid(tsk));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
- info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
+ info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1727,6 +1718,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
+ cputime_t utime, stime;
if (for_ptracer) {
parent = tsk->parent;
@@ -1741,13 +1733,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
* see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
- info.si_uid = map_cred_ns(__task_cred(tsk),
- task_cred_xxx(parent, user_ns));
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
+ info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(tsk->utime);
- info.si_stime = cputime_to_clock_t(tsk->stime);
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime);
+ info.si_stime = cputime_to_clock_t(stime);
info.si_code = why;
switch (why) {
@@ -1788,6 +1780,10 @@ static inline int may_ptrace_stop(void)
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
+ *
+ * This is almost outdated, a task with the pending SIGKILL can't
+ * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
+ * after SIGKILL was already dequeued.
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
@@ -1898,7 +1894,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
- schedule();
+ freezable_schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
@@ -1913,6 +1909,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
+ /* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -1920,13 +1917,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
}
/*
- * While in TASK_TRACED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
- */
- try_to_freeze();
-
- /*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
@@ -1953,7 +1943,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
- info.si_uid = current_uid();
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
ptrace_stop(exit_code, why, 1, &info);
@@ -1962,6 +1952,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
void ptrace_notify(int exit_code)
{
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+ if (unlikely(current->task_works))
+ task_work_run();
spin_lock_irq(&current->sighand->siglock);
ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2032,8 +2024,8 @@ static bool do_signal_stop(int signr)
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
- for (t = next_thread(current); t != current;
- t = next_thread(t)) {
+ t = current;
+ while_each_thread(current, t) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
@@ -2080,7 +2072,7 @@ static bool do_signal_stop(int signr)
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
- schedule();
+ freezable_schedule();
return true;
} else {
/*
@@ -2126,10 +2118,9 @@ static void do_jobctl_trap(void)
}
}
-static int ptrace_signal(int signr, siginfo_t *info,
- struct pt_regs *regs, void *cookie)
+static int ptrace_signal(int signr, siginfo_t *info)
{
- ptrace_signal_deliver(regs, cookie);
+ ptrace_signal_deliver();
/*
* We do not check sig_kernel_stop(signr) but set this marker
* unconditionally because we do not know whether debugger will
@@ -2161,8 +2152,8 @@ static int ptrace_signal(int signr, siginfo_t *info,
info->si_code = SI_USER;
rcu_read_lock();
info->si_pid = task_pid_vnr(current->parent);
- info->si_uid = map_cred_ns(__task_cred(current->parent),
- current_user_ns());
+ info->si_uid = from_kuid_munged(current_user_ns(),
+ task_uid(current->parent));
rcu_read_unlock();
}
@@ -2182,15 +2173,20 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
struct signal_struct *signal = current->signal;
int signr;
-relock:
+ if (unlikely(current->task_works))
+ task_work_run();
+
+ if (unlikely(uprobe_deny_signal()))
+ return 0;
+
/*
- * We'll jump back here after any time we were stopped in TASK_STOPPED.
- * While in TASK_STOPPED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
+ * Do this once, we can't return to user-mode if freezing() == T.
+ * do_signal_stop() and ptrace_stop() do freezable_schedule() and
+ * thus do not need another check after return.
*/
try_to_freeze();
+relock:
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if
@@ -2247,8 +2243,7 @@ relock:
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, info,
- regs, cookie);
+ signr = ptrace_signal(signr, info);
if (!signr)
continue;
}
@@ -2333,7 +2328,8 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(regs, info->si_signo);
+ print_fatal_signal(info->si_signo);
+ proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
* other threads in the group and synchronizes with
@@ -2342,7 +2338,7 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info->si_signo, info->si_signo, regs);
+ do_coredump(info);
}
/*
@@ -2356,24 +2352,43 @@ relock:
}
/**
- * block_sigmask - add @ka's signal mask to current->blocked
- * @ka: action for @signr
- * @signr: signal that has been successfully delivered
+ * signal_delivered -
+ * @sig: number of signal being delivered
+ * @info: siginfo_t of signal being delivered
+ * @ka: sigaction setting that chose the handler
+ * @regs: user register state
+ * @stepping: nonzero if debugger single-step or block-step in use
*
- * This function should be called when a signal has succesfully been
- * delivered. It adds the mask of signals for @ka to current->blocked
- * so that they are blocked during the execution of the signal
- * handler. In addition, @signr will be blocked unless %SA_NODEFER is
- * set in @ka->sa.sa_flags.
+ * This function should be called when a signal has successfully been
+ * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * is always blocked, and the signal itself is blocked unless %SA_NODEFER
+ * is set in @ka->sa.sa_flags. Tracing is notified.
*/
-void block_sigmask(struct k_sigaction *ka, int signr)
+void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
+ struct pt_regs *regs, int stepping)
{
sigset_t blocked;
+ /* A signal was successfully delivered, and the
+ saved sigmask was stored on the signal frame,
+ and will be restored by sigreturn. So we can
+ simply clear the restore sigmask flag. */
+ clear_restore_sigmask();
+
sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, signr);
+ sigaddset(&blocked, sig);
set_current_blocked(&blocked);
+ tracehook_signal_handler(sig, info, ka, regs, stepping);
+}
+
+void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
+{
+ if (failed)
+ force_sigsegv(ksig->sig, current);
+ else
+ signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
+ signal_pt_regs(), stepping);
}
/*
@@ -2506,7 +2521,13 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
* It is wrong to change ->blocked directly, this helper should be used
* to ensure the process can't miss a shared signal we are going to block.
*/
-void set_current_blocked(const sigset_t *newset)
+void set_current_blocked(sigset_t *newset)
+{
+ sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ __set_current_blocked(newset);
+}
+
+void __set_current_blocked(const sigset_t *newset)
{
struct task_struct *tsk = current;
@@ -2546,7 +2567,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
return -EINVAL;
}
- set_current_blocked(&newset);
+ __set_current_blocked(&newset);
return 0;
}
@@ -2587,44 +2608,99 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
return 0;
}
-long do_sigpending(void __user *set, unsigned long sigsetsize)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
+ compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
- long error = -EINVAL;
- sigset_t pending;
+#ifdef __BIG_ENDIAN
+ sigset_t old_set = current->blocked;
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (nset) {
+ compat_sigset_t new32;
+ sigset_t new_set;
+ int error;
+ if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+
+ sigset_from_compat(&new_set, &new32);
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ error = sigprocmask(how, &new_set, NULL);
+ if (error)
+ return error;
+ }
+ if (oset) {
+ compat_sigset_t old32;
+ sigset_to_compat(&old32, &old_set);
+ if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+ return 0;
+#else
+ return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
+ (sigset_t __user *)oset, sigsetsize);
+#endif
+}
+#endif
+
+static int do_sigpending(void *set, unsigned long sigsetsize)
+{
if (sigsetsize > sizeof(sigset_t))
- goto out;
+ return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
- sigorsets(&pending, &current->pending.signal,
+ sigorsets(set, &current->pending.signal,
&current->signal->shared_pending.signal);
spin_unlock_irq(&current->sighand->siglock);
/* Outside the lock because only this thread touches it. */
- sigandsets(&pending, &current->blocked, &pending);
-
- error = -EFAULT;
- if (!copy_to_user(set, &pending, sigsetsize))
- error = 0;
-
-out:
- return error;
+ sigandsets(set, &current->blocked, set);
+ return 0;
}
/**
* sys_rt_sigpending - examine a pending signal that has been raised
* while blocked
- * @set: stores pending signals
+ * @uset: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
-SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
- return do_sigpending(set, sigsetsize);
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err && copy_to_user(uset, &set, sigsetsize))
+ err = -EFAULT;
+ return err;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
+ compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err) {
+ compat_sigset_t set32;
+ sigset_to_compat(&set32, &set);
+ /* we can get here only if sigsetsize <= sizeof(set) */
+ if (copy_to_user(uset, &set32, sigsetsize))
+ err = -EFAULT;
+ }
+ return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
+#endif
+}
+#endif
+
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
-int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
{
int err;
@@ -2686,6 +2762,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
+#ifdef __ARCH_SIGSYS
+ case __SI_SYS:
+ err |= __put_user(from->si_call_addr, &to->si_call_addr);
+ err |= __put_user(from->si_syscall, &to->si_syscall);
+ err |= __put_user(from->si_arch, &to->si_arch);
+ break;
+#endif
default: /* this is just in case for now ... */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
@@ -2742,11 +2825,11 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
- timeout = schedule_timeout_interruptible(timeout);
+ timeout = freezable_schedule_timeout_interruptible(timeout);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
- siginitset(&tsk->real_blocked, 0);
+ sigemptyset(&tsk->real_blocked);
sig = dequeue_signal(tsk, &mask, info);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -2808,7 +2891,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
info.si_errno = 0;
info.si_code = SI_USER;
info.si_pid = task_tgid_vnr(current);
- info.si_uid = current_uid();
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return kill_something_info(sig, &info, pid);
}
@@ -2845,13 +2928,13 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
info.si_pid = task_tgid_vnr(current);
- info.si_uid = current_uid();
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return do_send_specific(tgid, pid, sig, &info);
}
@@ -2891,6 +2974,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
return do_tkill(0, pid, sig);
}
+static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
+{
+ /* Not even root can pretend to send signals from the kernel.
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid)) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
+ return -EPERM;
+ }
+ info->si_signo = sig;
+
+ /* POSIX.1b doesn't mention process groups. */
+ return kill_proc_info(sig, info, pid);
+}
+
/**
* sys_rt_sigqueueinfo - send signal information to a signal
* @pid: the PID of the thread
@@ -2901,25 +3001,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
siginfo_t info;
-
if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
return -EFAULT;
+ return do_rt_sigqueueinfo(pid, sig, &info);
+}
- /* Not even root can pretend to send signals from the kernel.
- * Nor can they impersonate a kill()/tgkill(), which adds source info.
- */
- if (info.si_code >= 0 || info.si_code == SI_TKILL) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info.si_code < 0);
- return -EPERM;
- }
- info.si_signo = sig;
-
- /* POSIX.1b doesn't mention process groups. */
- return kill_proc_info(sig, &info, pid);
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+ int ret = copy_siginfo_from_user32(&info, uinfo);
+ if (unlikely(ret))
+ return ret;
+ return do_rt_sigqueueinfo(pid, sig, &info);
}
+#endif
-long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
@@ -2928,7 +3029,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
- if (info->si_code >= 0 || info->si_code == SI_TKILL) {
+ if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
+ (task_pid_vnr(current) != pid)) {
/* We used to allow any < 0 si_code */
WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
@@ -2949,18 +3051,54 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
+ compat_pid_t, tgid,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+#endif
+
+/*
+ * For kthreads only, must not be used if cloned with CLONE_SIGHAND
+ */
+void kernel_sigaction(int sig, __sighandler_t action)
+{
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[sig - 1].sa.sa_handler = action;
+ if (action == SIG_IGN) {
+ sigset_t mask;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+
+ flush_sigqueue_mask(&mask, &current->signal->shared_pending);
+ flush_sigqueue_mask(&mask, &current->pending);
+ recalc_sigpending();
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+}
+EXPORT_SYMBOL(kernel_sigaction);
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
- struct task_struct *t = current;
+ struct task_struct *p = current, *t;
struct k_sigaction *k;
sigset_t mask;
if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
return -EINVAL;
- k = &t->sighand->action[sig-1];
+ k = &p->sighand->action[sig-1];
- spin_lock_irq(&current->sighand->siglock);
+ spin_lock_irq(&p->sighand->siglock);
if (oact)
*oact = *k;
@@ -2979,22 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*/
- if (sig_handler_ignored(sig_handler(t, sig), sig)) {
+ if (sig_handler_ignored(sig_handler(p, sig), sig)) {
sigemptyset(&mask);
sigaddset(&mask, sig);
- rm_from_queue_full(&mask, &t->signal->shared_pending);
- do {
- rm_from_queue_full(&mask, &t->pending);
- t = next_thread(t);
- } while (t != current);
+ flush_sigqueue_mask(&mask, &p->signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&mask, &t->pending);
}
}
- spin_unlock_irq(&current->sighand->siglock);
+ spin_unlock_irq(&p->sighand->siglock);
return 0;
}
-int
+static int
do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
stack_t oss;
@@ -3059,6 +3195,76 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
out:
return error;
}
+SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
+{
+ return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+}
+
+int restore_altstack(const stack_t __user *uss)
+{
+ int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ /* squash all but EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __save_altstack(stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+ const compat_stack_t __user *, uss_ptr,
+ compat_stack_t __user *, uoss_ptr)
+{
+ stack_t uss, uoss;
+ int ret;
+ mm_segment_t seg;
+
+ if (uss_ptr) {
+ compat_stack_t uss32;
+
+ memset(&uss, 0, sizeof(stack_t));
+ if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
+ return -EFAULT;
+ uss.ss_sp = compat_ptr(uss32.ss_sp);
+ uss.ss_flags = uss32.ss_flags;
+ uss.ss_size = uss32.ss_size;
+ }
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+ ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
+ (stack_t __force __user *) &uoss,
+ compat_user_stack_pointer());
+ set_fs(seg);
+ if (ret >= 0 && uoss_ptr) {
+ if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
+ __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
+ __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
+ __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+int compat_restore_altstack(const compat_stack_t __user *uss)
+{
+ int err = compat_sys_sigaltstack(uss, NULL);
+ /* squash all but -EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+#endif
#ifdef __ARCH_WANT_SYS_SIGPENDING
@@ -3068,7 +3274,7 @@ out:
*/
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
{
- return do_sigpending(set, sizeof(*set));
+ return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
}
#endif
@@ -3095,7 +3301,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(*nset)))
return -EFAULT;
- new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
new_blocked = current->blocked;
@@ -3125,7 +3330,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
-#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+#ifndef CONFIG_ODD_RT_SIGACTION
/**
* sys_rt_sigaction - alter an action taken by a process
* @sig: signal to be sent
@@ -3159,9 +3364,134 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
out:
return ret;
}
-#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
+ const struct compat_sigaction __user *, act,
+ struct compat_sigaction __user *, oact,
+ compat_size_t, sigsetsize)
+{
+ struct k_sigaction new_ka, old_ka;
+ compat_sigset_t mask;
+#ifdef __ARCH_HAS_SA_RESTORER
+ compat_uptr_t restorer;
+#endif
+ int ret;
-#ifdef __ARCH_WANT_SYS_SGETMASK
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+
+ if (act) {
+ compat_uptr_t handler;
+ ret = get_user(handler, &act->sa_handler);
+ new_ka.sa.sa_handler = compat_ptr(handler);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= get_user(restorer, &act->sa_restorer);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+#endif
+ ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ if (ret)
+ return -EFAULT;
+ sigset_from_compat(&new_ka.sa.sa_mask, &mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+ if (!ret && oact) {
+ sigset_to_compat(&mask, &old_ka.sa.sa_mask);
+ ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler);
+ ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer);
+#endif
+ }
+ return ret;
+}
+#endif
+#endif /* !CONFIG_ODD_RT_SIGACTION */
+
+#ifdef CONFIG_OLD_SIGACTION
+SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct old_sigaction __user *, act,
+ struct old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+#ifdef CONFIG_COMPAT_OLD_SIGACTION
+COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct compat_old_sigaction __user *, act,
+ struct compat_old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+ compat_old_sigset_t mask;
+ compat_uptr_t handler, restorer;
+
+ if (act) {
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(handler, &act->sa_handler) ||
+ __get_user(restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ new_ka.sa.sa_handler = compat_ptr(handler);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_SGETMASK_SYSCALL
/*
* For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3177,12 +3507,12 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
int old = current->blocked.sig[0];
sigset_t newset;
- siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
+ siginitset(&newset, newmask);
set_current_blocked(&newset);
return old;
}
-#endif /* __ARCH_WANT_SGETMASK */
+#endif /* CONFIG_SGETMASK_SYSCALL */
#ifdef __ARCH_WANT_SYS_SIGNAL
/*
@@ -3216,7 +3546,17 @@ SYSCALL_DEFINE0(pause)
#endif
-#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
+int sigsuspend(sigset_t *set)
+{
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(set);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_restore_sigmask();
+ return -ERESTARTNOHAND;
+}
+
/**
* sys_rt_sigsuspend - replace the signal mask for a value with the
* @unewset value until a signal is received
@@ -3233,19 +3573,49 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
if (copy_from_user(&newset, unewset, sizeof(newset)))
return -EFAULT;
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ return sigsuspend(&newset);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t newset;
+ compat_sigset_t newset32;
- current->saved_sigmask = current->blocked;
- set_current_blocked(&newset);
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
+ if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&newset, &newset32);
+ return sigsuspend(&newset);
+#else
+ /* on little-endian bitmaps don't care about granularity */
+ return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
+#endif
}
-#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+#endif
+
+#ifdef CONFIG_OLD_SIGSUSPEND
+SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
+#ifdef CONFIG_OLD_SIGSUSPEND3
+SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
-__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489..80c33f8de14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,34 +13,23 @@
#include <linux/smp.h>
#include <linux/cpu.h>
-#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
- struct list_head queue;
- raw_spinlock_t lock;
-} call_function __cacheline_aligned_in_smp =
- {
- .queue = LIST_HEAD_INIT(call_function.queue),
- .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
- };
+#include "smpboot.h"
enum {
CSD_FLAG_LOCK = 0x01,
+ CSD_FLAG_WAIT = 0x02,
};
struct call_function_data {
- struct call_single_data csd;
- atomic_t refs;
+ struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
-struct call_single_queue {
- struct list_head list;
- raw_spinlock_t lock;
-};
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -54,15 +43,36 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return notifier_from_errno(-ENOMEM);
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
+ free_cpumask_var(cfd->cpumask);
+ return notifier_from_errno(-ENOMEM);
+ }
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+ /* Fall-through to the CPU_DEAD[_FROZEN] case. */
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
+ free_percpu(cfd->csd);
+ break;
+
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false);
break;
#endif
};
@@ -70,7 +80,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
+static struct notifier_block hotplug_cfd_notifier = {
.notifier_call = hotplug_cfd,
};
@@ -79,12 +89,8 @@ void __init call_function_init(void)
void *cpu = (void *)(long)smp_processor_id();
int i;
- for_each_possible_cpu(i) {
- struct call_single_queue *q = &per_cpu(call_single_queue, i);
-
- raw_spin_lock_init(&q->lock);
- INIT_LIST_HEAD(&q->list);
- }
+ for_each_possible_cpu(i)
+ init_llist_head(&per_cpu(call_single_queue, i));
hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
register_cpu_notifier(&hotplug_cfd_notifier);
@@ -97,16 +103,16 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void csd_lock_wait(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *csd)
{
- while (data->flags & CSD_FLAG_LOCK)
+ while (csd->flags & CSD_FLAG_LOCK)
cpu_relax();
}
-static void csd_lock(struct call_single_data *data)
+static void csd_lock(struct call_single_data *csd)
{
- csd_lock_wait(data);
- data->flags = CSD_FLAG_LOCK;
+ csd_lock_wait(csd);
+ csd->flags |= CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
@@ -116,34 +122,57 @@ static void csd_lock(struct call_single_data *data)
smp_mb();
}
-static void csd_unlock(struct call_single_data *data)
+static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+ WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_mb();
- data->flags &= ~CSD_FLAG_LOCK;
+ csd->flags &= ~CSD_FLAG_LOCK;
}
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+
/*
* Insert a previously allocated call_single_data element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static
-void generic_exec_single(int cpu, struct call_single_data *data, int wait)
+static int generic_exec_single(int cpu, struct call_single_data *csd,
+ smp_call_func_t func, void *info, int wait)
{
- struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
+ struct call_single_data csd_stack = { .flags = 0 };
unsigned long flags;
- int ipi;
- raw_spin_lock_irqsave(&dst->lock, flags);
- ipi = list_empty(&dst->list);
- list_add_tail(&data->list, &dst->list);
- raw_spin_unlock_irqrestore(&dst->lock, flags);
+
+ if (cpu == smp_processor_id()) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ return 0;
+ }
+
+
+ if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO;
+
+
+ if (!csd) {
+ csd = &csd_stack;
+ if (!wait)
+ csd = &__get_cpu_var(csd_data);
+ }
+
+ csd_lock(csd);
+
+ csd->func = func;
+ csd->info = info;
+
+ if (wait)
+ csd->flags |= CSD_FLAG_WAIT;
/*
* The list addition should be visible before sending the IPI
@@ -156,136 +185,74 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
- if (ipi)
+ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
arch_send_call_function_single_ipi(cpu);
if (wait)
- csd_lock_wait(data);
+ csd_lock_wait(csd);
+
+ return 0;
}
-/*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
*/
-void generic_smp_call_function_interrupt(void)
+void generic_smp_call_function_single_interrupt(void)
{
- struct call_function_data *data;
- int cpu = smp_processor_id();
-
- /*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
- WARN_ON_ONCE(!cpu_online(cpu));
-
- /*
- * Ensure entry is visible on call_function_queue after we have
- * entered the IPI. See comment in smp_call_function_many.
- * If we don't have this, then we may miss an entry on the list
- * and never get another IPI to process it.
- */
- smp_mb();
-
- /*
- * It's ok to use list_for_each_rcu() here even though we may
- * delete 'pos', since list_del_rcu() doesn't clear ->next
- */
- list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
- int refs;
- smp_call_func_t func;
-
- /*
- * Since we walk the list without any locks, we might
- * see an entry that was completed, removed from the
- * list and is in the process of being reused.
- *
- * We must check that the cpu is in the cpumask before
- * checking the refs, and both must be set before
- * executing the callback on this cpu.
- */
-
- if (!cpumask_test_cpu(cpu, data->cpumask))
- continue;
-
- smp_rmb();
-
- if (atomic_read(&data->refs) == 0)
- continue;
-
- func = data->csd.func; /* save for later warn */
- func(data->csd.info);
-
- /*
- * If the cpu mask is not still set then func enabled
- * interrupts (BUG), and this cpu took another smp call
- * function interrupt and executed func(info) twice
- * on this cpu. That nested execution decremented refs.
- */
- if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
- WARN(1, "%pf enabled interrupts and double executed\n", func);
- continue;
- }
-
- refs = atomic_dec_return(&data->refs);
- WARN_ON(refs < 0);
-
- if (refs)
- continue;
-
- WARN_ON(!cpumask_empty(data->cpumask));
-
- raw_spin_lock(&call_function.lock);
- list_del_rcu(&data->csd.list);
- raw_spin_unlock(&call_function.lock);
-
- csd_unlock(&data->csd);
- }
-
+ flush_smp_call_function_queue(true);
}
-/*
- * Invoked by arch to handle an IPI for call function single. Must be
- * called from the arch with interrupts disabled.
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ * offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
*/
-void generic_smp_call_function_single_interrupt(void)
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
- struct call_single_queue *q = &__get_cpu_var(call_single_queue);
- unsigned int data_flags;
- LIST_HEAD(list);
+ struct llist_head *head;
+ struct llist_node *entry;
+ struct call_single_data *csd, *csd_next;
+ static bool warned;
- /*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
- WARN_ON_ONCE(!cpu_online(smp_processor_id()));
+ WARN_ON(!irqs_disabled());
- raw_spin_lock(&q->lock);
- list_replace_init(&q->list, &list);
- raw_spin_unlock(&q->lock);
+ head = &__get_cpu_var(call_single_queue);
+ entry = llist_del_all(head);
+ entry = llist_reverse_order(entry);
- while (!list_empty(&list)) {
- struct call_single_data *data;
-
- data = list_entry(list.next, struct call_single_data, list);
- list_del(&data->list);
+ /* There shouldn't be any pending callbacks on an offline CPU. */
+ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+ !warned && !llist_empty(head))) {
+ warned = true;
+ WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
/*
- * 'data' can be invalid after this call if flags == 0
- * (when called through generic_exec_single()),
- * so save them away before making the call:
+ * We don't have to use the _safe() variant here
+ * because we are not invoking the IPI handlers yet.
*/
- data_flags = data->flags;
-
- data->func(data->info);
+ llist_for_each_entry(csd, entry, llist)
+ pr_warn("IPI callback %pS sent to offline CPU\n",
+ csd->func);
+ }
- /*
- * Unlocked CSDs are valid through generic_exec_single():
- */
- if (data_flags & CSD_FLAG_LOCK)
- csd_unlock(data);
+ llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ csd->func(csd->info);
+ csd_unlock(csd);
}
}
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
-
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
@@ -297,12 +264,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
- struct call_single_data d = {
- .flags = 0,
- };
- unsigned long flags;
int this_cpu;
- int err = 0;
+ int err;
/*
* prevent preemption and reschedule on another processor,
@@ -319,32 +282,41 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
- if (cpu == this_cpu) {
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- } else {
- if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = &d;
+ err = generic_exec_single(cpu, NULL, func, info, wait);
- if (!wait)
- data = &__get_cpu_var(csd_data);
+ put_cpu();
- csd_lock(data);
+ return err;
+}
+EXPORT_SYMBOL(smp_call_function_single);
- data->func = func;
- data->info = info;
- generic_exec_single(cpu, data, wait);
- } else {
- err = -ENXIO; /* CPU not online */
- }
- }
+/**
+ * smp_call_function_single_async(): Run an asynchronous function on a
+ * specific CPU.
+ * @cpu: The CPU to run on.
+ * @csd: Pre-allocated and setup data structure
+ *
+ * Like smp_call_function_single(), but the call is asynchonous and
+ * can thus be done from contexts with disabled interrupts.
+ *
+ * The caller passes his own pre-allocated data structure
+ * (ie: embedded in an object) and is responsible for synchronizing it
+ * such that the IPIs performed on the @csd are strictly serialized.
+ *
+ * NOTE: Be careful, there is unfortunately no current debugging facility to
+ * validate the correctness of this serialization.
+ */
+int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+{
+ int err = 0;
- put_cpu();
+ preempt_disable();
+ err = generic_exec_single(cpu, csd, csd->func, csd->info, 0);
+ preempt_enable();
return err;
}
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL_GPL(smp_call_function_single_async);
/*
* smp_call_function_any - Run a function on any of the given cpus
@@ -354,8 +326,6 @@ EXPORT_SYMBOL(smp_call_function_single);
* @wait: If true, wait until function has completed.
*
* Returns 0 on success, else a negative status code (if no cpus were online).
- * Note that @wait will be implicitly turned on in case of allocation failures,
- * since we fall back to on-stack allocation.
*
* Selection preference:
* 1) current cpu if in @mask
@@ -392,43 +362,6 @@ call:
EXPORT_SYMBOL_GPL(smp_call_function_any);
/**
- * __smp_call_function_single(): Run a function on a specific CPU
- * @cpu: The CPU to run on.
- * @data: Pre-allocated and setup data structure
- * @wait: If true, wait until function has completed on specified CPU.
- *
- * Like smp_call_function_single(), but allow caller to pass in a
- * pre-allocated data structure. Useful for embedding @data inside
- * other structures, for instance.
- */
-void __smp_call_function_single(int cpu, struct call_single_data *data,
- int wait)
-{
- unsigned int this_cpu;
- unsigned long flags;
-
- this_cpu = get_cpu();
- /*
- * Can deadlock when called with interrupts disabled.
- * We allow cpu's that are not yet online though, as no one else can
- * send smp call function interrupt to this cpu and as such deadlocks
- * can't happen.
- */
- WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
- && !oops_in_progress);
-
- if (cpu == this_cpu) {
- local_irq_save(flags);
- data->func(data->info);
- local_irq_restore(flags);
- } else {
- csd_lock(data);
- generic_exec_single(cpu, data, wait);
- }
- put_cpu();
-}
-
-/**
* smp_call_function_many(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
@@ -445,9 +378,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- struct call_function_data *data;
- unsigned long flags;
- int refs, cpu, next_cpu, this_cpu = smp_processor_id();
+ struct call_function_data *cfd;
+ int cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -478,80 +410,35 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- data = &__get_cpu_var(cfd_data);
- csd_lock(&data->csd);
+ cfd = &__get_cpu_var(cfd_data);
- /* This BUG_ON verifies our reuse assertions and can be removed */
- BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
-
- /*
- * The global call function queue list add and delete are protected
- * by a lock, but the list is traversed without any lock, relying
- * on the rcu list add and delete to allow safe concurrent traversal.
- * We reuse the call function data without waiting for any grace
- * period after some other cpu removes it from the global queue.
- * This means a cpu might find our data block as it is being
- * filled out.
- *
- * We hold off the interrupt handler on the other cpu by
- * ordering our writes to the cpu mask vs our setting of the
- * refs counter. We assert only the cpu owning the data block
- * will set a bit in cpumask, and each bit will only be cleared
- * by the subject cpu. Each cpu must first find its bit is
- * set and then check that refs is set indicating the element is
- * ready to be processed, otherwise it must skip the entry.
- *
- * On the previous iteration refs was set to 0 by another cpu.
- * To avoid the use of transitivity, set the counter to 0 here
- * so the wmb will pair with the rmb in the interrupt handler.
- */
- atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
-
- data->csd.func = func;
- data->csd.info = info;
-
- /* Ensure 0 refs is visible before mask. Also orders func and info */
- smp_wmb();
-
- /* We rely on the "and" being processed before the store */
- cpumask_and(data->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, data->cpumask);
- refs = cpumask_weight(data->cpumask);
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
- if (unlikely(!refs)) {
- csd_unlock(&data->csd);
+ if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
- }
- raw_spin_lock_irqsave(&call_function.lock, flags);
- /*
- * Place entry at the _HEAD_ of the list, so that any cpu still
- * observing the entry in generic_smp_call_function_interrupt()
- * will not miss any other list entries:
- */
- list_add_rcu(&data->csd.list, &call_function.queue);
- /*
- * We rely on the wmb() in list_add_rcu to complete our writes
- * to the cpumask before this write to refs, which indicates
- * data is on the list and is ready to be processed.
- */
- atomic_set(&data->refs, refs);
- raw_spin_unlock_irqrestore(&call_function.lock, flags);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
- /*
- * Make the list addition visible before sending the ipi.
- * (IPIs must obey or appear to obey normal Linux cache
- * coherency rules -- see comment in generic_exec_single).
- */
- smp_mb();
+ csd_lock(csd);
+ csd->func = func;
+ csd->info = info;
+ llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+ }
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(data->cpumask);
+ arch_send_call_function_ipi_mask(cfd->cpumask);
- /* Optionally wait for the CPUs to complete */
- if (wait)
- csd_lock_wait(&data->csd);
+ if (wait) {
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd;
+
+ csd = per_cpu_ptr(cfd->csd, cpu);
+ csd_lock_wait(csd);
+ }
+ }
}
EXPORT_SYMBOL(smp_call_function_many);
@@ -580,27 +467,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
}
EXPORT_SYMBOL(smp_call_function);
-void ipi_call_lock(void)
-{
- raw_spin_lock(&call_function.lock);
-}
-
-void ipi_call_unlock(void)
-{
- raw_spin_unlock(&call_function.lock);
-}
-
-void ipi_call_lock_irq(void)
-{
- raw_spin_lock_irq(&call_function.lock);
-}
-
-void ipi_call_unlock_irq(void)
-{
- raw_spin_unlock_irq(&call_function.lock);
-}
-#endif /* USE_GENERIC_SMP_HELPERS */
-
/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);
@@ -664,11 +530,18 @@ void __init setup_nr_cpu_ids(void)
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
+void __weak smp_announce(void)
+{
+ printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
+}
+
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
unsigned int cpu;
+ idle_threads_init();
+
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
@@ -678,7 +551,7 @@ void __init smp_init(void)
}
/* Any cleanup work */
- printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+ smp_announce();
smp_cpus_done(setup_max_cpus);
}
@@ -701,3 +574,119 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
return ret;
}
EXPORT_SYMBOL(on_each_cpu);
+
+/**
+ * on_each_cpu_mask(): Run a function on processors specified by
+ * cpumask, which may include the local processor.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. The
+ * exception is that it may be used during early boot while
+ * early_boot_irqs_disabled is set.
+ */
+void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
+ void *info, bool wait)
+{
+ int cpu = get_cpu();
+
+ smp_call_function_many(mask, func, info, wait);
+ if (cpumask_test_cpu(cpu, mask)) {
+ unsigned long flags;
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+ put_cpu();
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * on_each_cpu_cond(): Call a function on each processor for which
+ * the supplied function cond_func returns true, optionally waiting
+ * for all the required CPUs to finish. This may include the local
+ * processor.
+ * @cond_func: A callback function that is passed a cpu id and
+ * the the info parameter. The function is called
+ * with preemption disabled. The function should
+ * return a blooean value indicating whether to IPI
+ * the specified CPU.
+ * @func: The function to run on all applicable CPUs.
+ * This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to both functions.
+ * @wait: If true, wait (atomically) until function has
+ * completed on other CPUs.
+ * @gfp_flags: GFP flags to use when allocating the cpumask
+ * used internally by the function.
+ *
+ * The function might sleep if the GFP flags indicates a non
+ * atomic allocation is allowed.
+ *
+ * Preemption is disabled to protect against CPUs going offline but not online.
+ * CPUs going online during the call will not be seen or sent an IPI.
+ *
+ * You must not call this function with disabled interrupts or
+ * from a hardware interrupt handler or from a bottom half handler.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+ smp_call_func_t func, void *info, bool wait,
+ gfp_t gfp_flags)
+{
+ cpumask_var_t cpus;
+ int cpu, ret;
+
+ might_sleep_if(gfp_flags & __GFP_WAIT);
+
+ if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ if (cond_func(cpu, info))
+ cpumask_set_cpu(cpu, cpus);
+ on_each_cpu_mask(cpus, func, info, wait);
+ preempt_enable();
+ free_cpumask_var(cpus);
+ } else {
+ /*
+ * No free cpumask, bother. No matter, we'll
+ * just have to IPI them one by one.
+ */
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ if (cond_func(cpu, info)) {
+ ret = smp_call_function_single(cpu, func,
+ info, wait);
+ WARN_ON_ONCE(!ret);
+ }
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
+
+static void do_nothing(void *unused)
+{
+}
+
+/**
+ * kick_all_cpus_sync - Force all cpus out of idle
+ *
+ * Used to synchronize the update of pm_idle function pointer. It's
+ * called after the pointer is updated and returns after the dummy
+ * callback function has been executed on all cpus. The execution of
+ * the function can only happen on the remote cpus after they have
+ * left the idle function which had been called via pm_idle function
+ * pointer. So it's guaranteed that nothing uses the previous pointer
+ * anymore.
+ */
+void kick_all_cpus_sync(void)
+{
+ /* Make sure the change is visible before we kick the cpus */
+ smp_mb();
+ smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 00000000000..eb89e180740
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,313 @@
+/*
+ * Common SMP CPU bringup/teardown functions
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
+
+#include "smpboot.h"
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+/*
+ * For the hotplug case we keep the task structs around and reuse
+ * them.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+
+struct task_struct *idle_thread_get(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk)
+ return ERR_PTR(-ENOMEM);
+ init_idle(tsk, cpu);
+ return tsk;
+}
+
+void __init idle_thread_set_boot_cpu(void)
+{
+ per_cpu(idle_threads, smp_processor_id()) = current;
+}
+
+/**
+ * idle_init - Initialize the idle thread for a cpu
+ * @cpu: The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
+static inline void idle_init(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk) {
+ tsk = fork_idle(cpu);
+ if (IS_ERR(tsk))
+ pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
+ else
+ per_cpu(idle_threads, cpu) = tsk;
+ }
+}
+
+/**
+ * idle_threads_init - Initialize idle threads for all cpus
+ */
+void __init idle_threads_init(void)
+{
+ unsigned int cpu, boot_cpu;
+
+ boot_cpu = smp_processor_id();
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != boot_cpu)
+ idle_init(cpu);
+ }
+}
+#endif
+
+#endif /* #ifdef CONFIG_SMP */
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+ unsigned int cpu;
+ unsigned int status;
+ struct smp_hotplug_thread *ht;
+};
+
+enum {
+ HP_THREAD_NONE = 0,
+ HP_THREAD_ACTIVE,
+ HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data: thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 1 when the thread should exit, 0 otherwise.
+ */
+static int smpboot_thread_fn(void *data)
+{
+ struct smpboot_thread_data *td = data;
+ struct smp_hotplug_thread *ht = td->ht;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ preempt_disable();
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->cleanup)
+ ht->cleanup(td->cpu, cpu_online(td->cpu));
+ kfree(td);
+ return 0;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->park && td->status == HP_THREAD_ACTIVE) {
+ BUG_ON(td->cpu != smp_processor_id());
+ ht->park(td->cpu);
+ td->status = HP_THREAD_PARKED;
+ }
+ kthread_parkme();
+ /* We might have been woken for stop */
+ continue;
+ }
+
+ BUG_ON(td->cpu != smp_processor_id());
+
+ /* Check for state change setup */
+ switch (td->status) {
+ case HP_THREAD_NONE:
+ preempt_enable();
+ if (ht->setup)
+ ht->setup(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ case HP_THREAD_PARKED:
+ preempt_enable();
+ if (ht->unpark)
+ ht->unpark(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ }
+
+ if (!ht->thread_should_run(td->cpu)) {
+ preempt_enable();
+ schedule();
+ } else {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ ht->thread_fn(td->cpu);
+ }
+ }
+}
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ struct smpboot_thread_data *td;
+
+ if (tsk)
+ return 0;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+ if (!td)
+ return -ENOMEM;
+ td->cpu = cpu;
+ td->ht = ht;
+
+ tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+ ht->thread_comm);
+ if (IS_ERR(tsk)) {
+ kfree(td);
+ return PTR_ERR(tsk);
+ }
+ get_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = tsk;
+ if (ht->create) {
+ /*
+ * Make sure that the task has actually scheduled out
+ * into park position, before calling the create
+ * callback. At least the migration thread callback
+ * requires that the task is off the runqueue.
+ */
+ if (!wait_task_inactive(tsk, TASK_PARKED))
+ WARN_ON(1);
+ else
+ ht->create(cpu);
+ }
+ return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list) {
+ ret = __smpboot_create_thread(cur, cpu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (ht->pre_unpark)
+ ht->pre_unpark(cpu);
+ kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list)
+ smpboot_unpark_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk && !ht->selfparking)
+ kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry_reverse(cur, &hotplug_threads, list)
+ smpboot_park_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+ unsigned int cpu;
+
+ /* We need to destroy also the parked threads of offline cpus */
+ for_each_possible_cpu(cpu) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk) {
+ kthread_stop(tsk);
+ put_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = NULL;
+ }
+ }
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ unsigned int cpu;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ for_each_online_cpu(cpu) {
+ ret = __smpboot_create_thread(plug_thread, cpu);
+ if (ret) {
+ smpboot_destroy_threads(plug_thread);
+ goto out;
+ }
+ smpboot_unpark_thread(plug_thread, cpu);
+ }
+ list_add(&plug_thread->list, &hotplug_threads);
+out:
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+ list_del(&plug_thread->list);
+ smpboot_destroy_threads(plug_thread);
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 00000000000..72415a0eb95
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,20 @@
+#ifndef SMPBOOT_H
+#define SMPBOOT_H
+
+struct task_struct;
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+struct task_struct *idle_thread_get(unsigned int cpu);
+void idle_thread_set_boot_cpu(void);
+void idle_threads_init(void);
+#else
+static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
+static inline void idle_thread_set_boot_cpu(void) { }
+static inline void idle_threads_init(void) { }
+#endif
+
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
+#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4eb3a0fa351..5918d227730 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,10 +6,10 @@
* Distribute under GPLv2.
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
- *
- * Remote softirq infrastructure is by Jens Axboe.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
@@ -23,12 +23,13 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <linux/tick.h>
+#include <linux/irq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
-#include <asm/irq.h>
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
@@ -56,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
-char *softirq_to_name[NR_SOFTIRQS] = {
+const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
@@ -91,7 +92,7 @@ static void wakeup_softirqd(void)
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
-static void __local_bh_disable(unsigned long ip, unsigned int cnt)
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -99,47 +100,33 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
raw_local_irq_save(flags);
/*
- * The preempt tracer hooks into add_preempt_count and will break
+ * The preempt tracer hooks into preempt_count_add and will break
* lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
* is set and before current->softirq_enabled is cleared.
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
- preempt_count() += cnt;
+ __preempt_count_add(cnt);
/*
* Were softirqs turned off above:
*/
- if (softirq_count() == cnt)
+ if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == cnt)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
-#else /* !CONFIG_TRACE_IRQFLAGS */
-static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
-{
- add_preempt_count(cnt);
- barrier();
-}
+EXPORT_SYMBOL(__local_bh_disable_ip);
#endif /* CONFIG_TRACE_IRQFLAGS */
-void local_bh_disable(void)
-{
- __local_bh_disable((unsigned long)__builtin_return_address(0),
- SOFTIRQ_DISABLE_OFFSET);
-}
-
-EXPORT_SYMBOL(local_bh_disable);
-
static void __local_bh_enable(unsigned int cnt)
{
- WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
- if (softirq_count() == cnt)
- trace_softirqs_on((unsigned long)__builtin_return_address(0));
- sub_preempt_count(cnt);
+ if (softirq_count() == (cnt & SOFTIRQ_MASK))
+ trace_softirqs_on(_RET_IP_);
+ preempt_count_sub(cnt);
}
/*
@@ -149,12 +136,12 @@ static void __local_bh_enable(unsigned int cnt)
*/
void _local_bh_enable(void)
{
+ WARN_ON_ONCE(in_irq());
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
-
EXPORT_SYMBOL(_local_bh_enable);
-static inline void _local_bh_enable_ip(unsigned long ip)
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
WARN_ON_ONCE(in_irq() || irqs_disabled());
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -168,57 +155,97 @@ static inline void _local_bh_enable_ip(unsigned long ip)
/*
* Keep preemption disabled until we are done with
* softirq processing:
- */
- sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
+ */
+ preempt_count_sub(cnt - 1);
- if (unlikely(!in_interrupt() && local_softirq_pending()))
+ if (unlikely(!in_interrupt() && local_softirq_pending())) {
+ /*
+ * Run softirq if any pending. And do it in its own stack
+ * as we may be calling this deep in a task call stack already.
+ */
do_softirq();
+ }
- dec_preempt_count();
+ preempt_count_dec();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_enable();
#endif
preempt_check_resched();
}
-
-void local_bh_enable(void)
-{
- _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
-}
-EXPORT_SYMBOL(local_bh_enable);
-
-void local_bh_enable_ip(unsigned long ip)
-{
- _local_bh_enable_ip(ip);
-}
-EXPORT_SYMBOL(local_bh_enable_ip);
+EXPORT_SYMBOL(__local_bh_enable_ip);
/*
- * We restart softirq processing MAX_SOFTIRQ_RESTART times,
- * and we fall back to softirqd after that.
+ * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
+ * but break the loop if need_resched() is set or after 2 ms.
+ * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
+ * certain cases, such as stop_machine(), jiffies may cease to
+ * increment and so we need the MAX_SOFTIRQ_RESTART limit as
+ * well to make sure we eventually return from this method.
*
- * This number has been established via experimentation.
+ * These limits have been established via experimentation.
* The two things to balance is latency against fairness -
* we want to handle softirqs as soon as possible, but they
* should not be able to lock up the box.
*/
+#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10
-asmlinkage void __do_softirq(void)
+#ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * When we run softirqs from irq_exit() and thus on the hardirq stack we need
+ * to keep the lockdep irq context tracking as tight as possible in order to
+ * not miss-qualify lock contexts and miss possible deadlocks.
+ */
+
+static inline bool lockdep_softirq_start(void)
+{
+ bool in_hardirq = false;
+
+ if (trace_hardirq_context(current)) {
+ in_hardirq = true;
+ trace_hardirq_exit();
+ }
+
+ lockdep_softirq_enter();
+
+ return in_hardirq;
+}
+
+static inline void lockdep_softirq_end(bool in_hardirq)
{
+ lockdep_softirq_exit();
+
+ if (in_hardirq)
+ trace_hardirq_enter();
+}
+#else
+static inline bool lockdep_softirq_start(void) { return false; }
+static inline void lockdep_softirq_end(bool in_hardirq) { }
+#endif
+
+asmlinkage __visible void __do_softirq(void)
+{
+ unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
+ unsigned long old_flags = current->flags;
+ int max_restart = MAX_SOFTIRQ_RESTART;
struct softirq_action *h;
+ bool in_hardirq;
__u32 pending;
- int max_restart = MAX_SOFTIRQ_RESTART;
- int cpu;
+ int softirq_bit;
+
+ /*
+ * Mask out PF_MEMALLOC s current task context is borrowed for the
+ * softirq. A softirq handled such as network RX might set PF_MEMALLOC
+ * again if the socket is related to swap
+ */
+ current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
- account_system_vtime(current);
+ account_irq_enter_time(current);
- __local_bh_disable((unsigned long)__builtin_return_address(0),
- SOFTIRQ_OFFSET);
- lockdep_softirq_enter();
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ in_hardirq = lockdep_softirq_start();
- cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
@@ -227,49 +254,50 @@ restart:
h = softirq_vec;
- do {
- if (pending & 1) {
- unsigned int vec_nr = h - softirq_vec;
- int prev_count = preempt_count();
-
- kstat_incr_softirqs_this_cpu(vec_nr);
-
- trace_softirq_entry(vec_nr);
- h->action(h);
- trace_softirq_exit(vec_nr);
- if (unlikely(prev_count != preempt_count())) {
- printk(KERN_ERR "huh, entered softirq %u %s %p"
- "with preempt_count %08x,"
- " exited with %08x?\n", vec_nr,
- softirq_to_name[vec_nr], h->action,
- prev_count, preempt_count());
- preempt_count() = prev_count;
- }
+ while ((softirq_bit = ffs(pending))) {
+ unsigned int vec_nr;
+ int prev_count;
+
+ h += softirq_bit - 1;
+
+ vec_nr = h - softirq_vec;
+ prev_count = preempt_count();
- rcu_bh_qs(cpu);
+ kstat_incr_softirqs_this_cpu(vec_nr);
+
+ trace_softirq_entry(vec_nr);
+ h->action(h);
+ trace_softirq_exit(vec_nr);
+ if (unlikely(prev_count != preempt_count())) {
+ pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
+ vec_nr, softirq_to_name[vec_nr], h->action,
+ prev_count, preempt_count());
+ preempt_count_set(prev_count);
}
h++;
- pending >>= 1;
- } while (pending);
+ pending >>= softirq_bit;
+ }
+ rcu_bh_qs(smp_processor_id());
local_irq_disable();
pending = local_softirq_pending();
- if (pending && --max_restart)
- goto restart;
+ if (pending) {
+ if (time_before(jiffies, end) && !need_resched() &&
+ --max_restart)
+ goto restart;
- if (pending)
wakeup_softirqd();
+ }
- lockdep_softirq_exit();
-
- account_system_vtime(current);
+ lockdep_softirq_end(in_hardirq);
+ account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
+ tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
-#ifndef __ARCH_HAS_DO_SOFTIRQ
-
-asmlinkage void do_softirq(void)
+asmlinkage __visible void do_softirq(void)
{
__u32 pending;
unsigned long flags;
@@ -282,78 +310,85 @@ asmlinkage void do_softirq(void)
pending = local_softirq_pending();
if (pending)
- __do_softirq();
+ do_softirq_own_stack();
local_irq_restore(flags);
}
-#endif
-
/*
* Enter an interrupt context.
*/
void irq_enter(void)
{
- int cpu = smp_processor_id();
-
rcu_irq_enter();
- if (idle_cpu(cpu) && !in_interrupt()) {
+ if (is_idle_task(current) && !in_interrupt()) {
/*
* Prevent raise_softirq from needlessly waking up ksoftirqd
* here, as softirq will be serviced on return from interrupt.
*/
local_bh_disable();
- tick_check_idle(cpu);
+ tick_irq_enter();
_local_bh_enable();
}
__irq_enter();
}
-#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
static inline void invoke_softirq(void)
{
- if (!force_irqthreads)
+ if (!force_irqthreads) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
+ /*
+ * We can safely execute softirq on the current stack if
+ * it is the irq stack, because it should be near empty
+ * at this stage.
+ */
__do_softirq();
- else {
- __local_bh_disable((unsigned long)__builtin_return_address(0),
- SOFTIRQ_OFFSET);
+#else
+ /*
+ * Otherwise, irq_exit() is called on the task stack that can
+ * be potentially deep already. So call softirq in its own stack
+ * to prevent from any overrun.
+ */
+ do_softirq_own_stack();
+#endif
+ } else {
wakeup_softirqd();
- __local_bh_enable(SOFTIRQ_OFFSET);
}
}
-#else
-static inline void invoke_softirq(void)
+
+static inline void tick_irq_exit(void)
{
- if (!force_irqthreads)
- do_softirq();
- else {
- __local_bh_disable((unsigned long)__builtin_return_address(0),
- SOFTIRQ_OFFSET);
- wakeup_softirqd();
- __local_bh_enable(SOFTIRQ_OFFSET);
+#ifdef CONFIG_NO_HZ_COMMON
+ int cpu = smp_processor_id();
+
+ /* Make sure that timer wheel updates are propagated */
+ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if (!in_interrupt())
+ tick_nohz_irq_exit();
}
-}
#endif
+}
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
void irq_exit(void)
{
- account_system_vtime(current);
- trace_hardirq_exit();
- sub_preempt_count(IRQ_EXIT_OFFSET);
+#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
+ local_irq_disable();
+#else
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
+ account_irq_exit_time(current);
+ preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
-#ifdef CONFIG_NO_HZ
- /* Make sure that timer wheel updates are propagated */
- if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
- tick_nohz_irq_exit();
-#endif
+ tick_irq_exit();
rcu_irq_exit();
- preempt_enable_no_resched();
+ trace_hardirq_exit(); /* must be last! */
}
/*
@@ -385,6 +420,12 @@ void raise_softirq(unsigned int nr)
local_irq_restore(flags);
}
+void __raise_softirq_irqoff(unsigned int nr)
+{
+ trace_softirq_raise(nr);
+ or_softirq_pending(1UL << nr);
+}
+
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
@@ -393,8 +434,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
/*
* Tasklets
*/
-struct tasklet_head
-{
+struct tasklet_head {
struct tasklet_struct *head;
struct tasklet_struct **tail;
};
@@ -413,7 +453,6 @@ void __tasklet_schedule(struct tasklet_struct *t)
raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
-
EXPORT_SYMBOL(__tasklet_schedule);
void __tasklet_hi_schedule(struct tasklet_struct *t)
@@ -427,7 +466,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_restore(flags);
}
-
EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
@@ -438,7 +476,6 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
__this_cpu_write(tasklet_hi_vec.head, t);
__raise_softirq_irqoff(HI_SOFTIRQ);
}
-
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
static void tasklet_action(struct softirq_action *a)
@@ -458,7 +495,8 @@ static void tasklet_action(struct softirq_action *a)
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED,
+ &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
@@ -493,7 +531,8 @@ static void tasklet_hi_action(struct softirq_action *a)
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED,
+ &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
@@ -511,7 +550,6 @@ static void tasklet_hi_action(struct softirq_action *a)
}
}
-
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
@@ -521,13 +559,12 @@ void tasklet_init(struct tasklet_struct *t,
t->func = func;
t->data = data;
}
-
EXPORT_SYMBOL(tasklet_init);
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
- printk("Attempt to kill tasklet from interrupt\n");
+ pr_notice("Attempt to kill tasklet from interrupt\n");
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
do {
@@ -537,7 +574,6 @@ void tasklet_kill(struct tasklet_struct *t)
tasklet_unlock_wait(t);
clear_bit(TASKLET_STATE_SCHED, &t->state);
}
-
EXPORT_SYMBOL(tasklet_kill);
/*
@@ -591,197 +627,41 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
}
EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
-/*
- * Remote softirq bits
- */
-
-DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
-EXPORT_PER_CPU_SYMBOL(softirq_work_list);
-
-static void __local_trigger(struct call_single_data *cp, int softirq)
-{
- struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
-
- list_add_tail(&cp->list, head);
-
- /* Trigger the softirq only if the list was previously empty. */
- if (head->next == &cp->list)
- raise_softirq_irqoff(softirq);
-}
-
-#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static void remote_softirq_receive(void *data)
-{
- struct call_single_data *cp = data;
- unsigned long flags;
- int softirq;
-
- softirq = cp->priv;
-
- local_irq_save(flags);
- __local_trigger(cp, softirq);
- local_irq_restore(flags);
-}
-
-static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
-{
- if (cpu_online(cpu)) {
- cp->func = remote_softirq_receive;
- cp->info = cp;
- cp->flags = 0;
- cp->priv = softirq;
-
- __smp_call_function_single(cpu, cp, 0);
- return 0;
- }
- return 1;
-}
-#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
-static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
-{
- return 1;
-}
-#endif
-
-/**
- * __send_remote_softirq - try to schedule softirq work on a remote cpu
- * @cp: private SMP call function data area
- * @cpu: the remote cpu
- * @this_cpu: the currently executing cpu
- * @softirq: the softirq for the work
- *
- * Attempt to schedule softirq work on a remote cpu. If this cannot be
- * done, the work is instead queued up on the local cpu.
- *
- * Interrupts must be disabled.
- */
-void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
-{
- if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
- __local_trigger(cp, softirq);
-}
-EXPORT_SYMBOL(__send_remote_softirq);
-
-/**
- * send_remote_softirq - try to schedule softirq work on a remote cpu
- * @cp: private SMP call function data area
- * @cpu: the remote cpu
- * @softirq: the softirq for the work
- *
- * Like __send_remote_softirq except that disabling interrupts and
- * computing the current cpu is done for the caller.
- */
-void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
-{
- unsigned long flags;
- int this_cpu;
-
- local_irq_save(flags);
- this_cpu = smp_processor_id();
- __send_remote_softirq(cp, cpu, this_cpu, softirq);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(send_remote_softirq);
-
-static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
- int i;
-
- local_irq_disable();
- for (i = 0; i < NR_SOFTIRQS; i++) {
- struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
- struct list_head *local_head;
-
- if (list_empty(head))
- continue;
-
- local_head = &__get_cpu_var(softirq_work_list[i]);
- list_splice_init(head, local_head);
- raise_softirq_irqoff(i);
- }
- local_irq_enable();
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
- .notifier_call = remote_softirq_cpu_notify,
-};
-
void __init softirq_init(void)
{
int cpu;
for_each_possible_cpu(cpu) {
- int i;
-
per_cpu(tasklet_vec, cpu).tail =
&per_cpu(tasklet_vec, cpu).head;
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
- for (i = 0; i < NR_SOFTIRQS; i++)
- INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
}
- register_hotcpu_notifier(&remote_softirq_cpu_notifier);
-
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
-static int run_ksoftirqd(void * __bind_cpu)
+static int ksoftirqd_should_run(unsigned int cpu)
{
- set_current_state(TASK_INTERRUPTIBLE);
-
- while (!kthread_should_stop()) {
- preempt_disable();
- if (!local_softirq_pending()) {
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-
- __set_current_state(TASK_RUNNING);
-
- while (local_softirq_pending()) {
- /* Preempt disable stops cpu going offline.
- If already offline, we'll be on wrong CPU:
- don't process */
- if (cpu_is_offline((long)__bind_cpu))
- goto wait_to_die;
- local_irq_disable();
- if (local_softirq_pending())
- __do_softirq();
- local_irq_enable();
- preempt_enable_no_resched();
- cond_resched();
- preempt_disable();
- rcu_note_context_switch((long)__bind_cpu);
- }
- preempt_enable();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
- return 0;
+ return local_softirq_pending();
+}
-wait_to_die:
- preempt_enable();
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ if (local_softirq_pending()) {
+ /*
+ * We can safely run softirq on inline stack, as we are not deep
+ * in the task stack here.
+ */
+ __do_softirq();
+ rcu_note_context_switch(cpu);
+ local_irq_enable();
+ cond_resched();
+ return;
}
- __set_current_state(TASK_RUNNING);
- return 0;
+ local_irq_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -843,69 +723,37 @@ static void takeover_tasklets(unsigned int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int __cpuinit cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
- int hotcpu = (unsigned long)hcpu;
- struct task_struct *p;
-
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- p = kthread_create_on_node(run_ksoftirqd,
- hcpu,
- cpu_to_node(hotcpu),
- "ksoftirqd/%d", hotcpu);
- if (IS_ERR(p)) {
- printk("ksoftirqd for %i failed\n", hotcpu);
- return notifier_from_errno(PTR_ERR(p));
- }
- kthread_bind(p, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = p;
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- wake_up_process(per_cpu(ksoftirqd, hotcpu));
- break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!per_cpu(ksoftirqd, hotcpu))
- break;
- /* Unbind so it can run. Fall thru. */
- kthread_bind(per_cpu(ksoftirqd, hotcpu),
- cpumask_any(cpu_online_mask));
case CPU_DEAD:
- case CPU_DEAD_FROZEN: {
- static const struct sched_param param = {
- .sched_priority = MAX_RT_PRIO-1
- };
-
- p = per_cpu(ksoftirqd, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = NULL;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- kthread_stop(p);
- takeover_tasklets(hotcpu);
+ case CPU_DEAD_FROZEN:
+ takeover_tasklets((unsigned long)hcpu);
break;
- }
#endif /* CONFIG_HOTPLUG_CPU */
- }
+ }
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata cpu_nfb = {
+static struct notifier_block cpu_nfb = {
.notifier_call = cpu_callback
};
+static struct smp_hotplug_thread softirq_threads = {
+ .store = &ksoftirqd,
+ .thread_should_run = ksoftirqd_should_run,
+ .thread_fn = run_ksoftirqd,
+ .thread_comm = "ksoftirqd/%u",
+};
+
static __init int spawn_ksoftirqd(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-
- BUG_ON(err != NOTIFY_OK);
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
+
+ BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
return 0;
}
early_initcall(spawn_ksoftirqd);
@@ -920,7 +768,6 @@ int __init __weak early_irq_init(void)
return 0;
}
-#ifdef CONFIG_GENERIC_HARDIRQS
int __init __weak arch_probe_nr_irqs(void)
{
return NR_IRQS_LEGACY;
@@ -930,4 +777,8 @@ int __init __weak arch_early_irq_init(void)
{
return 0;
}
-#endif
+
+unsigned int __weak arch_dynirq_lower_bound(unsigned int from)
+{
+ return from;
+}
diff --git a/kernel/srcu.c b/kernel/srcu.c
deleted file mode 100644
index 0febf61e1aa..00000000000
--- a/kernel/srcu.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-
-#include <linux/export.h>
-#include <linux/mutex.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/srcu.h>
-
-static int init_srcu_struct_fields(struct srcu_struct *sp)
-{
- sp->completed = 0;
- mutex_init(&sp->mutex);
- sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
- return sp->per_cpu_ref ? 0 : -ENOMEM;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
- struct lock_class_key *key)
-{
- /* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(__init_srcu_struct);
-
-#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
- *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function. Each srcu_struct represents a separate domain
- * of SRCU protection.
- */
-int init_srcu_struct(struct srcu_struct *sp)
-{
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-
-#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/*
- * srcu_readers_active_idx -- returns approximate number of readers
- * active on the specified rank of per-CPU counters.
- */
-
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- int sum;
-
- sum = 0;
- for_each_possible_cpu(cpu)
- sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
- return sum;
-}
-
-/**
- * srcu_readers_active - returns approximate number of readers.
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
- *
- * Note that this is not an atomic primitive, and can therefore suffer
- * severe errors when invoked on an active srcu_struct. That said, it
- * can be useful as an error check at cleanup time.
- */
-static int srcu_readers_active(struct srcu_struct *sp)
-{
- return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
-}
-
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
-{
- int sum;
-
- sum = srcu_readers_active(sp);
- WARN_ON(sum); /* Leakage unless caller handles error. */
- if (sum != 0)
- return;
- free_percpu(sp->per_cpu_ref);
- sp->per_cpu_ref = NULL;
-}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Must be called from process context.
- * Returns an index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- preempt_disable();
- idx = sp->completed & 0x1;
- barrier(); /* ensure compiler looks -once- at sp->completed. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
- preempt_enable();
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
- * Removes the count for the old reader from the appropriate per-CPU
- * element of the srcu_struct. Note that this may well be a different
- * CPU than that which was incremented by the corresponding srcu_read_lock().
- * Must be called from process context.
- */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
-{
- preempt_disable();
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
- preempt_enable();
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods. This approach
- * has done well in testing, so there is no need for a config parameter.
- */
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
-
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- */
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
-{
- int idx;
-
- idx = sp->completed;
- mutex_lock(&sp->mutex);
-
- /*
- * Check to see if someone else did the work for us while we were
- * waiting to acquire the lock. We need -two- advances of
- * the counter, not just one. If there was but one, we might have
- * shown up -after- our helper's first synchronize_sched(), thus
- * having failed to prevent CPU-reordering races with concurrent
- * srcu_read_unlock()s on other CPUs (see comment below). So we
- * either (1) wait for two or (2) supply the second ourselves.
- */
-
- if ((sp->completed - idx) >= 2) {
- mutex_unlock(&sp->mutex);
- return;
- }
-
- sync_func(); /* Force memory barrier on all CPUs. */
-
- /*
- * The preceding synchronize_sched() ensures that any CPU that
- * sees the new value of sp->completed will also see any preceding
- * changes to data structures made by this CPU. This prevents
- * some other CPU from reordering the accesses in its SRCU
- * read-side critical section to precede the corresponding
- * srcu_read_lock() -- ensuring that such references will in
- * fact be protected.
- *
- * So it is now safe to do the flip.
- */
-
- idx = sp->completed & 0x1;
- sp->completed++;
-
- sync_func(); /* Force memory barrier on all CPUs. */
-
- /*
- * At this point, because of the preceding synchronize_sched(),
- * all srcu_read_lock() calls using the old counters have completed.
- * Their corresponding critical sections might well be still
- * executing, but the srcu_read_lock() primitives themselves
- * will have finished executing. We initially give readers
- * an arbitrarily chosen 10 microseconds to get out of their
- * SRCU read-side critical sections, then loop waiting 1/HZ
- * seconds per iteration. The 10-microsecond value has done
- * very well in testing.
- */
-
- if (srcu_readers_active_idx(sp, idx))
- udelay(SYNCHRONIZE_SRCU_READER_DELAY);
- while (srcu_readers_active_idx(sp, idx))
- schedule_timeout_interruptible(1);
-
- sync_func(); /* Force memory barrier on all CPUs. */
-
- /*
- * The preceding synchronize_sched() forces all srcu_read_unlock()
- * primitives that were executing concurrently with the preceding
- * for_each_possible_cpu() loop to have completed by this point.
- * More importantly, it also forces the corresponding SRCU read-side
- * critical sections to have also completed, and the corresponding
- * references to SRCU-protected data items to be dropped.
- *
- * Note:
- *
- * Despite what you might think at first glance, the
- * preceding synchronize_sched() -must- be within the
- * critical section ended by the following mutex_unlock().
- * Otherwise, a task taking the early exit can race
- * with a srcu_read_unlock(), which might have executed
- * just before the preceding srcu_readers_active() check,
- * and whose CPU might have reordered the srcu_read_unlock()
- * with the preceding critical section. In this case, there
- * is nothing preventing the synchronize_sched() task that is
- * taking the early exit from freeing a data structure that
- * is still being referenced (out of order) by the task
- * doing the srcu_read_unlock().
- *
- * Alternatively, the comparison with "2" on the early exit
- * could be changed to "3", but this increases synchronize_srcu()
- * latency for bulk loads. So the current code is preferred.
- */
-
- mutex_unlock(&sp->mutex);
-}
-
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates. Can block; must be called from
- * process context.
- *
- * Note that it is illegal to call synchronize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section.
- */
-void synchronize_srcu(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, synchronize_sched);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
-
-/**
- * synchronize_srcu_expedited - like synchronize_srcu, but less patient
- * @sp: srcu_struct with which to synchronize.
- *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates. Can block; must be called from
- * process context.
- *
- * Note that it is illegal to call synchronize_srcu_expedited()
- * from the corresponding SRCU read-side critical section; doing so
- * will result in deadlock. However, it is perfectly legal to call
- * synchronize_srcu_expedited() on one srcu_struct from some other
- * srcu_struct's read-side critical section.
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, synchronize_sched_expedited);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
-
-long srcu_batches_completed(struct srcu_struct *sp)
-{
- return sp->completed;
-}
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e96571..695f0c6cd16 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,8 +18,9 @@
#include <linux/stop_machine.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
-
+#include <linux/smpboot.h>
#include <linux/atomic.h>
+#include <linux/lglock.h>
/*
* Structure to determine completion condition and record errors. May
@@ -37,12 +38,20 @@ struct cpu_stopper {
spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
- struct task_struct *thread; /* stopper thread */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
+/*
+ * Avoids a race between stop_two_cpus and global stop_cpus, where
+ * the stoppers could get queued up in reverse order, leading to
+ * system deadlock. Using an lglock means stop_two_cpus remains
+ * relatively cheap.
+ */
+DEFINE_STATIC_LGLOCK(stop_cpus_lock);
+
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
@@ -62,16 +71,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work)
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
+
unsigned long flags;
spin_lock_irqsave(&stopper->lock, flags);
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
- wake_up_process(stopper->thread);
+ wake_up_process(p);
} else
cpu_stop_signal_done(work->done, false);
@@ -108,8 +119,186 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+ cpu_stop_queue_work(cpu, &work);
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/* This controls the threads on each CPU. */
+enum multi_stop_state {
+ /* Dummy starting state for thread. */
+ MULTI_STOP_NONE,
+ /* Awaiting everyone to be scheduled. */
+ MULTI_STOP_PREPARE,
+ /* Disable interrupts. */
+ MULTI_STOP_DISABLE_IRQ,
+ /* Run the function */
+ MULTI_STOP_RUN,
+ /* Exit */
+ MULTI_STOP_EXIT,
+};
+
+struct multi_stop_data {
+ int (*fn)(void *);
+ void *data;
+ /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+ unsigned int num_threads;
+ const struct cpumask *active_cpus;
+
+ enum multi_stop_state state;
+ atomic_t thread_ack;
+};
+
+static void set_state(struct multi_stop_data *msdata,
+ enum multi_stop_state newstate)
+{
+ /* Reset ack counter. */
+ atomic_set(&msdata->thread_ack, msdata->num_threads);
+ smp_wmb();
+ msdata->state = newstate;
+}
+
+/* Last one to ack a state moves to the next state. */
+static void ack_state(struct multi_stop_data *msdata)
+{
+ if (atomic_dec_and_test(&msdata->thread_ack))
+ set_state(msdata, msdata->state + 1);
+}
+
+/* This is the cpu_stop function which stops the CPU. */
+static int multi_cpu_stop(void *data)
+{
+ struct multi_stop_data *msdata = data;
+ enum multi_stop_state curstate = MULTI_STOP_NONE;
+ int cpu = smp_processor_id(), err = 0;
+ unsigned long flags;
+ bool is_active;
+
+ /*
+ * When called from stop_machine_from_inactive_cpu(), irq might
+ * already be disabled. Save the state and restore it on exit.
+ */
+ local_save_flags(flags);
+
+ if (!msdata->active_cpus)
+ is_active = cpu == cpumask_first(cpu_online_mask);
+ else
+ is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
+
+ /* Simple state machine */
+ do {
+ /* Chill out and ensure we re-read multi_stop_state. */
+ cpu_relax();
+ if (msdata->state != curstate) {
+ curstate = msdata->state;
+ switch (curstate) {
+ case MULTI_STOP_DISABLE_IRQ:
+ local_irq_disable();
+ hard_irq_disable();
+ break;
+ case MULTI_STOP_RUN:
+ if (is_active)
+ err = msdata->fn(msdata->data);
+ break;
+ default:
+ break;
+ }
+ ack_state(msdata);
+ }
+ } while (curstate != MULTI_STOP_EXIT);
+
+ local_irq_restore(flags);
+ return err;
+}
+
+struct irq_cpu_stop_queue_work_info {
+ int cpu1;
+ int cpu2;
+ struct cpu_stop_work *work1;
+ struct cpu_stop_work *work2;
+};
+
+/*
+ * This function is always run with irqs and preemption disabled.
+ * This guarantees that both work1 and work2 get queued, before
+ * our local migrate thread gets the chance to preempt us.
+ */
+static void irq_cpu_stop_queue_work(void *arg)
+{
+ struct irq_cpu_stop_queue_work_info *info = arg;
+ cpu_stop_queue_work(info->cpu1, info->work1);
+ cpu_stop_queue_work(info->cpu2, info->work2);
+}
+
+/**
+ * stop_two_cpus - stops two cpus
+ * @cpu1: the cpu to stop
+ * @cpu2: the other cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Stops both the current and specified CPU and runs @fn on one of them.
+ *
+ * returns when both are completed.
+ */
+int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+ struct cpu_stop_work work1, work2;
+ struct irq_cpu_stop_queue_work_info call_args;
+ struct multi_stop_data msdata;
+
+ preempt_disable();
+ msdata = (struct multi_stop_data){
+ .fn = fn,
+ .data = arg,
+ .num_threads = 2,
+ .active_cpus = cpumask_of(cpu1),
+ };
+
+ work1 = work2 = (struct cpu_stop_work){
+ .fn = multi_cpu_stop,
+ .arg = &msdata,
+ .done = &done
+ };
+
+ call_args = (struct irq_cpu_stop_queue_work_info){
+ .cpu1 = cpu1,
+ .cpu2 = cpu2,
+ .work1 = &work1,
+ .work2 = &work2,
+ };
+
+ cpu_stop_init_done(&done, 2);
+ set_state(&msdata, MULTI_STOP_PREPARE);
+
+ /*
+ * If we observe both CPUs active we know _cpu_down() cannot yet have
+ * queued its stop_machine works and therefore ours will get executed
+ * first. Or its not either one of our CPUs that's getting unplugged,
+ * in which case we don't care.
+ *
+ * This relies on the stopper workqueues to be FIFO.
+ */
+ if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+ preempt_enable();
+ return -ENOENT;
+ }
+
+ lg_local_lock(&stop_cpus_lock);
+ /*
+ * Queuing needs to be done by the lowest numbered CPU, to ensure
+ * that works are always queued in the same order on every CPU.
+ * This prevents deadlocks.
+ */
+ smp_call_function_single(min(cpu1, cpu2),
+ &irq_cpu_stop_queue_work,
+ &call_args, 1);
+ lg_local_unlock(&stop_cpus_lock);
+ preempt_enable();
+
wait_for_completion(&done.completion);
+
return done.executed ? done.ret : -ENOENT;
}
@@ -118,6 +307,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
* @cpu: cpu to stop
* @fn: function to execute
* @arg: argument to @fn
+ * @work_buf: pointer to cpu_stop_work structure
*
* Similar to stop_one_cpu() but doesn't wait for completion. The
* caller is responsible for ensuring @work_buf is currently unused
@@ -130,7 +320,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+ cpu_stop_queue_work(cpu, work_buf);
}
/* static data for stop_cpus */
@@ -157,11 +347,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- preempt_disable();
+ lg_global_lock(&stop_cpus_lock);
for_each_cpu(cpu, cpumask)
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
- &per_cpu(stop_cpus_work, cpu));
- preempt_enable();
+ cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
+ lg_global_unlock(&stop_cpus_lock);
}
static int __stop_cpus(const struct cpumask *cpumask,
@@ -244,20 +433,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-static int cpu_stopper_thread(void *data)
+static int cpu_stop_should_run(unsigned int cpu)
{
- struct cpu_stopper *stopper = data;
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+ int run;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ run = !list_empty(&stopper->works);
+ spin_unlock_irqrestore(&stopper->lock, flags);
+ return run;
+}
+
+static void cpu_stopper_thread(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct cpu_stop_work *work;
int ret;
repeat:
- set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
work = NULL;
spin_lock_irq(&stopper->lock);
if (!list_empty(&stopper->works)) {
@@ -273,8 +467,6 @@ repeat:
struct cpu_stop_done *done = work->done;
char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
- __set_current_state(TASK_RUNNING);
-
/* cpu stop callbacks are not allowed to sleep */
preempt_disable();
@@ -290,88 +482,55 @@ repeat:
ksym_buf), arg);
cpu_stop_signal_done(done, true);
- } else
- schedule();
-
- goto repeat;
+ goto repeat;
+ }
}
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
-static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static void cpu_stop_create(unsigned int cpu)
+{
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+}
+
+static void cpu_stop_park(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct task_struct *p;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- BUG_ON(stopper->thread || stopper->enabled ||
- !list_empty(&stopper->works));
- p = kthread_create_on_node(cpu_stopper_thread,
- stopper,
- cpu_to_node(cpu),
- "migration/%d", cpu);
- if (IS_ERR(p))
- return notifier_from_errno(PTR_ERR(p));
- get_task_struct(p);
- kthread_bind(p, cpu);
- sched_set_stop_task(cpu, p);
- stopper->thread = p;
- break;
-
- case CPU_ONLINE:
- /* strictly unnecessary, as first user will wake it */
- wake_up_process(stopper->thread);
- /* mark enabled */
- spin_lock_irq(&stopper->lock);
- stopper->enabled = true;
- spin_unlock_irq(&stopper->lock);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_POST_DEAD:
- {
- struct cpu_stop_work *work;
-
- sched_set_stop_task(cpu, NULL);
- /* kill the stopper */
- kthread_stop(stopper->thread);
- /* drain remaining works */
- spin_lock_irq(&stopper->lock);
- list_for_each_entry(work, &stopper->works, list)
- cpu_stop_signal_done(work->done, false);
- stopper->enabled = false;
- spin_unlock_irq(&stopper->lock);
- /* release the stopper */
- put_task_struct(stopper->thread);
- stopper->thread = NULL;
- break;
- }
-#endif
- }
+ struct cpu_stop_work *work;
+ unsigned long flags;
- return NOTIFY_OK;
+ /* drain remaining works */
+ spin_lock_irqsave(&stopper->lock, flags);
+ list_for_each_entry(work, &stopper->works, list)
+ cpu_stop_signal_done(work->done, false);
+ stopper->enabled = false;
+ spin_unlock_irqrestore(&stopper->lock, flags);
}
-/*
- * Give it a higher priority so that cpu stopper is available to other
- * cpu notifiers. It currently shares the same priority as sched
- * migration_notifier.
- */
-static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
- .notifier_call = cpu_stop_cpu_callback,
- .priority = 10,
+static void cpu_stop_unpark(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_irq(&stopper->lock);
+ stopper->enabled = true;
+ spin_unlock_irq(&stopper->lock);
+}
+
+static struct smp_hotplug_thread cpu_stop_threads = {
+ .store = &cpu_stopper_task,
+ .thread_should_run = cpu_stop_should_run,
+ .thread_fn = cpu_stopper_thread,
+ .thread_comm = "migration/%u",
+ .create = cpu_stop_create,
+ .setup = cpu_stop_unpark,
+ .park = cpu_stop_park,
+ .pre_unpark = cpu_stop_unpark,
+ .selfparking = true,
};
static int __init cpu_stop_init(void)
{
- void *bcpu = (void *)(long)smp_processor_id();
unsigned int cpu;
- int err;
for_each_possible_cpu(cpu) {
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,113 +539,22 @@ static int __init cpu_stop_init(void)
INIT_LIST_HEAD(&stopper->works);
}
- /* start one for the boot cpu */
- err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
- bcpu);
- BUG_ON(err != NOTIFY_OK);
- cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
- register_cpu_notifier(&cpu_stop_cpu_notifier);
-
+ BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
stop_machine_initialized = true;
-
return 0;
}
early_initcall(cpu_stop_init);
#ifdef CONFIG_STOP_MACHINE
-/* This controls the threads on each CPU. */
-enum stopmachine_state {
- /* Dummy starting state for thread. */
- STOPMACHINE_NONE,
- /* Awaiting everyone to be scheduled. */
- STOPMACHINE_PREPARE,
- /* Disable interrupts. */
- STOPMACHINE_DISABLE_IRQ,
- /* Run the function */
- STOPMACHINE_RUN,
- /* Exit */
- STOPMACHINE_EXIT,
-};
-
-struct stop_machine_data {
- int (*fn)(void *);
- void *data;
- /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
- unsigned int num_threads;
- const struct cpumask *active_cpus;
-
- enum stopmachine_state state;
- atomic_t thread_ack;
-};
-
-static void set_state(struct stop_machine_data *smdata,
- enum stopmachine_state newstate)
-{
- /* Reset ack counter. */
- atomic_set(&smdata->thread_ack, smdata->num_threads);
- smp_wmb();
- smdata->state = newstate;
-}
-
-/* Last one to ack a state moves to the next state. */
-static void ack_state(struct stop_machine_data *smdata)
-{
- if (atomic_dec_and_test(&smdata->thread_ack))
- set_state(smdata, smdata->state + 1);
-}
-
-/* This is the cpu_stop function which stops the CPU. */
-static int stop_machine_cpu_stop(void *data)
-{
- struct stop_machine_data *smdata = data;
- enum stopmachine_state curstate = STOPMACHINE_NONE;
- int cpu = smp_processor_id(), err = 0;
- unsigned long flags;
- bool is_active;
-
- /*
- * When called from stop_machine_from_inactive_cpu(), irq might
- * already be disabled. Save the state and restore it on exit.
- */
- local_save_flags(flags);
-
- if (!smdata->active_cpus)
- is_active = cpu == cpumask_first(cpu_online_mask);
- else
- is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
-
- /* Simple state machine */
- do {
- /* Chill out and ensure we re-read stopmachine_state. */
- cpu_relax();
- if (smdata->state != curstate) {
- curstate = smdata->state;
- switch (curstate) {
- case STOPMACHINE_DISABLE_IRQ:
- local_irq_disable();
- hard_irq_disable();
- break;
- case STOPMACHINE_RUN:
- if (is_active)
- err = smdata->fn(smdata->data);
- break;
- default:
- break;
- }
- ack_state(smdata);
- }
- } while (curstate != STOPMACHINE_EXIT);
-
- local_irq_restore(flags);
- return err;
-}
-
int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
- struct stop_machine_data smdata = { .fn = fn, .data = data,
- .num_threads = num_online_cpus(),
- .active_cpus = cpus };
+ struct multi_stop_data msdata = {
+ .fn = fn,
+ .data = data,
+ .num_threads = num_online_cpus(),
+ .active_cpus = cpus,
+ };
if (!stop_machine_initialized) {
/*
@@ -497,7 +565,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
unsigned long flags;
int ret;
- WARN_ON_ONCE(smdata.num_threads != 1);
+ WARN_ON_ONCE(msdata.num_threads != 1);
local_irq_save(flags);
hard_irq_disable();
@@ -508,8 +576,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
}
/* Set the initial state and stop all online cpus. */
- set_state(&smdata, STOPMACHINE_PREPARE);
- return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
+ set_state(&msdata, MULTI_STOP_PREPARE);
+ return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}
int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -549,25 +617,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
const struct cpumask *cpus)
{
- struct stop_machine_data smdata = { .fn = fn, .data = data,
+ struct multi_stop_data msdata = { .fn = fn, .data = data,
.active_cpus = cpus };
struct cpu_stop_done done;
int ret;
/* Local CPU must be inactive and CPU hotplug in progress. */
BUG_ON(cpu_active(raw_smp_processor_id()));
- smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
+ msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
/* No proper task established and can't sleep - busy wait for lock. */
while (!mutex_trylock(&stop_cpus_mutex))
cpu_relax();
/* Schedule work on other CPUs and execute directly for local CPU */
- set_state(&smdata, STOPMACHINE_PREPARE);
+ set_state(&msdata, MULTI_STOP_PREPARE);
cpu_stop_init_done(&done, num_active_cpus());
- queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
+ queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
&done);
- ret = stop_machine_cpu_stop(&smdata);
+ ret = multi_cpu_stop(&msdata);
/* Busy wait for completion. */
while (!completion_done(&done.completion))
diff --git a/kernel/sys.c b/kernel/sys.c
index 40701538fbd..66a751ebf9d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,7 +16,6 @@
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
-#include <linux/kexec.h>
#include <linux/workqueue.h>
#include <linux/capability.h>
#include <linux/device.h>
@@ -36,6 +35,8 @@
#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
+#include <linux/file.h>
+#include <linux/mount.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/version.h>
@@ -45,6 +46,12 @@
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
+
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/uidgid.h>
+#include <linux/cred.h>
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
@@ -93,10 +100,8 @@
int overflowuid = DEFAULT_OVERFLOWUID;
int overflowgid = DEFAULT_OVERFLOWGID;
-#ifdef CONFIG_UID16
EXPORT_SYMBOL(overflowuid);
EXPORT_SYMBOL(overflowgid);
-#endif
/*
* the same as above, but for filesystems which can only store a 16-bit
@@ -110,20 +115,6 @@ EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
/*
- * this indicates whether you can reboot with ctrl-alt-del: the default is yes
- */
-
-int C_A_D = 1;
-struct pid *cad_pid;
-EXPORT_SYMBOL(cad_pid);
-
-/*
- * If set, this is used for preparing the system to power off.
- */
-
-void (*pm_power_off_prepare)(void);
-
-/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
*
@@ -133,11 +124,10 @@ static bool set_one_prio_perm(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred = __task_cred(p);
- if (pcred->user->user_ns == cred->user->user_ns &&
- (pcred->uid == cred->euid ||
- pcred->euid == cred->euid))
+ if (uid_eq(pcred->uid, cred->euid) ||
+ uid_eq(pcred->euid, cred->euid))
return true;
- if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+ if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
return true;
return false;
}
@@ -177,16 +167,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
const struct cred *cred = current_cred();
int error = -EINVAL;
struct pid *pgrp;
+ kuid_t uid;
if (which > PRIO_USER || which < PRIO_PROCESS)
goto out;
/* normalize: avoid signed division (rounding problems) */
error = -ESRCH;
- if (niceval < -20)
- niceval = -20;
- if (niceval > 19)
- niceval = 19;
+ if (niceval < MIN_NICE)
+ niceval = MIN_NICE;
+ if (niceval > MAX_NICE)
+ niceval = MAX_NICE;
rcu_read_lock();
read_lock(&tasklist_lock);
@@ -209,18 +200,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
- user = (struct user_struct *) cred->user;
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
if (!who)
- who = cred->uid;
- else if ((who != cred->uid) &&
- !(user = find_user(who)))
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid) &&
+ !(user = find_user(uid)))
goto out_unlock; /* No processes for this user */
do_each_thread(g, p) {
- if (__task_cred(p)->uid == who)
+ if (uid_eq(task_uid(p), uid))
error = set_one_prio(p, niceval, error);
} while_each_thread(g, p);
- if (who != cred->uid)
+ if (!uid_eq(uid, cred->uid))
free_uid(user); /* For find_user() */
break;
}
@@ -244,6 +236,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
const struct cred *cred = current_cred();
long niceval, retval = -ESRCH;
struct pid *pgrp;
+ kuid_t uid;
if (which > PRIO_USER || which < PRIO_PROCESS)
return -EINVAL;
@@ -257,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
p = current;
if (p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
@@ -268,27 +261,28 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
pgrp = task_pgrp(current);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
- user = (struct user_struct *) cred->user;
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
if (!who)
- who = cred->uid;
- else if ((who != cred->uid) &&
- !(user = find_user(who)))
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid) &&
+ !(user = find_user(uid)))
goto out_unlock; /* No processes for this user */
do_each_thread(g, p) {
- if (__task_cred(p)->uid == who) {
- niceval = 20 - task_nice(p);
+ if (uid_eq(task_uid(p), uid)) {
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
} while_each_thread(g, p);
- if (who != cred->uid)
+ if (!uid_eq(uid, cred->uid))
free_uid(user); /* for find_user() */
break;
}
@@ -299,231 +293,6 @@ out_unlock:
return retval;
}
-/**
- * emergency_restart - reboot the system
- *
- * Without shutting down any hardware or taking any locks
- * reboot the system. This is called when we know we are in
- * trouble so this is our best effort to reboot. This is
- * safe to call in interrupt context.
- */
-void emergency_restart(void)
-{
- kmsg_dump(KMSG_DUMP_EMERG);
- machine_emergency_restart();
-}
-EXPORT_SYMBOL_GPL(emergency_restart);
-
-void kernel_restart_prepare(char *cmd)
-{
- blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
- system_state = SYSTEM_RESTART;
- usermodehelper_disable();
- device_shutdown();
- syscore_shutdown();
-}
-
-/**
- * register_reboot_notifier - Register function to be called at reboot time
- * @nb: Info about notifier function to be called
- *
- * Registers a function with the list of functions
- * to be called at reboot time.
- *
- * Currently always returns zero, as blocking_notifier_chain_register()
- * always returns zero.
- */
-int register_reboot_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&reboot_notifier_list, nb);
-}
-EXPORT_SYMBOL(register_reboot_notifier);
-
-/**
- * unregister_reboot_notifier - Unregister previously registered reboot notifier
- * @nb: Hook to be unregistered
- *
- * Unregisters a previously registered reboot
- * notifier function.
- *
- * Returns zero on success, or %-ENOENT on failure.
- */
-int unregister_reboot_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
-}
-EXPORT_SYMBOL(unregister_reboot_notifier);
-
-/**
- * kernel_restart - reboot the system
- * @cmd: pointer to buffer containing command to execute for restart
- * or %NULL
- *
- * Shutdown everything and perform a clean reboot.
- * This is not safe to call in interrupt context.
- */
-void kernel_restart(char *cmd)
-{
- kernel_restart_prepare(cmd);
- if (!cmd)
- printk(KERN_EMERG "Restarting system.\n");
- else
- printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
- kmsg_dump(KMSG_DUMP_RESTART);
- machine_restart(cmd);
-}
-EXPORT_SYMBOL_GPL(kernel_restart);
-
-static void kernel_shutdown_prepare(enum system_states state)
-{
- blocking_notifier_call_chain(&reboot_notifier_list,
- (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
- system_state = state;
- usermodehelper_disable();
- device_shutdown();
-}
-/**
- * kernel_halt - halt the system
- *
- * Shutdown everything and perform a clean system halt.
- */
-void kernel_halt(void)
-{
- kernel_shutdown_prepare(SYSTEM_HALT);
- syscore_shutdown();
- printk(KERN_EMERG "System halted.\n");
- kmsg_dump(KMSG_DUMP_HALT);
- machine_halt();
-}
-
-EXPORT_SYMBOL_GPL(kernel_halt);
-
-/**
- * kernel_power_off - power_off the system
- *
- * Shutdown everything and perform a clean system power_off.
- */
-void kernel_power_off(void)
-{
- kernel_shutdown_prepare(SYSTEM_POWER_OFF);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
- disable_nonboot_cpus();
- syscore_shutdown();
- printk(KERN_EMERG "Power down.\n");
- kmsg_dump(KMSG_DUMP_POWEROFF);
- machine_power_off();
-}
-EXPORT_SYMBOL_GPL(kernel_power_off);
-
-static DEFINE_MUTEX(reboot_mutex);
-
-/*
- * Reboot system call: for obvious reasons only root may call it,
- * and even root needs to set up some magic numbers in the registers
- * so that some mistake won't make this reboot the whole machine.
- * You can also set the meaning of the ctrl-alt-del-key here.
- *
- * reboot doesn't sync: do that yourself before calling this.
- */
-SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
- void __user *, arg)
-{
- char buffer[256];
- int ret = 0;
-
- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT))
- return -EPERM;
-
- /* For safety, we require "magic" arguments. */
- if (magic1 != LINUX_REBOOT_MAGIC1 ||
- (magic2 != LINUX_REBOOT_MAGIC2 &&
- magic2 != LINUX_REBOOT_MAGIC2A &&
- magic2 != LINUX_REBOOT_MAGIC2B &&
- magic2 != LINUX_REBOOT_MAGIC2C))
- return -EINVAL;
-
- /* Instead of trying to make the power_off code look like
- * halt when pm_power_off is not set do it the easy way.
- */
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
- cmd = LINUX_REBOOT_CMD_HALT;
-
- mutex_lock(&reboot_mutex);
- switch (cmd) {
- case LINUX_REBOOT_CMD_RESTART:
- kernel_restart(NULL);
- break;
-
- case LINUX_REBOOT_CMD_CAD_ON:
- C_A_D = 1;
- break;
-
- case LINUX_REBOOT_CMD_CAD_OFF:
- C_A_D = 0;
- break;
-
- case LINUX_REBOOT_CMD_HALT:
- kernel_halt();
- do_exit(0);
- panic("cannot halt");
-
- case LINUX_REBOOT_CMD_POWER_OFF:
- kernel_power_off();
- do_exit(0);
- break;
-
- case LINUX_REBOOT_CMD_RESTART2:
- if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
- ret = -EFAULT;
- break;
- }
- buffer[sizeof(buffer) - 1] = '\0';
-
- kernel_restart(buffer);
- break;
-
-#ifdef CONFIG_KEXEC
- case LINUX_REBOOT_CMD_KEXEC:
- ret = kernel_kexec();
- break;
-#endif
-
-#ifdef CONFIG_HIBERNATION
- case LINUX_REBOOT_CMD_SW_SUSPEND:
- ret = hibernate();
- break;
-#endif
-
- default:
- ret = -EINVAL;
- break;
- }
- mutex_unlock(&reboot_mutex);
- return ret;
-}
-
-static void deferred_cad(struct work_struct *dummy)
-{
- kernel_restart(NULL);
-}
-
-/*
- * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
- * As it's called within an interrupt, it may NOT sync: the only choice
- * is whether to reboot at once, or just ignore the ctrl-alt-del.
- */
-void ctrl_alt_del(void)
-{
- static DECLARE_WORK(cad_work, deferred_cad);
-
- if (C_A_D)
- schedule_work(&cad_work);
- else
- kill_cad_pid(SIGINT, 1);
-}
-
/*
* Unprivileged users may change the real gid to the effective gid
* or vice versa. (BSD-style)
@@ -544,9 +313,19 @@ void ctrl_alt_del(void)
*/
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kgid_t krgid, kegid;
+
+ krgid = make_kgid(ns, rgid);
+ kegid = make_kgid(ns, egid);
+
+ if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+ return -EINVAL;
+ if ((egid != (gid_t) -1) && !gid_valid(kegid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -555,25 +334,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
retval = -EPERM;
if (rgid != (gid_t) -1) {
- if (old->gid == rgid ||
- old->egid == rgid ||
- nsown_capable(CAP_SETGID))
- new->gid = rgid;
+ if (gid_eq(old->gid, krgid) ||
+ gid_eq(old->egid, krgid) ||
+ ns_capable(old->user_ns, CAP_SETGID))
+ new->gid = krgid;
else
goto error;
}
if (egid != (gid_t) -1) {
- if (old->gid == egid ||
- old->egid == egid ||
- old->sgid == egid ||
- nsown_capable(CAP_SETGID))
- new->egid = egid;
+ if (gid_eq(old->gid, kegid) ||
+ gid_eq(old->egid, kegid) ||
+ gid_eq(old->sgid, kegid) ||
+ ns_capable(old->user_ns, CAP_SETGID))
+ new->egid = kegid;
else
goto error;
}
if (rgid != (gid_t) -1 ||
- (egid != (gid_t) -1 && egid != old->gid))
+ (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
new->sgid = new->egid;
new->fsgid = new->egid;
@@ -591,9 +370,15 @@ error:
*/
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kgid_t kgid;
+
+ kgid = make_kgid(ns, gid);
+ if (!gid_valid(kgid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -601,10 +386,10 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
old = current_cred();
retval = -EPERM;
- if (nsown_capable(CAP_SETGID))
- new->gid = new->egid = new->sgid = new->fsgid = gid;
- else if (gid == old->gid || gid == old->sgid)
- new->egid = new->fsgid = gid;
+ if (ns_capable(old->user_ns, CAP_SETGID))
+ new->gid = new->egid = new->sgid = new->fsgid = kgid;
+ else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
+ new->egid = new->fsgid = kgid;
else
goto error;
@@ -622,7 +407,7 @@ static int set_user(struct cred *new)
{
struct user_struct *new_user;
- new_user = alloc_uid(current_user_ns(), new->uid);
+ new_user = alloc_uid(new->uid);
if (!new_user)
return -EAGAIN;
@@ -661,9 +446,19 @@ static int set_user(struct cred *new)
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kuid_t kruid, keuid;
+
+ kruid = make_kuid(ns, ruid);
+ keuid = make_kuid(ns, euid);
+
+ if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+ return -EINVAL;
+ if ((euid != (uid_t) -1) && !uid_valid(keuid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -672,29 +467,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
retval = -EPERM;
if (ruid != (uid_t) -1) {
- new->uid = ruid;
- if (old->uid != ruid &&
- old->euid != ruid &&
- !nsown_capable(CAP_SETUID))
+ new->uid = kruid;
+ if (!uid_eq(old->uid, kruid) &&
+ !uid_eq(old->euid, kruid) &&
+ !ns_capable(old->user_ns, CAP_SETUID))
goto error;
}
if (euid != (uid_t) -1) {
- new->euid = euid;
- if (old->uid != euid &&
- old->euid != euid &&
- old->suid != euid &&
- !nsown_capable(CAP_SETUID))
+ new->euid = keuid;
+ if (!uid_eq(old->uid, keuid) &&
+ !uid_eq(old->euid, keuid) &&
+ !uid_eq(old->suid, keuid) &&
+ !ns_capable(old->user_ns, CAP_SETUID))
goto error;
}
- if (new->uid != old->uid) {
+ if (!uid_eq(new->uid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
if (ruid != (uid_t) -1 ||
- (euid != (uid_t) -1 && euid != old->uid))
+ (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
new->suid = new->euid;
new->fsuid = new->euid;
@@ -722,9 +517,15 @@ error:
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kuid_t kuid;
+
+ kuid = make_kuid(ns, uid);
+ if (!uid_valid(kuid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -732,18 +533,18 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
old = current_cred();
retval = -EPERM;
- if (nsown_capable(CAP_SETUID)) {
- new->suid = new->uid = uid;
- if (uid != old->uid) {
+ if (ns_capable(old->user_ns, CAP_SETUID)) {
+ new->suid = new->uid = kuid;
+ if (!uid_eq(kuid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
- } else if (uid != old->uid && uid != new->suid) {
+ } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
goto error;
}
- new->fsuid = new->euid = uid;
+ new->fsuid = new->euid = kuid;
retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
if (retval < 0)
@@ -763,9 +564,24 @@ error:
*/
SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kuid_t kruid, keuid, ksuid;
+
+ kruid = make_kuid(ns, ruid);
+ keuid = make_kuid(ns, euid);
+ ksuid = make_kuid(ns, suid);
+
+ if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+ return -EINVAL;
+
+ if ((euid != (uid_t) -1) && !uid_valid(keuid))
+ return -EINVAL;
+
+ if ((suid != (uid_t) -1) && !uid_valid(ksuid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -774,30 +590,30 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
old = current_cred();
retval = -EPERM;
- if (!nsown_capable(CAP_SETUID)) {
- if (ruid != (uid_t) -1 && ruid != old->uid &&
- ruid != old->euid && ruid != old->suid)
+ if (!ns_capable(old->user_ns, CAP_SETUID)) {
+ if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
goto error;
- if (euid != (uid_t) -1 && euid != old->uid &&
- euid != old->euid && euid != old->suid)
+ if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
goto error;
- if (suid != (uid_t) -1 && suid != old->uid &&
- suid != old->euid && suid != old->suid)
+ if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
goto error;
}
if (ruid != (uid_t) -1) {
- new->uid = ruid;
- if (ruid != old->uid) {
+ new->uid = kruid;
+ if (!uid_eq(kruid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
}
if (euid != (uid_t) -1)
- new->euid = euid;
+ new->euid = keuid;
if (suid != (uid_t) -1)
- new->suid = suid;
+ new->suid = ksuid;
new->fsuid = new->euid;
retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -811,14 +627,19 @@ error:
return retval;
}
-SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
+SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
{
const struct cred *cred = current_cred();
int retval;
+ uid_t ruid, euid, suid;
- if (!(retval = put_user(cred->uid, ruid)) &&
- !(retval = put_user(cred->euid, euid)))
- retval = put_user(cred->suid, suid);
+ ruid = from_kuid_munged(cred->user_ns, cred->uid);
+ euid = from_kuid_munged(cred->user_ns, cred->euid);
+ suid = from_kuid_munged(cred->user_ns, cred->suid);
+
+ if (!(retval = put_user(ruid, ruidp)) &&
+ !(retval = put_user(euid, euidp)))
+ retval = put_user(suid, suidp);
return retval;
}
@@ -828,9 +649,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
*/
SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kgid_t krgid, kegid, ksgid;
+
+ krgid = make_kgid(ns, rgid);
+ kegid = make_kgid(ns, egid);
+ ksgid = make_kgid(ns, sgid);
+
+ if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+ return -EINVAL;
+ if ((egid != (gid_t) -1) && !gid_valid(kegid))
+ return -EINVAL;
+ if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -838,24 +672,24 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
old = current_cred();
retval = -EPERM;
- if (!nsown_capable(CAP_SETGID)) {
- if (rgid != (gid_t) -1 && rgid != old->gid &&
- rgid != old->egid && rgid != old->sgid)
+ if (!ns_capable(old->user_ns, CAP_SETGID)) {
+ if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
goto error;
- if (egid != (gid_t) -1 && egid != old->gid &&
- egid != old->egid && egid != old->sgid)
+ if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
goto error;
- if (sgid != (gid_t) -1 && sgid != old->gid &&
- sgid != old->egid && sgid != old->sgid)
+ if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
goto error;
}
if (rgid != (gid_t) -1)
- new->gid = rgid;
+ new->gid = krgid;
if (egid != (gid_t) -1)
- new->egid = egid;
+ new->egid = kegid;
if (sgid != (gid_t) -1)
- new->sgid = sgid;
+ new->sgid = ksgid;
new->fsgid = new->egid;
return commit_creds(new);
@@ -865,14 +699,19 @@ error:
return retval;
}
-SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
+SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
{
const struct cred *cred = current_cred();
int retval;
+ gid_t rgid, egid, sgid;
+
+ rgid = from_kgid_munged(cred->user_ns, cred->gid);
+ egid = from_kgid_munged(cred->user_ns, cred->egid);
+ sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(cred->gid, rgid)) &&
- !(retval = put_user(cred->egid, egid)))
- retval = put_user(cred->sgid, sgid);
+ if (!(retval = put_user(rgid, rgidp)) &&
+ !(retval = put_user(egid, egidp)))
+ retval = put_user(sgid, sgidp);
return retval;
}
@@ -889,18 +728,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
const struct cred *old;
struct cred *new;
uid_t old_fsuid;
+ kuid_t kuid;
+
+ old = current_cred();
+ old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
+
+ kuid = make_kuid(old->user_ns, uid);
+ if (!uid_valid(kuid))
+ return old_fsuid;
new = prepare_creds();
if (!new)
- return current_fsuid();
- old = current_cred();
- old_fsuid = old->fsuid;
+ return old_fsuid;
- if (uid == old->uid || uid == old->euid ||
- uid == old->suid || uid == old->fsuid ||
- nsown_capable(CAP_SETUID)) {
- if (uid != old_fsuid) {
- new->fsuid = uid;
+ if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
+ uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
+ ns_capable(old->user_ns, CAP_SETUID)) {
+ if (!uid_eq(kuid, old->fsuid)) {
+ new->fsuid = kuid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
goto change_okay;
}
@@ -922,18 +767,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
const struct cred *old;
struct cred *new;
gid_t old_fsgid;
+ kgid_t kgid;
+
+ old = current_cred();
+ old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
+
+ kgid = make_kgid(old->user_ns, gid);
+ if (!gid_valid(kgid))
+ return old_fsgid;
new = prepare_creds();
if (!new)
- return current_fsgid();
- old = current_cred();
- old_fsgid = old->fsgid;
+ return old_fsgid;
- if (gid == old->gid || gid == old->egid ||
- gid == old->sgid || gid == old->fsgid ||
- nsown_capable(CAP_SETGID)) {
- if (gid != old_fsgid) {
- new->fsgid = gid;
+ if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
+ gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
+ ns_capable(old->user_ns, CAP_SETGID)) {
+ if (!gid_eq(kgid, old->fsgid)) {
+ new->fsgid = kgid;
goto change_okay;
}
}
@@ -946,12 +797,73 @@ change_okay:
return old_fsgid;
}
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid. The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+SYSCALL_DEFINE0(getpid)
+{
+ return task_tgid_vnr(current);
+}
+
+/* Thread ID - the internal kernel "pid" */
+SYSCALL_DEFINE0(gettid)
+{
+ return task_pid_vnr(current);
+}
+
+/*
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
+ */
+SYSCALL_DEFINE0(getppid)
+{
+ int pid;
+
+ rcu_read_lock();
+ pid = task_tgid_vnr(rcu_dereference(current->real_parent));
+ rcu_read_unlock();
+
+ return pid;
+}
+
+SYSCALL_DEFINE0(getuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_uid());
+}
+
+SYSCALL_DEFINE0(geteuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_euid());
+}
+
+SYSCALL_DEFINE0(getgid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_gid());
+}
+
+SYSCALL_DEFINE0(getegid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_egid());
+}
+
void do_sys_times(struct tms *tms)
{
cputime_t tgutime, tgstime, cutime, cstime;
spin_lock_irq(&current->sighand->siglock);
- thread_group_times(current, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
spin_unlock_irq(&current->sighand->siglock);
@@ -983,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
* only important on a multi-user system anyway, to make sure one user
* can't send a signal to a process owned by another. -TYT, 12/12/91
*
- * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
- * LBT 04.03.94
+ * !PF_FORKNOEXEC check to conform completely to POSIX.
*/
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
@@ -1020,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
if (task_session(p) != task_session(group_leader))
goto out;
err = -EACCES;
- if (p->did_exec)
+ if (!(p->flags & PF_FORKNOEXEC))
goto out;
} else {
err = -ESRCH;
@@ -1122,6 +1033,17 @@ out:
return retval;
}
+static void set_special_pids(struct pid *pid)
+{
+ struct task_struct *curr = current->group_leader;
+
+ if (task_session(curr) != pid)
+ change_pid(curr, PIDTYPE_SID, pid);
+
+ if (task_pgrp(curr) != pid)
+ change_pid(curr, PIDTYPE_PGID, pid);
+}
+
SYSCALL_DEFINE0(setsid)
{
struct task_struct *group_leader = current->group_leader;
@@ -1141,7 +1063,7 @@ SYSCALL_DEFINE0(setsid)
goto out;
group_leader->signal->leader = 1;
- __set_special_pids(sid);
+ set_special_pids(sid);
proc_clear_tty(group_leader);
@@ -1170,15 +1092,16 @@ DECLARE_RWSEM(uts_sem);
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
*/
-static int override_release(char __user *release, int len)
+static int override_release(char __user *release, size_t len)
{
int ret = 0;
- char buf[65];
if (current->personality & UNAME26) {
- char *rest = UTS_RELEASE;
+ const char *rest = UTS_RELEASE;
+ char buf[65] = { 0 };
int ndots = 0;
unsigned v;
+ size_t copy;
while (*rest) {
if (*rest == '.' && ++ndots >= 3)
@@ -1188,8 +1111,9 @@ static int override_release(char __user *release, int len)
rest++;
}
v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
- snprintf(buf, len, "2.6.%u%s", v, rest);
- ret = copy_to_user(release, buf, len);
+ copy = clamp_t(size_t, len, 1, sizeof(buf));
+ copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
+ ret = copy_to_user(release, buf, copy + 1);
}
return ret;
}
@@ -1286,8 +1210,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
memcpy(u->nodename, tmp, len);
memset(u->nodename + len, 0, sizeof(u->nodename) - len);
errno = 0;
+ uts_proc_notify(UTS_PROC_HOSTNAME);
}
- uts_proc_notify(UTS_PROC_HOSTNAME);
up_write(&uts_sem);
return errno;
}
@@ -1337,8 +1261,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
memcpy(u->domainname, tmp, len);
memset(u->domainname + len, 0, sizeof(u->domainname) - len);
errno = 0;
+ uts_proc_notify(UTS_PROC_DOMAINNAME);
}
- uts_proc_notify(UTS_PROC_DOMAINNAME);
up_write(&uts_sem);
return errno;
}
@@ -1489,15 +1413,14 @@ static int check_prlimit_permission(struct task_struct *task)
return 0;
tcred = __task_cred(task);
- if (cred->user->user_ns == tcred->user->user_ns &&
- (cred->uid == tcred->euid &&
- cred->uid == tcred->suid &&
- cred->uid == tcred->uid &&
- cred->gid == tcred->egid &&
- cred->gid == tcred->sgid &&
- cred->gid == tcred->gid))
+ if (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid))
return 0;
- if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+ if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
return 0;
return -EPERM;
@@ -1608,7 +1531,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
utime = stime = 0;
if (who == RUSAGE_THREAD) {
- task_times(current, &utime, &stime);
+ task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;
@@ -1634,7 +1557,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
break;
case RUSAGE_SELF:
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
utime += tgutime;
stime += tgstime;
r->ru_nvcsw += p->signal->nvcsw;
@@ -1648,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
t = p;
do {
accumulate_thread_rusage(t, r);
- t = next_thread(t);
- } while (t != p);
+ } while_each_thread(p, t);
break;
default:
@@ -1686,84 +1608,125 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
return getrusage(current, who, ru);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+ who != RUSAGE_THREAD)
+ return -EINVAL;
+
+ k_getrusage(current, who, &r);
+ return put_compat_rusage(&r, ru);
+}
+#endif
+
SYSCALL_DEFINE1(umask, int, mask)
{
mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
return mask;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+{
+ struct fd exe;
+ struct inode *inode;
+ int err;
+
+ exe = fdget(fd);
+ if (!exe.file)
+ return -EBADF;
+
+ inode = file_inode(exe.file);
+
+ /*
+ * Because the original mm->exe_file points to executable file, make
+ * sure that this one is executable as well, to avoid breaking an
+ * overall picture.
+ */
+ err = -EACCES;
+ if (!S_ISREG(inode->i_mode) ||
+ exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ goto exit;
+
+ err = inode_permission(inode, MAY_EXEC);
+ if (err)
+ goto exit;
+
+ down_write(&mm->mmap_sem);
+
+ /*
+ * Forbid mm->exe_file change if old file still mapped.
+ */
+ err = -EBUSY;
+ if (mm->exe_file) {
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if (vma->vm_file &&
+ path_equal(&vma->vm_file->f_path,
+ &mm->exe_file->f_path))
+ goto exit_unlock;
+ }
+
+ /*
+ * The symlink can be changed only once, just to disallow arbitrary
+ * transitions malicious software might bring in. This means one
+ * could make a snapshot over all processes running and monitor
+ * /proc/pid/exe changes to notice unusual activity if needed.
+ */
+ err = -EPERM;
+ if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
+ goto exit_unlock;
+
+ err = 0;
+ set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
+exit_unlock:
+ up_write(&mm->mmap_sem);
+
+exit:
+ fdput(exe);
+ return err;
+}
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
unsigned long rlim = rlimit(RLIMIT_DATA);
- unsigned long vm_req_flags;
- unsigned long vm_bad_flags;
- struct vm_area_struct *vma;
- int error = 0;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int error;
- if (arg4 | arg5)
+ if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (addr >= TASK_SIZE)
+ if (opt == PR_SET_MM_EXE_FILE)
+ return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+
+ if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
+ error = -EINVAL;
+
down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
- if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
- /* It must be existing VMA */
- if (!vma || vma->vm_start > addr)
- goto out;
- }
-
- error = -EINVAL;
switch (opt) {
case PR_SET_MM_START_CODE:
+ mm->start_code = addr;
+ break;
case PR_SET_MM_END_CODE:
- vm_req_flags = VM_READ | VM_EXEC;
- vm_bad_flags = VM_WRITE | VM_MAYSHARE;
-
- if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
- (vma->vm_flags & vm_bad_flags))
- goto out;
-
- if (opt == PR_SET_MM_START_CODE)
- mm->start_code = addr;
- else
- mm->end_code = addr;
+ mm->end_code = addr;
break;
-
case PR_SET_MM_START_DATA:
- case PR_SET_MM_END_DATA:
- vm_req_flags = VM_READ | VM_WRITE;
- vm_bad_flags = VM_EXEC | VM_MAYSHARE;
-
- if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
- (vma->vm_flags & vm_bad_flags))
- goto out;
-
- if (opt == PR_SET_MM_START_DATA)
- mm->start_data = addr;
- else
- mm->end_data = addr;
+ mm->start_data = addr;
break;
-
- case PR_SET_MM_START_STACK:
-
-#ifdef CONFIG_STACK_GROWSUP
- vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
-#else
- vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
-#endif
- if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
- goto out;
-
- mm->start_stack = addr;
+ case PR_SET_MM_END_DATA:
+ mm->end_data = addr;
break;
case PR_SET_MM_START_BRK:
@@ -1790,21 +1753,81 @@ static int prctl_set_mm(int opt, unsigned long addr,
mm->brk = addr;
break;
+ /*
+ * If command line arguments and environment
+ * are placed somewhere else on stack, we can
+ * set them up here, ARG_START/END to setup
+ * command line argumets and ENV_START/END
+ * for environment.
+ */
+ case PR_SET_MM_START_STACK:
+ case PR_SET_MM_ARG_START:
+ case PR_SET_MM_ARG_END:
+ case PR_SET_MM_ENV_START:
+ case PR_SET_MM_ENV_END:
+ if (!vma) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (opt == PR_SET_MM_START_STACK)
+ mm->start_stack = addr;
+ else if (opt == PR_SET_MM_ARG_START)
+ mm->arg_start = addr;
+ else if (opt == PR_SET_MM_ARG_END)
+ mm->arg_end = addr;
+ else if (opt == PR_SET_MM_ENV_START)
+ mm->env_start = addr;
+ else if (opt == PR_SET_MM_ENV_END)
+ mm->env_end = addr;
+ break;
+
+ /*
+ * This doesn't move auxiliary vector itself
+ * since it's pinned to mm_struct, but allow
+ * to fill vector with new values. It's up
+ * to a caller to provide sane values here
+ * otherwise user space tools which use this
+ * vector might be unhappy.
+ */
+ case PR_SET_MM_AUXV: {
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (arg4 > sizeof(user_auxv))
+ goto out;
+ up_read(&mm->mmap_sem);
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
+ return -EFAULT;
+
+ /* Make sure the last entry is always AT_NULL */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, arg4);
+ task_unlock(current);
+
+ return 0;
+ }
default:
- error = -EINVAL;
goto out;
}
error = 0;
-
out:
up_read(&mm->mmap_sem);
-
return error;
}
-#else /* CONFIG_CHECKPOINT_RESTORE */
-static int prctl_set_mm(int opt, unsigned long addr,
- unsigned long arg4, unsigned long arg5)
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+ return put_user(me->clear_child_tid, tid_addr);
+}
+#else
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
return -EINVAL;
}
@@ -1823,148 +1846,174 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = 0;
switch (option) {
- case PR_SET_PDEATHSIG:
- if (!valid_signal(arg2)) {
- error = -EINVAL;
- break;
- }
- me->pdeath_signal = arg2;
- error = 0;
- break;
- case PR_GET_PDEATHSIG:
- error = put_user(me->pdeath_signal, (int __user *)arg2);
- break;
- case PR_GET_DUMPABLE:
- error = get_dumpable(me->mm);
- break;
- case PR_SET_DUMPABLE:
- if (arg2 < 0 || arg2 > 1) {
- error = -EINVAL;
- break;
- }
- set_dumpable(me->mm, arg2);
- error = 0;
- break;
-
- case PR_SET_UNALIGN:
- error = SET_UNALIGN_CTL(me, arg2);
- break;
- case PR_GET_UNALIGN:
- error = GET_UNALIGN_CTL(me, arg2);
- break;
- case PR_SET_FPEMU:
- error = SET_FPEMU_CTL(me, arg2);
- break;
- case PR_GET_FPEMU:
- error = GET_FPEMU_CTL(me, arg2);
- break;
- case PR_SET_FPEXC:
- error = SET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_FPEXC:
- error = GET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_TIMING:
- error = PR_TIMING_STATISTICAL;
- break;
- case PR_SET_TIMING:
- if (arg2 != PR_TIMING_STATISTICAL)
- error = -EINVAL;
- else
- error = 0;
- break;
-
- case PR_SET_NAME:
- comm[sizeof(me->comm)-1] = 0;
- if (strncpy_from_user(comm, (char __user *)arg2,
- sizeof(me->comm) - 1) < 0)
- return -EFAULT;
- set_task_comm(me, comm);
- proc_comm_connector(me);
- return 0;
- case PR_GET_NAME:
- get_task_comm(comm, me);
- if (copy_to_user((char __user *)arg2, comm,
- sizeof(comm)))
- return -EFAULT;
- return 0;
- case PR_GET_ENDIAN:
- error = GET_ENDIAN(me, arg2);
+ case PR_SET_PDEATHSIG:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
break;
- case PR_SET_ENDIAN:
- error = SET_ENDIAN(me, arg2);
+ }
+ me->pdeath_signal = arg2;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(me->pdeath_signal, (int __user *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ error = get_dumpable(me->mm);
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ error = -EINVAL;
break;
+ }
+ set_dumpable(me->mm, arg2);
+ break;
- case PR_GET_SECCOMP:
- error = prctl_get_seccomp();
- break;
- case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2);
- break;
- case PR_GET_TSC:
- error = GET_TSC_CTL(arg2);
- break;
- case PR_SET_TSC:
- error = SET_TSC_CTL(arg2);
- break;
- case PR_TASK_PERF_EVENTS_DISABLE:
- error = perf_event_task_disable();
- break;
- case PR_TASK_PERF_EVENTS_ENABLE:
- error = perf_event_task_enable();
- break;
- case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
- break;
- case PR_SET_TIMERSLACK:
- if (arg2 <= 0)
- current->timer_slack_ns =
+ case PR_SET_UNALIGN:
+ error = SET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_GET_UNALIGN:
+ error = GET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_SET_FPEMU:
+ error = SET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_GET_FPEMU:
+ error = GET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_SET_FPEXC:
+ error = SET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_FPEXC:
+ error = GET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_TIMING:
+ error = PR_TIMING_STATISTICAL;
+ break;
+ case PR_SET_TIMING:
+ if (arg2 != PR_TIMING_STATISTICAL)
+ error = -EINVAL;
+ break;
+ case PR_SET_NAME:
+ comm[sizeof(me->comm) - 1] = 0;
+ if (strncpy_from_user(comm, (char __user *)arg2,
+ sizeof(me->comm) - 1) < 0)
+ return -EFAULT;
+ set_task_comm(me, comm);
+ proc_comm_connector(me);
+ break;
+ case PR_GET_NAME:
+ get_task_comm(comm, me);
+ if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
+ return -EFAULT;
+ break;
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(me, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(me, arg2);
+ break;
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+ break;
+ case PR_SET_TSC:
+ error = SET_TSC_CTL(arg2);
+ break;
+ case PR_TASK_PERF_EVENTS_DISABLE:
+ error = perf_event_task_disable();
+ break;
+ case PR_TASK_PERF_EVENTS_ENABLE:
+ error = perf_event_task_enable();
+ break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
current->default_timer_slack_ns;
- else
- current->timer_slack_ns = arg2;
- error = 0;
- break;
- case PR_MCE_KILL:
- if (arg4 | arg5)
- return -EINVAL;
- switch (arg2) {
- case PR_MCE_KILL_CLEAR:
- if (arg3 != 0)
- return -EINVAL;
- current->flags &= ~PF_MCE_PROCESS;
- break;
- case PR_MCE_KILL_SET:
- current->flags |= PF_MCE_PROCESS;
- if (arg3 == PR_MCE_KILL_EARLY)
- current->flags |= PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_LATE)
- current->flags &= ~PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_DEFAULT)
- current->flags &=
- ~(PF_MCE_EARLY|PF_MCE_PROCESS);
- else
- return -EINVAL;
- break;
- default:
+ else
+ current->timer_slack_ns = arg2;
+ break;
+ case PR_MCE_KILL:
+ if (arg4 | arg5)
+ return -EINVAL;
+ switch (arg2) {
+ case PR_MCE_KILL_CLEAR:
+ if (arg3 != 0)
return -EINVAL;
- }
- error = 0;
+ current->flags &= ~PF_MCE_PROCESS;
break;
- case PR_MCE_KILL_GET:
- if (arg2 | arg3 | arg4 | arg5)
- return -EINVAL;
- if (current->flags & PF_MCE_PROCESS)
- error = (current->flags & PF_MCE_EARLY) ?
- PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ case PR_MCE_KILL_SET:
+ current->flags |= PF_MCE_PROCESS;
+ if (arg3 == PR_MCE_KILL_EARLY)
+ current->flags |= PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_LATE)
+ current->flags &= ~PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_DEFAULT)
+ current->flags &=
+ ~(PF_MCE_EARLY|PF_MCE_PROCESS);
else
- error = PR_MCE_KILL_DEFAULT;
- break;
- case PR_SET_MM:
- error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ return -EINVAL;
break;
default:
- error = -EINVAL;
- break;
+ return -EINVAL;
+ }
+ break;
+ case PR_MCE_KILL_GET:
+ if (arg2 | arg3 | arg4 | arg5)
+ return -EINVAL;
+ if (current->flags & PF_MCE_PROCESS)
+ error = (current->flags & PF_MCE_EARLY) ?
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ else
+ error = PR_MCE_KILL_DEFAULT;
+ break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_TID_ADDRESS:
+ error = prctl_get_tid_address(me, (int __user **)arg2);
+ break;
+ case PR_SET_CHILD_SUBREAPER:
+ me->signal->is_child_subreaper = !!arg2;
+ break;
+ case PR_GET_CHILD_SUBREAPER:
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *)arg2);
+ break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ current->no_new_privs = 1;
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return current->no_new_privs ? 1 : 0;
+ case PR_GET_THP_DISABLE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ break;
+ case PR_SET_THP_DISABLE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ down_write(&me->mm->mmap_sem);
+ if (arg2)
+ me->mm->def_flags |= VM_NOHUGEPAGE;
+ else
+ me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ up_write(&me->mm->mmap_sem);
+ break;
+ default:
+ error = -EINVAL;
+ break;
}
return error;
}
@@ -1981,60 +2030,146 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
return err ? -EFAULT : 0;
}
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-
-static void argv_cleanup(struct subprocess_info *info)
-{
- argv_free(info->argv);
-}
-
/**
- * orderly_poweroff - Trigger an orderly system poweroff
- * @force: force poweroff if command execution fails
- *
- * This may be called from any context to trigger a system shutdown.
- * If the orderly shutdown fails, it will force an immediate shutdown.
+ * do_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
*/
-int orderly_poweroff(bool force)
+static int do_sysinfo(struct sysinfo *info)
{
- int argc;
- char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
- static char *envp[] = {
- "HOME=/",
- "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
- NULL
- };
- int ret = -ENOMEM;
- struct subprocess_info *info;
-
- if (argv == NULL) {
- printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
- __func__, poweroff_cmd);
- goto out;
- }
+ unsigned long mem_total, sav_total;
+ unsigned int mem_unit, bitcount;
+ struct timespec tp;
+
+ memset(info, 0, sizeof(struct sysinfo));
+
+ get_monotonic_boottime(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+
+ info->procs = nr_threads;
+
+ si_meminfo(info);
+ si_swapinfo(info);
+
+ /*
+ * If the sum of all the available memory (i.e. ram + swap)
+ * is less than can be stored in a 32 bit unsigned long then
+ * we can be binary compatible with 2.2.x kernels. If not,
+ * well, in that case 2.2.x was broken anyways...
+ *
+ * -Erik Andersen <andersee@debian.org>
+ */
- info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
- if (info == NULL) {
- argv_free(argv);
+ mem_total = info->totalram + info->totalswap;
+ if (mem_total < info->totalram || mem_total < info->totalswap)
goto out;
+ bitcount = 0;
+ mem_unit = info->mem_unit;
+ while (mem_unit > 1) {
+ bitcount++;
+ mem_unit >>= 1;
+ sav_total = mem_total;
+ mem_total <<= 1;
+ if (mem_total < sav_total)
+ goto out;
}
- call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
+ /*
+ * If mem_total did not overflow, multiply all memory values by
+ * info->mem_unit and set it to 1. This leaves things compatible
+ * with 2.2.x, and also retains compatibility with earlier 2.4.x
+ * kernels...
+ */
- ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
+ info->mem_unit = 1;
+ info->totalram <<= bitcount;
+ info->freeram <<= bitcount;
+ info->sharedram <<= bitcount;
+ info->bufferram <<= bitcount;
+ info->totalswap <<= bitcount;
+ info->freeswap <<= bitcount;
+ info->totalhigh <<= bitcount;
+ info->freehigh <<= bitcount;
- out:
- if (ret && force) {
- printk(KERN_WARNING "Failed to start orderly shutdown: "
- "forcing the issue\n");
+out:
+ return 0;
+}
- /* I guess this should try to kick off some daemon to
- sync and poweroff asap. Or not even bother syncing
- if we're doing an emergency shutdown? */
- emergency_sync();
- kernel_power_off();
+SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
+{
+ struct sysinfo val;
+
+ do_sysinfo(&val);
+
+ if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sysinfo {
+ s32 uptime;
+ u32 loads[3];
+ u32 totalram;
+ u32 freeram;
+ u32 sharedram;
+ u32 bufferram;
+ u32 totalswap;
+ u32 freeswap;
+ u16 procs;
+ u16 pad;
+ u32 totalhigh;
+ u32 freehigh;
+ u32 mem_unit;
+ char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
+{
+ struct sysinfo s;
+
+ do_sysinfo(&s);
+
+ /* Check to see if any memory value is too large for 32-bit and scale
+ * down if needed
+ */
+ if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ int bitcount = 0;
+
+ while (s.mem_unit < PAGE_SIZE) {
+ s.mem_unit <<= 1;
+ bitcount++;
+ }
+
+ s.totalram >>= bitcount;
+ s.freeram >>= bitcount;
+ s.sharedram >>= bitcount;
+ s.bufferram >>= bitcount;
+ s.totalswap >>= bitcount;
+ s.freeswap >>= bitcount;
+ s.totalhigh >>= bitcount;
+ s.freehigh >>= bitcount;
}
- return ret;
+ if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+ __put_user(s.uptime, &info->uptime) ||
+ __put_user(s.loads[0], &info->loads[0]) ||
+ __put_user(s.loads[1], &info->loads[1]) ||
+ __put_user(s.loads[2], &info->loads[2]) ||
+ __put_user(s.totalram, &info->totalram) ||
+ __put_user(s.freeram, &info->freeram) ||
+ __put_user(s.sharedram, &info->sharedram) ||
+ __put_user(s.bufferram, &info->bufferram) ||
+ __put_user(s.totalswap, &info->totalswap) ||
+ __put_user(s.freeswap, &info->freeswap) ||
+ __put_user(s.procs, &info->procs) ||
+ __put_user(s.totalhigh, &info->totalhigh) ||
+ __put_user(s.freehigh, &info->freehigh) ||
+ __put_user(s.mem_unit, &info->mem_unit))
+ return -EFAULT;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(orderly_poweroff);
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d..36441b51b5d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -20,11 +20,13 @@ cond_syscall(sys_quotactl);
cond_syscall(sys32_quotactl);
cond_syscall(sys_acct);
cond_syscall(sys_lookup_dcookie);
+cond_syscall(compat_sys_lookup_dcookie);
cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
cond_syscall(sys_init_module);
+cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
@@ -133,6 +135,8 @@ cond_syscall(sys_setresgid16);
cond_syscall(sys_setresuid16);
cond_syscall(sys_setreuid16);
cond_syscall(sys_setuid16);
+cond_syscall(sys_sgetmask);
+cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(sys_ipc);
@@ -144,17 +148,19 @@ cond_syscall(sys_io_destroy);
cond_syscall(sys_io_submit);
cond_syscall(sys_io_cancel);
cond_syscall(sys_io_getevents);
+cond_syscall(sys_sysfs);
cond_syscall(sys_syslog);
cond_syscall(sys_process_vm_readv);
cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
+cond_syscall(sys_uselib);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
cond_syscall(sys_pciconfig_write);
cond_syscall(sys_pciconfig_iobase);
-cond_syscall(sys32_ipc);
+cond_syscall(compat_sys_s390_ipc);
cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
@@ -198,8 +204,12 @@ cond_syscall(sys_perf_event_open);
/* fanotify! */
cond_syscall(sys_fanotify_init);
cond_syscall(sys_fanotify_mark);
+cond_syscall(compat_sys_fanotify_mark);
/* open by handle */
cond_syscall(sys_name_to_handle_at);
cond_syscall(sys_open_by_handle_at);
cond_syscall(compat_sys_open_by_handle_at);
+
+/* compare kernel pointers */
+cond_syscall(sys_kcmp);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05..75b22e22a72 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,12 +23,14 @@
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
+#include <linux/bitmap.h>
#include <linux/signal.h>
#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/kmemcheck.h>
+#include <linux/kmemleak.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -58,6 +60,9 @@
#include <linux/oom.h>
#include <linux/kmod.h>
#include <linux/capability.h>
+#include <linux/binfmts.h>
+#include <linux/sched/sysctl.h>
+#include <linux/kexec.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -67,6 +72,9 @@
#include <asm/stacktrace.h>
#include <asm/io.h>
#endif
+#ifdef CONFIG_SPARC
+#include <asm/setup.h>
+#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
#include <linux/acct.h>
#endif
@@ -88,17 +96,15 @@
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
-extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
extern int max_threads;
-extern int core_uses_pid;
extern int suid_dumpable;
+#ifdef CONFIG_COREDUMP
+extern int core_uses_pid;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
+#endif
extern int pid_max;
-extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
-extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
extern int latencytop_enabled;
@@ -106,20 +112,18 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
extern int sysctl_nr_trim_pages;
#endif
-#ifdef CONFIG_BLOCK
-extern int blk_iopoll_enabled;
-#endif
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
-static int neg_one = -1;
#endif
+static int __maybe_unused neg_one = -1;
+
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
-static int __maybe_unused three = 3;
+static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
@@ -132,33 +136,45 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
+/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+#ifdef CONFIG_DETECT_HUNG_TASK
+static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
+#endif
+
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
#ifdef CONFIG_SPARC
-#include <asm/system.h>
-#endif
-
-#ifdef CONFIG_SPARC64
-extern int sysctl_tsb_ratio;
#endif
#ifdef __hppa__
extern int pwrsw_enabled;
+#endif
+
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
extern int unaligned_enabled;
#endif
#ifdef CONFIG_IA64
-extern int no_unaligned_warning;
extern int unaligned_dump_stack;
#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+extern int no_unaligned_warning;
+#endif
+
#ifdef CONFIG_PROC_SYSCTL
+
+#define SYSCTL_WRITES_LEGACY -1
+#define SYSCTL_WRITES_WARN 0
+#define SYSCTL_WRITES_STRICT 1
+
+static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
+
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
static int proc_taint(struct ctl_table *table, int write,
@@ -166,15 +182,22 @@ static int proc_taint(struct ctl_table *table, int write,
#endif
#ifdef CONFIG_PRINTK
-static int proc_dmesg_restrict(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COREDUMP
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
-static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
+static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
-static int sysrq_sysctl_handler(ctl_table *table, int write,
+static int sysrq_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -192,20 +215,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
#endif
-static struct ctl_table root_table[];
-static struct ctl_table_root sysctl_table_root;
-static struct ctl_table_header root_table_header = {
- {{.count = 1,
- .ctl_table = root_table,
- .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
- .root = &sysctl_table_root,
- .set = &sysctl_table_root.default_set,
-};
-static struct ctl_table_root sysctl_table_root = {
- .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
- .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
-};
-
static struct ctl_table kern_table[];
static struct ctl_table vm_table[];
static struct ctl_table fs_table[];
@@ -222,7 +231,7 @@ int sysctl_legacy_va_layout;
/* The default sysctl tables: */
-static struct ctl_table root_table[] = {
+static struct ctl_table sysctl_base_table[] = {
{
.procname = "kernel",
.mode = 0555,
@@ -256,9 +265,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
@@ -301,6 +312,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
+#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
@@ -311,7 +323,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_tunable_scaling,
},
{
- .procname = "sched_migration_cost",
+ .procname = "sched_migration_cost_ns",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
@@ -325,14 +337,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_time_avg",
+ .procname = "sched_time_avg_ms",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_shares_window",
+ .procname = "sched_shares_window_ns",
.data = &sysctl_sched_shares_window,
.maxlen = sizeof(unsigned int),
.mode = 0644,
@@ -347,7 +359,47 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_scan_delay_ms",
+ .data = &sysctl_numa_balancing_scan_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_min_ms",
+ .data = &sysctl_numa_balancing_scan_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_max_ms",
+ .data = &sysctl_numa_balancing_scan_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_size_mb",
+ .data = &sysctl_numa_balancing_scan_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -362,6 +414,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -408,6 +467,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_COREDUMP
{
.procname = "core_uses_pid",
.data = &core_uses_pid,
@@ -420,7 +480,7 @@ static struct ctl_table kern_table[] = {
.data = core_pattern,
.maxlen = CORENAME_MAX_SIZE,
.mode = 0644,
- .proc_handler = proc_dostring,
+ .proc_handler = proc_dostring_coredump,
},
{
.procname = "core_pipe_limit",
@@ -429,6 +489,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -436,6 +497,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_taint,
},
+ {
+ .procname = "sysctl_writes_strict",
+ .data = &sysctl_writes_strict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
+ .extra2 = &one,
+ },
#endif
#ifdef CONFIG_LATENCYTOP
{
@@ -502,6 +572,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
{
.procname = "unaligned-trap",
.data = &unaligned_enabled,
@@ -543,6 +615,25 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "traceoff_on_warning",
+ .data = &__disable_trace_on_warning,
+ .maxlen = sizeof(__disable_trace_on_warning),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_KEXEC
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
#endif
#ifdef CONFIG_MODULES
{
@@ -563,7 +654,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_UEVENT_HELPER
{
.procname = "hotplug",
.data = &uevent_helper,
@@ -713,7 +804,7 @@ static struct ctl_table kern_table[] = {
.data = &dmesg_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
.extra1 = &zero,
.extra2 = &one,
},
@@ -722,7 +813,7 @@ static struct ctl_table kern_table[] = {
.data = &kptr_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dmesg_restrict,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
.extra1 = &zero,
.extra2 = &two,
},
@@ -744,7 +835,7 @@ static struct ctl_table kern_table[] = {
#if defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "watchdog",
- .data = &watchdog_enabled,
+ .data = &watchdog_user_enabled,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = proc_dowatchdog,
@@ -757,7 +848,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dowatchdog,
- .extra1 = &neg_one,
+ .extra1 = &zero,
.extra2 = &sixty,
},
{
@@ -769,9 +860,20 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_SMP
+ {
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SMP */
{
.procname = "nmi_watchdog",
- .data = &watchdog_enabled,
+ .data = &watchdog_user_enabled,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = proc_dowatchdog,
@@ -868,7 +970,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
#endif
-#ifdef CONFIG_IA64
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
{
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
@@ -876,6 +978,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#ifdef CONFIG_IA64
{
.procname = "unaligned-dump-stack",
.data = &unaligned_dump_stack,
@@ -897,9 +1001,10 @@ static struct ctl_table kern_table[] = {
{
.procname = "hung_task_check_count",
.data = &sysctl_hung_task_check_count,
- .maxlen = sizeof(unsigned long),
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "hung_task_timeout_secs",
@@ -907,13 +1012,15 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = &hung_task_timeout_max,
},
{
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
- .maxlen = sizeof(unsigned long),
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
},
#endif
#ifdef CONFIG_COMPAT
@@ -984,21 +1091,22 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
.proc_handler = perf_proc_update_handler,
+ .extra1 = &one,
},
-#endif
-#ifdef CONFIG_KMEMCHECK
{
- .procname = "kmemcheck",
- .data = &kmemcheck_enabled,
- .maxlen = sizeof(int),
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
#endif
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_KMEMCHECK
{
- .procname = "blk_iopoll",
- .data = &blk_iopoll_enabled,
+ .procname = "kmemcheck",
+ .data = &kmemcheck_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -1045,7 +1153,14 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
},
{
.procname = "page-cluster",
@@ -1105,11 +1220,9 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
- .procname = "nr_pdflush_threads",
- .data = &nr_pdflush_threads,
- .maxlen = sizeof nr_pdflush_threads,
- .mode = 0444 /* read-only*/,
- .proc_handler = proc_dointvec,
+ .procname = "nr_pdflush_threads",
+ .mode = 0444 /* read-only */,
+ .proc_handler = pdflush_proc_obsolete,
},
{
.procname = "swappiness",
@@ -1153,7 +1266,7 @@ static struct ctl_table vm_table[] = {
.data = &hugepages_treat_as_movable,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = hugetlb_treat_movable_handler,
+ .proc_handler = proc_dointvec,
},
{
.procname = "nr_overcommit_hugepages",
@@ -1179,7 +1292,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = &one,
- .extra2 = &three,
+ .extra2 = &four,
},
#ifdef CONFIG_COMPACTION
{
@@ -1214,7 +1327,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
@@ -1327,8 +1440,13 @@ static struct ctl_table vm_table[] = {
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
.procname = "vdso_enabled",
+#ifdef CONFIG_X86_32
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(vdso32_enabled),
+#else
.data = &vdso_enabled,
.maxlen = sizeof(vdso_enabled),
+#endif
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
@@ -1372,6 +1490,20 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
@@ -1385,14 +1517,14 @@ static struct ctl_table fs_table[] = {
{
.procname = "inode-nr",
.data = &inodes_stat,
- .maxlen = 2*sizeof(int),
+ .maxlen = 2*sizeof(long),
.mode = 0444,
.proc_handler = proc_nr_inodes,
},
{
.procname = "inode-state",
.data = &inodes_stat,
- .maxlen = 7*sizeof(int),
+ .maxlen = 7*sizeof(long),
.mode = 0444,
.proc_handler = proc_nr_inodes,
},
@@ -1422,7 +1554,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "dentry-state",
.data = &dentry_stat,
- .maxlen = 6*sizeof(int),
+ .maxlen = 6*sizeof(long),
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
@@ -1504,11 +1636,29 @@ static struct ctl_table fs_table[] = {
#endif
#endif
{
+ .procname = "protected_symlinks",
+ .data = &sysctl_protected_symlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "protected_hardlinks",
+ .data = &sysctl_protected_hardlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "suid_dumpable",
.data = &suid_dumpable,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dointvec_minmax_coredump,
.extra1 = &zero,
.extra2 = &two,
},
@@ -1531,8 +1681,7 @@ static struct ctl_table fs_table[] = {
};
static struct ctl_table debug_table[] = {
-#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390) || defined(CONFIG_TILE)
+#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
@@ -1559,490 +1708,15 @@ static struct ctl_table dev_table[] = {
{ }
};
-static DEFINE_SPINLOCK(sysctl_lock);
-
-/* called under sysctl_lock */
-static int use_table(struct ctl_table_header *p)
-{
- if (unlikely(p->unregistering))
- return 0;
- p->used++;
- return 1;
-}
-
-/* called under sysctl_lock */
-static void unuse_table(struct ctl_table_header *p)
-{
- if (!--p->used)
- if (unlikely(p->unregistering))
- complete(p->unregistering);
-}
-
-/* called under sysctl_lock, will reacquire if has to wait */
-static void start_unregistering(struct ctl_table_header *p)
-{
- /*
- * if p->used is 0, nobody will ever touch that entry again;
- * we'll eliminate all paths to it before dropping sysctl_lock
- */
- if (unlikely(p->used)) {
- struct completion wait;
- init_completion(&wait);
- p->unregistering = &wait;
- spin_unlock(&sysctl_lock);
- wait_for_completion(&wait);
- spin_lock(&sysctl_lock);
- } else {
- /* anything non-NULL; we'll never dereference it */
- p->unregistering = ERR_PTR(-EINVAL);
- }
- /*
- * do not remove from the list until nobody holds it; walking the
- * list in do_sysctl() relies on that.
- */
- list_del_init(&p->ctl_entry);
-}
-
-void sysctl_head_get(struct ctl_table_header *head)
-{
- spin_lock(&sysctl_lock);
- head->count++;
- spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
+int __init sysctl_init(void)
{
- spin_lock(&sysctl_lock);
- if (!--head->count)
- kfree_rcu(head, rcu);
- spin_unlock(&sysctl_lock);
-}
+ struct ctl_table_header *hdr;
-struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
-{
- if (!head)
- BUG();
- spin_lock(&sysctl_lock);
- if (!use_table(head))
- head = ERR_PTR(-ENOENT);
- spin_unlock(&sysctl_lock);
- return head;
-}
-
-void sysctl_head_finish(struct ctl_table_header *head)
-{
- if (!head)
- return;
- spin_lock(&sysctl_lock);
- unuse_table(head);
- spin_unlock(&sysctl_lock);
-}
-
-static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
-{
- struct ctl_table_set *set = &root->default_set;
- if (root->lookup)
- set = root->lookup(root, namespaces);
- return set;
-}
-
-static struct list_head *
-lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
-{
- struct ctl_table_set *set = lookup_header_set(root, namespaces);
- return &set->list;
-}
-
-struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
- struct ctl_table_header *prev)
-{
- struct ctl_table_root *root;
- struct list_head *header_list;
- struct ctl_table_header *head;
- struct list_head *tmp;
-
- spin_lock(&sysctl_lock);
- if (prev) {
- head = prev;
- tmp = &prev->ctl_entry;
- unuse_table(prev);
- goto next;
- }
- tmp = &root_table_header.ctl_entry;
- for (;;) {
- head = list_entry(tmp, struct ctl_table_header, ctl_entry);
-
- if (!use_table(head))
- goto next;
- spin_unlock(&sysctl_lock);
- return head;
- next:
- root = head->root;
- tmp = tmp->next;
- header_list = lookup_header_list(root, namespaces);
- if (tmp != header_list)
- continue;
-
- do {
- root = list_entry(root->root_list.next,
- struct ctl_table_root, root_list);
- if (root == &sysctl_table_root)
- goto out;
- header_list = lookup_header_list(root, namespaces);
- } while (list_empty(header_list));
- tmp = header_list->next;
- }
-out:
- spin_unlock(&sysctl_lock);
- return NULL;
-}
-
-struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
-{
- return __sysctl_head_next(current->nsproxy, prev);
-}
-
-void register_sysctl_root(struct ctl_table_root *root)
-{
- spin_lock(&sysctl_lock);
- list_add_tail(&root->root_list, &sysctl_table_root.root_list);
- spin_unlock(&sysctl_lock);
-}
-
-/*
- * sysctl_perm does NOT grant the superuser all rights automatically, because
- * some sysctl variables are readonly even to root.
- */
-
-static int test_perm(int mode, int op)
-{
- if (!current_euid())
- mode >>= 6;
- else if (in_egroup_p(0))
- mode >>= 3;
- if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
- return 0;
- return -EACCES;
-}
-
-int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
-{
- int mode;
-
- if (root->permissions)
- mode = root->permissions(root, current->nsproxy, table);
- else
- mode = table->mode;
-
- return test_perm(mode, op);
-}
-
-static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
-{
- for (; table->procname; table++) {
- table->parent = parent;
- if (table->child)
- sysctl_set_parent(table, table->child);
- }
-}
-
-static __init int sysctl_init(void)
-{
- sysctl_set_parent(NULL, root_table);
-#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
- sysctl_check_table(current->nsproxy, root_table);
-#endif
+ hdr = register_sysctl_table(sysctl_base_table);
+ kmemleak_not_leak(hdr);
return 0;
}
-core_initcall(sysctl_init);
-
-static struct ctl_table *is_branch_in(struct ctl_table *branch,
- struct ctl_table *table)
-{
- struct ctl_table *p;
- const char *s = branch->procname;
-
- /* branch should have named subdirectory as its first element */
- if (!s || !branch->child)
- return NULL;
-
- /* ... and nothing else */
- if (branch[1].procname)
- return NULL;
-
- /* table should contain subdirectory with the same name */
- for (p = table; p->procname; p++) {
- if (!p->child)
- continue;
- if (p->procname && strcmp(p->procname, s) == 0)
- return p;
- }
- return NULL;
-}
-
-/* see if attaching q to p would be an improvement */
-static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
-{
- struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
- struct ctl_table *next;
- int is_better = 0;
- int not_in_parent = !p->attached_by;
-
- while ((next = is_branch_in(by, to)) != NULL) {
- if (by == q->attached_by)
- is_better = 1;
- if (to == p->attached_by)
- not_in_parent = 1;
- by = by->child;
- to = next->child;
- }
-
- if (is_better && not_in_parent) {
- q->attached_by = by;
- q->attached_to = to;
- q->parent = p;
- }
-}
-
-/**
- * __register_sysctl_paths - register a sysctl hierarchy
- * @root: List of sysctl headers to register on
- * @namespaces: Data to compute which lists of sysctl entries are visible
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * The members of the &struct ctl_table structure are used as follows:
- *
- * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
- * enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file, and for sysctl(2)
- *
- * child - a pointer to the child sysctl table if this entry is a directory, or
- * %NULL.
- *
- * proc_handler - the text handler routine (described below)
- *
- * de - for internal use by the sysctl routines
- *
- * extra1, extra2 - extra pointers usable by the proc handler routines
- *
- * Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
- *
- * sysctl(2) can automatically manage read and write requests through
- * the sysctl table. The data and maxlen fields of the ctl_table
- * struct enable minimal validation of the values being written to be
- * performed, and the mode field allows minimal authentication.
- *
- * There must be a proc_handler routine for any terminal nodes
- * mirrored under /proc/sys (non-terminals are handled by a built-in
- * directory handler). Several default handlers are available to
- * cover common cases -
- *
- * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
- * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
- * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
- *
- * It is the handler's job to read the input buffer from user memory
- * and process it. The handler should return 0 on success.
- *
- * This routine returns %NULL on a failure to register, and a pointer
- * to the table header on success.
- */
-struct ctl_table_header *__register_sysctl_paths(
- struct ctl_table_root *root,
- struct nsproxy *namespaces,
- const struct ctl_path *path, struct ctl_table *table)
-{
- struct ctl_table_header *header;
- struct ctl_table *new, **prevp;
- unsigned int n, npath;
- struct ctl_table_set *set;
-
- /* Count the path components */
- for (npath = 0; path[npath].procname; ++npath)
- ;
-
- /*
- * For each path component, allocate a 2-element ctl_table array.
- * The first array element will be filled with the sysctl entry
- * for this, the second will be the sentinel (procname == 0).
- *
- * We allocate everything in one go so that we don't have to
- * worry about freeing additional memory in unregister_sysctl_table.
- */
- header = kzalloc(sizeof(struct ctl_table_header) +
- (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
- if (!header)
- return NULL;
-
- new = (struct ctl_table *) (header + 1);
-
- /* Now connect the dots */
- prevp = &header->ctl_table;
- for (n = 0; n < npath; ++n, ++path) {
- /* Copy the procname */
- new->procname = path->procname;
- new->mode = 0555;
-
- *prevp = new;
- prevp = &new->child;
-
- new += 2;
- }
- *prevp = table;
- header->ctl_table_arg = table;
-
- INIT_LIST_HEAD(&header->ctl_entry);
- header->used = 0;
- header->unregistering = NULL;
- header->root = root;
- sysctl_set_parent(NULL, header->ctl_table);
- header->count = 1;
-#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
- if (sysctl_check_table(namespaces, header->ctl_table)) {
- kfree(header);
- return NULL;
- }
-#endif
- spin_lock(&sysctl_lock);
- header->set = lookup_header_set(root, namespaces);
- header->attached_by = header->ctl_table;
- header->attached_to = root_table;
- header->parent = &root_table_header;
- for (set = header->set; set; set = set->parent) {
- struct ctl_table_header *p;
- list_for_each_entry(p, &set->list, ctl_entry) {
- if (p->unregistering)
- continue;
- try_attach(p, header);
- }
- }
- header->parent->count++;
- list_add_tail(&header->ctl_entry, &header->set->list);
- spin_unlock(&sysctl_lock);
-
- return header;
-}
-
-/**
- * register_sysctl_table_path - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
- path, table);
-}
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
- static const struct ctl_path null_path[] = { {} };
-
- return register_sysctl_paths(null_path, table);
-}
-
-/**
- * unregister_sysctl_table - unregister a sysctl table hierarchy
- * @header: the header returned from register_sysctl_table
- *
- * Unregisters the sysctl table and all children. proc entries may not
- * actually be removed until they are no longer used by anyone.
- */
-void unregister_sysctl_table(struct ctl_table_header * header)
-{
- might_sleep();
-
- if (header == NULL)
- return;
-
- spin_lock(&sysctl_lock);
- start_unregistering(header);
- if (!--header->parent->count) {
- WARN_ON(1);
- kfree_rcu(header->parent, rcu);
- }
- if (!--header->count)
- kfree_rcu(header, rcu);
- spin_unlock(&sysctl_lock);
-}
-
-int sysctl_is_seen(struct ctl_table_header *p)
-{
- struct ctl_table_set *set = p->set;
- int res;
- spin_lock(&sysctl_lock);
- if (p->unregistering)
- res = 0;
- else if (!set->is_seen)
- res = 1;
- else
- res = set->is_seen(set);
- spin_unlock(&sysctl_lock);
- return res;
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
- struct ctl_table_set *parent,
- int (*is_seen)(struct ctl_table_set *))
-{
- INIT_LIST_HEAD(&p->list);
- p->parent = parent ? parent : &sysctl_table_root.default_set;
- p->is_seen = is_seen;
-}
-
-#else /* !CONFIG_SYSCTL */
-struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-{
- return NULL;
-}
-
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
- struct ctl_table_set *parent,
- int (*is_seen)(struct ctl_table_set *))
-{
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-}
-
#endif /* CONFIG_SYSCTL */
/*
@@ -2051,8 +1725,8 @@ void sysctl_head_put(struct ctl_table_header *head)
#ifdef CONFIG_PROC_SYSCTL
-static int _proc_do_string(void* data, int maxlen, int write,
- void __user *buffer,
+static int _proc_do_string(char *data, int maxlen, int write,
+ char __user *buffer,
size_t *lenp, loff_t *ppos)
{
size_t len;
@@ -2065,21 +1739,30 @@ static int _proc_do_string(void* data, int maxlen, int write,
}
if (write) {
- len = 0;
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
p = buffer;
- while (len < *lenp) {
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
if (get_user(c, p++))
return -EFAULT;
if (c == 0 || c == '\n')
break;
- len++;
+ data[len++] = c;
}
- if (len >= maxlen)
- len = maxlen-1;
- if(copy_from_user(data, buffer, len))
- return -EFAULT;
- ((char *) data)[len] = 0;
- *ppos += *lenp;
+ data[len] = 0;
} else {
len = strlen(data);
if (len > maxlen)
@@ -2096,10 +1779,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
if (len > *lenp)
len = *lenp;
if (len)
- if(copy_to_user(buffer, data, len))
+ if (copy_to_user(buffer, data, len))
return -EFAULT;
if (len < *lenp) {
- if(put_user('\n', ((char __user *) buffer) + len))
+ if (put_user('\n', buffer + len))
return -EFAULT;
len++;
}
@@ -2109,6 +1792,14 @@ static int _proc_do_string(void* data, int maxlen, int write,
return 0;
}
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
/**
* proc_dostring - read a string sysctl
* @table: the sysctl table
@@ -2129,8 +1820,11 @@ static int _proc_do_string(void* data, int maxlen, int write,
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return _proc_do_string(table->data, table->maxlen, write,
- buffer, lenp, ppos);
+ if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
+ warn_sysctl_write(table);
+
+ return _proc_do_string((char *)(table->data), table->maxlen, write,
+ (char __user *)buffer, lenp, ppos);
}
static size_t proc_skip_spaces(char **buf)
@@ -2304,6 +1998,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
conv = do_proc_dointvec_conv;
if (write) {
+ if (*ppos) {
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ goto out;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ break;
+ default:
+ break;
+ }
+ }
+
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
page = __get_free_page(GFP_TEMPORARY);
@@ -2361,6 +2067,7 @@ free:
return err ? : -EINVAL;
}
*lenp -= left;
+out:
*ppos += *lenp;
return err;
}
@@ -2423,7 +2130,7 @@ static int proc_taint(struct ctl_table *table, int write,
int i;
for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
if ((tmptaint >> i) & 1)
- add_taint(i);
+ add_taint(i, LOCKDEP_STILL_OK);
}
}
@@ -2431,7 +2138,7 @@ static int proc_taint(struct ctl_table *table, int write,
}
#ifdef CONFIG_PRINTK
-static int proc_dmesg_restrict(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
if (write && !capable(CAP_SYS_ADMIN))
@@ -2497,6 +2204,38 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
do_proc_dointvec_minmax_conv, &param);
}
+static void validate_coredump_safety(void)
+{
+#ifdef CONFIG_COREDUMP
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING "Unsafe core_pattern used with "\
+ "suid_dumpable=2. Pipe handler or fully qualified "\
+ "core dump path required.\n");
+ }
+#endif
+}
+
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
+#ifdef CONFIG_COREDUMP
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+#endif
+
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos,
@@ -2521,6 +2260,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
left = *lenp;
if (write) {
+ if (*ppos) {
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ goto out;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ break;
+ default:
+ break;
+ }
+ }
+
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
page = __get_free_page(GFP_TEMPORARY);
@@ -2554,8 +2305,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
*i = val;
} else {
val = convdiv * (*i) / convmul;
- if (!first)
+ if (!first) {
err = proc_put_char(&buffer, &left, '\t');
+ if (err)
+ break;
+ }
err = proc_put_long(&buffer, &left, val, false);
if (err)
break;
@@ -2573,6 +2327,7 @@ free:
return err ? : -EINVAL;
}
*lenp -= left;
+out:
*ppos += *lenp;
return err;
}
@@ -2686,7 +2441,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+ unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+
+ if (jif > INT_MAX)
+ return 1;
+ *valp = (int)jif;
} else {
int val = *valp;
unsigned long lval;
@@ -2815,11 +2574,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = (unsigned long *) table->data;
+ unsigned long *bitmap = *(unsigned long **) table->data;
unsigned long *tmp_bitmap = NULL;
char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
- if (!bitmap_len || !left || (*ppos && !write)) {
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
*lenp = 0;
return 0;
}
@@ -2884,9 +2643,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
}
- while (val_a <= val_b)
- set_bit(val_a++, tmp_bitmap);
-
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
first = 0;
proc_skip_char(&kbuf, &left, '\n');
}
@@ -2929,8 +2686,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
if (*ppos)
bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
else
- memcpy(bitmap, tmp_bitmap,
- BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
}
kfree(tmp_bitmap);
*lenp -= left;
@@ -3008,6 +2764,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
-EXPORT_SYMBOL(register_sysctl_table);
-EXPORT_SYMBOL(register_sysctl_paths);
-EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a..653cbbd9e7a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -3,7 +3,6 @@
#include "../fs/xfs/xfs_sysctl.h"
#include <linux/sunrpc/debug.h>
#include <linux/string.h>
-#include <net/ip_vs.h>
#include <linux/syscalls.h>
#include <linux/namei.h>
#include <linux/mount.h>
@@ -15,6 +14,7 @@
#include <linux/netdevice.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/compat.h>
#ifdef CONFIG_SYSCTL_SYSCALL
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
{ CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
/* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
/* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
- { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
+ /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
{ CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
/* VM_PAGEBUF unused */
/* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
{ CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
{ CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
- { CTL_INT, NET_TCP_ABC, "tcp_abc" },
{ CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
{ CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
{ CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
static ssize_t bin_intvec(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t copied = 0;
char *buffer;
ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
if (oldval && oldlen) {
unsigned __user *vec = oldval;
size_t length = oldlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buffer, BUFSZ - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
if (result < 0)
goto out_kfree;
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
if (newval && newlen) {
unsigned __user *vec = newval;
size_t length = newlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
@@ -1030,12 +1024,10 @@ static ssize_t bin_intvec(struct file *file,
if (get_user(value, vec + i))
goto out_kfree;
- str += snprintf(str, end - str, "%lu\t", value);
+ str += scnprintf(str, end - str, "%lu\t", value);
}
- set_fs(KERNEL_DS);
- result = vfs_write(file, buffer, str - buffer, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buffer, str - buffer, 0);
if (result < 0)
goto out_kfree;
}
@@ -1049,7 +1041,6 @@ out:
static ssize_t bin_ulongvec(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t copied = 0;
char *buffer;
ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
if (oldval && oldlen) {
unsigned long __user *vec = oldval;
size_t length = oldlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buffer, BUFSZ - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
if (result < 0)
goto out_kfree;
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
if (newval && newlen) {
unsigned long __user *vec = newval;
size_t length = newlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
@@ -1108,12 +1095,10 @@ static ssize_t bin_ulongvec(struct file *file,
if (get_user(value, vec + i))
goto out_kfree;
- str += snprintf(str, end - str, "%lu\t", value);
+ str += scnprintf(str, end - str, "%lu\t", value);
}
- set_fs(KERNEL_DS);
- result = vfs_write(file, buffer, str - buffer, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buffer, str - buffer, 0);
if (result < 0)
goto out_kfree;
}
@@ -1127,19 +1112,15 @@ out:
static ssize_t bin_uuid(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t result, copied = 0;
/* Only supports reads */
if (oldval && oldlen) {
- loff_t pos = 0;
char buf[40], *str = buf;
unsigned char uuid[16];
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
goto out;
@@ -1175,18 +1156,14 @@ out:
static ssize_t bin_dn_node_address(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t result, copied = 0;
if (oldval && oldlen) {
- loff_t pos = 0;
char buf[15], *nodep;
unsigned long area, node;
__le16 dnaddr;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
goto out;
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
/* Convert the decnet address to binary */
result = -EIO;
- nodep = strchr(buf, '.') + 1;
+ nodep = strchr(buf, '.');
if (!nodep)
goto out;
+ ++nodep;
area = simple_strtoul(buf, NULL, 10);
node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
}
if (newval && newlen) {
- loff_t pos = 0;
__le16 dnaddr;
char buf[15];
int len;
@@ -1228,13 +1205,11 @@ static ssize_t bin_dn_node_address(struct file *file,
if (get_user(dnaddr, (__le16 __user *)newval))
goto out;
- len = snprintf(buf, sizeof(buf), "%hu.%hu",
+ len = scnprintf(buf, sizeof(buf), "%hu.%hu",
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
- set_fs(KERNEL_DS);
- result = vfs_write(file, buf, len, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buf, len, 0);
if (result < 0)
goto out;
}
@@ -1344,7 +1319,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
goto out_putname;
}
- mnt = current->nsproxy->pid_ns->proc_mnt;
+ mnt = task_active_pid_ns(current)->proc_mnt;
file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
result = PTR_ERR(file);
if (IS_ERR(file))
@@ -1472,7 +1447,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
#ifdef CONFIG_COMPAT
-#include <asm/compat.h>
struct compat_sysctl_args {
compat_uptr_t name;
@@ -1484,7 +1458,7 @@ struct compat_sysctl_args {
compat_ulong_t __unused[4];
};
-asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
+COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
{
struct compat_sysctl_args tmp;
compat_size_t __user *compat_oldlenp;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813..00000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <net/ip_vs.h>
-
-
-static int sysctl_depth(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth;
-
- depth = 0;
- for (tmp = table; tmp->parent; tmp = tmp->parent)
- depth++;
-
- return depth;
-}
-
-static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
-{
- int i;
-
- for (i = 0; table && i < n; i++)
- table = table->parent;
-
- return table;
-}
-
-
-static void sysctl_print_path(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth, i;
- depth = sysctl_depth(table);
- if (table->procname) {
- for (i = depth; i >= 0; i--) {
- tmp = sysctl_parent(table, i);
- printk("/%s", tmp->procname?tmp->procname:"");
- }
- }
- printk(" ");
-}
-
-static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
- struct ctl_table *table)
-{
- struct ctl_table_header *head;
- struct ctl_table *ref, *test;
- int depth, cur_depth;
-
- depth = sysctl_depth(table);
-
- for (head = __sysctl_head_next(namespaces, NULL); head;
- head = __sysctl_head_next(namespaces, head)) {
- cur_depth = depth;
- ref = head->ctl_table;
-repeat:
- test = sysctl_parent(table, cur_depth);
- for (; ref->procname; ref++) {
- int match = 0;
- if (cur_depth && !ref->child)
- continue;
-
- if (test->procname && ref->procname &&
- (strcmp(test->procname, ref->procname) == 0))
- match++;
-
- if (match) {
- if (cur_depth != 0) {
- cur_depth--;
- ref = ref->child;
- goto repeat;
- }
- goto out;
- }
- }
- }
- ref = NULL;
-out:
- sysctl_head_finish(head);
- return ref;
-}
-
-static void set_fail(const char **fail, struct ctl_table *table, const char *str)
-{
- if (*fail) {
- printk(KERN_ERR "sysctl table check failed: ");
- sysctl_print_path(table);
- printk(" %s\n", *fail);
- dump_stack();
- }
- *fail = str;
-}
-
-static void sysctl_check_leaf(struct nsproxy *namespaces,
- struct ctl_table *table, const char **fail)
-{
- struct ctl_table *ref;
-
- ref = sysctl_check_lookup(namespaces, table);
- if (ref && (ref != table))
- set_fail(fail, table, "Sysctl already exists");
-}
-
-int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
-{
- int error = 0;
- for (; table->procname; table++) {
- const char *fail = NULL;
-
- if (table->parent) {
- if (!table->parent->procname)
- set_fail(&fail, table, "Parent without procname");
- }
- if (table->child) {
- if (table->data)
- set_fail(&fail, table, "Directory with data?");
- if (table->maxlen)
- set_fail(&fail, table, "Directory with maxlen?");
- if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
- set_fail(&fail, table, "Writable sysctl directory");
- if (table->proc_handler)
- set_fail(&fail, table, "Directory with proc_handler");
- if (table->extra1)
- set_fail(&fail, table, "Directory with extra1");
- if (table->extra2)
- set_fail(&fail, table, "Directory with extra2");
- } else {
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- set_fail(&fail, table, "No data");
- if (!table->maxlen)
- set_fail(&fail, table, "No maxlen");
- }
-#ifdef CONFIG_PROC_SYSCTL
- if (!table->proc_handler)
- set_fail(&fail, table, "No proc_handler");
-#endif
- sysctl_check_leaf(namespaces, table, &fail);
- }
- if (table->mode > 0777)
- set_fail(&fail, table, "bogus .mode");
- if (fail) {
- set_fail(&fail, table, NULL);
- error = -EINVAL;
- }
- if (table->child)
- error |= sysctl_check_table(namespaces, table->child);
- }
- return error;
-}
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
new file mode 100644
index 00000000000..3e9868d4753
--- /dev/null
+++ b/kernel/system_certificates.S
@@ -0,0 +1,20 @@
+#include <linux/export.h>
+#include <linux/init.h>
+
+ __INITRODATA
+
+ .align 8
+ .globl VMLINUX_SYMBOL(system_certificate_list)
+VMLINUX_SYMBOL(system_certificate_list):
+__cert_list_start:
+ .incbin "kernel/x509_certificate_list"
+__cert_list_end:
+
+ .align 8
+ .globl VMLINUX_SYMBOL(system_certificate_list_size)
+VMLINUX_SYMBOL(system_certificate_list_size):
+#ifdef CONFIG_64BIT
+ .quad __cert_list_end - __cert_list_start
+#else
+ .long __cert_list_end - __cert_list_start
+#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
new file mode 100644
index 00000000000..52ebc70263f
--- /dev/null
+++ b/kernel/system_keyring.c
@@ -0,0 +1,105 @@
+/* System trusted keyring for trusted public keys
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+#include "module-internal.h"
+
+struct key *system_trusted_keyring;
+EXPORT_SYMBOL_GPL(system_trusted_keyring);
+
+extern __initconst const u8 system_certificate_list[];
+extern __initconst const unsigned long system_certificate_list_size;
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int system_trusted_keyring_init(void)
+{
+ pr_notice("Initialise system trusted keyring\n");
+
+ system_trusted_keyring =
+ keyring_alloc(".system_keyring",
+ KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(system_trusted_keyring))
+ panic("Can't allocate system trusted keyring\n");
+
+ set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
+ return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(system_trusted_keyring_init);
+
+/*
+ * Load the compiled-in list of X.509 certificates.
+ */
+static __init int load_system_certificate_list(void)
+{
+ key_ref_t key;
+ const u8 *p, *end;
+ size_t plen;
+
+ pr_notice("Loading compiled-in X.509 certificates\n");
+
+ p = system_certificate_list;
+ end = p + system_certificate_list_size;
+ while (p < end) {
+ /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+ * than 256 bytes in size.
+ */
+ if (end - p < 4)
+ goto dodgy_cert;
+ if (p[0] != 0x30 &&
+ p[1] != 0x82)
+ goto dodgy_cert;
+ plen = (p[2] << 8) | p[3];
+ plen += 4;
+ if (plen > end - p)
+ goto dodgy_cert;
+
+ key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
+ "asymmetric",
+ NULL,
+ p,
+ plen,
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ),
+ KEY_ALLOC_NOT_IN_QUOTA |
+ KEY_ALLOC_TRUSTED);
+ if (IS_ERR(key)) {
+ pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
+ PTR_ERR(key));
+ } else {
+ pr_notice("Loaded X.509 cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+ key_ref_put(key);
+ }
+ p += plen;
+ }
+
+ return 0;
+
+dodgy_cert:
+ pr_err("Problem parsing in-kernel X.509 certificate list\n");
+ return 0;
+}
+late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
new file mode 100644
index 00000000000..8727032e3a6
--- /dev/null
+++ b/kernel/task_work.c
@@ -0,0 +1,128 @@
+#include <linux/spinlock.h>
+#include <linux/task_work.h>
+#include <linux/tracehook.h>
+
+static struct callback_head work_exited; /* all we need is ->next == NULL */
+
+/**
+ * task_work_add - ask the @task to execute @work->func()
+ * @task: the task which should run the callback
+ * @work: the callback to run
+ * @notify: send the notification if true
+ *
+ * Queue @work for task_work_run() below and notify the @task if @notify.
+ * Fails if the @task is exiting/exited and thus it can't process this @work.
+ * Otherwise @work->func() will be called when the @task returns from kernel
+ * mode or exits.
+ *
+ * This is like the signal handler which runs in kernel mode, but it doesn't
+ * try to wake up the @task.
+ *
+ * RETURNS:
+ * 0 if succeeds or -ESRCH.
+ */
+int
+task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
+{
+ struct callback_head *head;
+
+ do {
+ head = ACCESS_ONCE(task->task_works);
+ if (unlikely(head == &work_exited))
+ return -ESRCH;
+ work->next = head;
+ } while (cmpxchg(&task->task_works, head, work) != head);
+
+ if (notify)
+ set_notify_resume(task);
+ return 0;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @func: identifies the work to remove
+ *
+ * Find the last queued pending work with ->func == @func and remove
+ * it from queue.
+ *
+ * RETURNS:
+ * The found work or NULL if not found.
+ */
+struct callback_head *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+ struct callback_head **pprev = &task->task_works;
+ struct callback_head *work;
+ unsigned long flags;
+ /*
+ * If cmpxchg() fails we continue without updating pprev.
+ * Either we raced with task_work_add() which added the
+ * new entry before this work, we will find it again. Or
+ * we raced with task_work_run(), *pprev == NULL/exited.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ while ((work = ACCESS_ONCE(*pprev))) {
+ smp_read_barrier_depends();
+ if (work->func != func)
+ pprev = &work->next;
+ else if (cmpxchg(pprev, work, work->next) == work)
+ break;
+ }
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ return work;
+}
+
+/**
+ * task_work_run - execute the works added by task_work_add()
+ *
+ * Flush the pending works. Should be used by the core kernel code.
+ * Called before the task returns to the user-mode or stops, or when
+ * it exits. In the latter case task_work_add() can no longer add the
+ * new work after task_work_run() returns.
+ */
+void task_work_run(void)
+{
+ struct task_struct *task = current;
+ struct callback_head *work, *head, *next;
+
+ for (;;) {
+ /*
+ * work->func() can do task_work_add(), do not set
+ * work_exited unless the list is empty.
+ */
+ do {
+ work = ACCESS_ONCE(task->task_works);
+ head = !work && (task->flags & PF_EXITING) ?
+ &work_exited : NULL;
+ } while (cmpxchg(&task->task_works, work, head) != work);
+
+ if (!work)
+ break;
+ /*
+ * Synchronize with task_work_cancel(). It can't remove
+ * the first entry == work, cmpxchg(task_works) should
+ * fail, but it can play with *work and other entries.
+ */
+ raw_spin_unlock_wait(&task->pi_lock);
+ smp_mb();
+
+ /* Reverse the list to run the works in fifo order */
+ head = NULL;
+ do {
+ next = work->next;
+ work->next = head;
+ head = work;
+ work = next;
+ } while (work);
+
+ work = head;
+ do {
+ next = work->next;
+ work->func(work);
+ work = next;
+ cond_resched();
+ } while (work);
+ }
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4..13d2f7cd65d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/pid_namespace.h>
#include <net/genetlink.h>
#include <linux/atomic.h>
@@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb,
up_write(&listeners->sem);
}
-static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
+static void fill_stats(struct user_namespace *user_ns,
+ struct pid_namespace *pid_ns,
+ struct task_struct *tsk, struct taskstats *stats)
{
memset(stats, 0, sizeof(*stats));
/*
@@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
stats->version = TASKSTATS_VERSION;
stats->nvcsw = tsk->nvcsw;
stats->nivcsw = tsk->nivcsw;
- bacct_add_tsk(stats, tsk);
+ bacct_add_tsk(user_ns, pid_ns, stats, tsk);
/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
@@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
rcu_read_unlock();
if (!tsk)
return -ESRCH;
- fill_stats(tsk, stats);
+ fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
put_task_struct(tsk);
return 0;
}
@@ -287,17 +290,25 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
struct listener_list *listeners;
struct listener *s, *tmp, *s2;
unsigned int cpu;
+ int ret = 0;
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
+ if (current_user_ns() != &init_user_ns)
+ return -EINVAL;
+
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
s = kmalloc_node(sizeof(struct listener),
GFP_KERNEL, cpu_to_node(cpu));
- if (!s)
+ if (!s) {
+ ret = -ENOMEM;
goto cleanup;
-
+ }
s->pid = pid;
s->valid = 1;
@@ -330,7 +341,7 @@ cleanup:
}
up_write(&listeners->sem);
}
- return 0;
+ return ret;
}
static int parse(struct nlattr *na, struct cpumask *mask)
@@ -395,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
if (!na)
goto err;
- if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
+ nla_nest_cancel(skb, na);
goto err;
+ }
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
- if (!ret)
+ if (!ret) {
+ nla_nest_cancel(skb, na);
goto err;
+ }
nla_nest_end(skb, na);
return nla_data(ret);
@@ -415,16 +430,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
struct nlattr *na;
size_t size;
u32 fd;
- struct file *file;
- int fput_needed;
+ struct fd f;
na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
if (!na)
return -EINVAL;
fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
- file = fget_light(fd, &fput_needed);
- if (!file)
+ f = fdget(fd);
+ if (!f.file)
return 0;
size = nla_total_size(sizeof(struct cgroupstats));
@@ -436,10 +450,16 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
sizeof(struct cgroupstats));
+ if (na == NULL) {
+ nlmsg_free(rep_skb);
+ rc = -EMSGSIZE;
+ goto err;
+ }
+
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, file->f_dentry);
+ rc = cgroupstats_build(stats, f.file->f_dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
@@ -448,7 +468,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
rc = send_reply(rep_skb, info);
err:
- fput_light(file, fput_needed);
+ fdput(f);
return rc;
}
@@ -462,7 +482,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
if (rc < 0)
goto out;
- rc = add_del_listener(info->snd_pid, mask, REGISTER);
+ rc = add_del_listener(info->snd_portid, mask, REGISTER);
out:
free_cpumask_var(mask);
return rc;
@@ -478,7 +498,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
if (rc < 0)
goto out;
- rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
+ rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
out:
free_cpumask_var(mask);
return rc;
@@ -626,11 +646,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
if (rc < 0)
return;
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
+ task_pid_nr_ns(tsk, &init_pid_ns));
if (!stats)
goto err;
- fill_stats(tsk, stats);
+ fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
/*
* Doesn't matter if tsk is the leader or the last group member leaving
@@ -638,7 +659,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
if (!is_thread_group || !group_dead)
goto send;
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
+ task_tgid_nr_ns(tsk, &init_pid_ns));
if (!stats)
goto err;
@@ -651,17 +673,18 @@ err:
nlmsg_free(rep_skb);
}
-static struct genl_ops taskstats_ops = {
- .cmd = TASKSTATS_CMD_GET,
- .doit = taskstats_user_cmd,
- .policy = taskstats_cmd_get_policy,
- .flags = GENL_ADMIN_PERM,
-};
-
-static struct genl_ops cgroupstats_ops = {
- .cmd = CGROUPSTATS_CMD_GET,
- .doit = cgroupstats_user_cmd,
- .policy = cgroupstats_cmd_get_policy,
+static const struct genl_ops taskstats_ops[] = {
+ {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_user_cmd,
+ .policy = taskstats_cmd_get_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = CGROUPSTATS_CMD_GET,
+ .doit = cgroupstats_user_cmd,
+ .policy = cgroupstats_cmd_get_policy,
+ },
};
/* Needed early in initialization */
@@ -680,26 +703,13 @@ static int __init taskstats_init(void)
{
int rc;
- rc = genl_register_family(&family);
+ rc = genl_register_family_with_ops(&family, taskstats_ops);
if (rc)
return rc;
- rc = genl_register_ops(&family, &taskstats_ops);
- if (rc < 0)
- goto err;
-
- rc = genl_register_ops(&family, &cgroupstats_ops);
- if (rc < 0)
- goto err_cgroup_ops;
-
family_registered = 1;
pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
return 0;
-err_cgroup_ops:
- genl_unregister_ops(&family, &taskstats_ops);
-err:
- genl_unregister_family(&family);
- return rc;
}
/*
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index f8b11a28317..12d6ebbfdd8 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -365,7 +365,7 @@ int init_test_probes(void)
target2 = kprobe_target2;
do {
- rand1 = random32();
+ rand1 = prandom_u32();
} while (rand1 <= div_factor);
printk(KERN_INFO "Kprobe smoke test started\n");
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1..7c7964c33ae 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -11,7 +11,7 @@
* Modification history kernel/time.c
*
* 1993-09-02 Philip Gladstone
- * Created file with time related functions from sched.c and adjtimex()
+ * Created file with time related functions from sched/core.c and adjtimex()
* 1993-10-08 Torsten Duwe
* adjtime interface update and CMOS clock write code
* 1995-08-13 Torsten Duwe
@@ -30,7 +30,7 @@
#include <linux/export.h>
#include <linux/timex.h>
#include <linux/capability.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
}
/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time.
*
@@ -132,11 +138,14 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
*/
static inline void warp_clock(void)
{
- struct timespec adjust;
+ if (sys_tz.tz_minuteswest != 0) {
+ struct timespec adjust;
- adjust = current_kernel_time();
- adjust.tv_sec += sys_tz.tz_minuteswest * 60;
- do_settimeofday(&adjust);
+ persistent_clock_is_local = 1;
+ adjust.tv_sec = sys_tz.tz_minuteswest * 60;
+ adjust.tv_nsec = 0;
+ timekeeping_inject_offset(&adjust);
+ }
}
/*
@@ -163,7 +172,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
return error;
if (tz) {
- /* SMP safe, global irq locking makes it work. */
sys_tz = *tz;
update_vsyscall_tz();
if (firsttime) {
@@ -173,12 +181,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
}
}
if (tv)
- {
- /* SMP safe, again the code in arch/foo/time.c should
- * globally block out interrupts when it runs.
- */
return do_settimeofday(tv);
- }
return 0;
}
@@ -238,7 +241,7 @@ EXPORT_SYMBOL(current_fs_time);
* Avoid unnecessary multiplications/divisions in the
* two most common HZ cases:
*/
-inline unsigned int jiffies_to_msecs(const unsigned long j)
+unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +257,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
-inline unsigned int jiffies_to_usecs(const unsigned long j)
+unsigned int jiffies_to_usecs(const unsigned long j)
{
#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2cf9cc7aa10..f448513a45e 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,17 +1,195 @@
#
# Timer subsystem related configuration options
#
+
+# Options selectable by arch Kconfig
+
+# Watchdog function for clocksources to detect instabilities
+config CLOCKSOURCE_WATCHDOG
+ bool
+
+# Architecture has extra clocksource data
+config ARCH_CLOCKSOURCE_DATA
+ bool
+
+# Timekeeping vsyscall support
+config GENERIC_TIME_VSYSCALL
+ bool
+
+# Timekeeping vsyscall support
+config GENERIC_TIME_VSYSCALL_OLD
+ bool
+
+# ktime_t scalar 64bit nsec representation
+config KTIME_SCALAR
+ bool
+
+# Old style timekeeping
+config ARCH_USES_GETTIMEOFFSET
+ bool
+
+# The generic clock events infrastructure
+config GENERIC_CLOCKEVENTS
+ bool
+
+# Migration helper. Builds, but does not invoke
+config GENERIC_CLOCKEVENTS_BUILD
+ bool
+ default y
+ depends on GENERIC_CLOCKEVENTS
+
+# Architecture can handle broadcast in a driver-agnostic way
+config ARCH_HAS_TICK_BROADCAST
+ bool
+
+# Clockevents broadcasting infrastructure
+config GENERIC_CLOCKEVENTS_BROADCAST
+ bool
+ depends on GENERIC_CLOCKEVENTS
+
+# Automatically adjust the min. reprogramming time for
+# clock event device
+config GENERIC_CLOCKEVENTS_MIN_ADJUST
+ bool
+
+# Generic update of CMOS clock
+config GENERIC_CMOS_UPDATE
+ bool
+
+if GENERIC_CLOCKEVENTS
+menu "Timers subsystem"
+
+# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
+# only related to the tick functionality. Oneshot clockevent devices
+# are supported independ of this.
config TICK_ONESHOT
bool
-config NO_HZ
- bool "Tickless System (Dynamic Ticks)"
+config NO_HZ_COMMON
+ bool
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
+
+choice
+ prompt "Timer tick handling"
+ default NO_HZ_IDLE if NO_HZ
+
+config HZ_PERIODIC
+ bool "Periodic timer ticks (constant rate, no dynticks)"
+ help
+ This option keeps the tick running periodically at a constant
+ rate, even when the CPU doesn't need it.
+
+config NO_HZ_IDLE
+ bool "Idle dynticks system (tickless idle)"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select NO_HZ_COMMON
+ help
+ This option enables a tickless idle system: timer interrupts
+ will only trigger on an as-needed basis when the system is idle.
+ This is usually interesting for energy saving.
+
+ Most of the time you want to say Y here.
+
+config NO_HZ_FULL
+ bool "Full dynticks system (tickless)"
+ # NO_HZ_COMMON dependency
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ # We need at least one periodic CPU for timekeeping
+ depends on SMP
+ # RCU_USER_QS dependency
+ depends on HAVE_CONTEXT_TRACKING
+ # VIRT_CPU_ACCOUNTING_GEN dependency
+ depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select NO_HZ_COMMON
+ select RCU_USER_QS
+ select RCU_NOCB_CPU
+ select VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_WORK
help
- This option enables a tickless system: timer interrupts will
- only trigger on an as-needed basis both when the system is
- busy and when the system is idle.
+ Adaptively try to shutdown the tick whenever possible, even when
+ the CPU is running tasks. Typically this requires running a single
+ task on the CPU. Chances for running tickless are maximized when
+ the task mostly runs in userspace and has few kernel activity.
+
+ You need to fill up the nohz_full boot parameter with the
+ desired range of dynticks CPUs.
+
+ This is implemented at the expense of some overhead in user <-> kernel
+ transitions: syscalls, exceptions and interrupts. Even when it's
+ dynamically off.
+
+ Say N.
+
+endchoice
+
+config NO_HZ_FULL_ALL
+ bool "Full dynticks system on all CPUs by default (except CPU 0)"
+ depends on NO_HZ_FULL
+ help
+ If the user doesn't pass the nohz_full boot option to
+ define the range of full dynticks CPUs, consider that all
+ CPUs in the system are full dynticks by default.
+ Note the boot CPU will still be kept outside the range to
+ handle the timekeeping duty.
+
+config NO_HZ_FULL_SYSIDLE
+ bool "Detect full-system idle state for full dynticks system"
+ depends on NO_HZ_FULL
+ default n
+ help
+ At least one CPU must keep the scheduling-clock tick running for
+ timekeeping purposes whenever there is a non-idle CPU, where
+ "non-idle" also includes dynticks CPUs as long as they are
+ running non-idle tasks. Because the underlying adaptive-tick
+ support cannot distinguish between all CPUs being idle and
+ all CPUs each running a single task in dynticks mode, the
+ underlying support simply ensures that there is always a CPU
+ handling the scheduling-clock tick, whether or not all CPUs
+ are idle. This Kconfig option enables scalable detection of
+ the all-CPUs-idle state, thus allowing the scheduling-clock
+ tick to be disabled when all CPUs are idle. Note that scalable
+ detection of the all-CPUs-idle state means that larger systems
+ will be slower to declare the all-CPUs-idle state.
+
+ Say Y if you would like to help debug all-CPUs-idle detection.
+
+ Say N if you are unsure.
+
+config NO_HZ_FULL_SYSIDLE_SMALL
+ int "Number of CPUs above which large-system approach is used"
+ depends on NO_HZ_FULL_SYSIDLE
+ range 1 NR_CPUS
+ default 8
+ help
+ The full-system idle detection mechanism takes a lazy approach
+ on large systems, as is required to attain decent scalability.
+ However, on smaller systems, scalability is not anywhere near as
+ large a concern as is energy efficiency. The sysidle subsystem
+ therefore uses a fast but non-scalable algorithm for small
+ systems and a lazier but scalable algorithm for large systems.
+ This Kconfig parameter defines the number of CPUs in the largest
+ system that will be considered to be "small".
+
+ The default value will be fine in most cases. Battery-powered
+ systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
+ numbers of CPUs, and (3) are suffering from battery-lifetime
+ problems due to long sysidle latencies might wish to experiment
+ with larger values for this Kconfig parameter. On the other
+ hand, they might be even better served by disabling NO_HZ_FULL
+ entirely, given that NO_HZ_FULL is intended for HPC and
+ real-time workloads that at present do not tend to be run on
+ battery-powered systems.
+
+ Take the default if you are unsure.
+
+config NO_HZ
+ bool "Old Idle dynticks config"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ help
+ This is the old config entry that enables dynticks idle.
+ We keep it around for a little while to enforce backward
+ compatibility with older config files.
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
@@ -22,10 +200,5 @@ config HIGH_RES_TIMERS
hardware is not capable then this option only increases
the size of the kernel image.
-config GENERIC_CLOCKEVENTS_BUILD
- bool
- default y
- depends on GENERIC_CLOCKEVENTS
-
-config GENERIC_CLOCKEVENTS_MIN_ADJUST
- bool
+endmenu
+endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c..57a413fd0eb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,9 +1,14 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o posix-clock.o alarmtimer.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
+ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
+ obj-y += tick-broadcast.o
+ obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
+endif
+obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
+obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d6450..fe75444ae7e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- struct hrtimer timer;
ktime_t (*gettime)(void);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
+static struct wakeup_source *ws;
+
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
@@ -59,7 +60,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* If one has not already been chosen, it checks to see if a
* functional rtc device is available.
*/
-static struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
struct rtc_device *ret;
@@ -96,6 +97,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
return 0;
}
+static inline void alarmtimer_rtc_timer_init(void)
+{
+ rtc_timer_init(&rtctimer, NULL, NULL);
+}
+
static struct class_interface alarmtimer_rtc_interface = {
.add_dev = &alarmtimer_rtc_add_device,
};
@@ -110,13 +116,14 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-static inline struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
return NULL;
}
#define rtcdev (NULL)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
+static inline void alarmtimer_rtc_timer_init(void) { }
#endif
/**
@@ -124,50 +131,35 @@ static inline void alarmtimer_rtc_interface_remove(void) { }
* @base: pointer to the base where the timer is being run
* @alarm: pointer to alarm being enqueued.
*
- * Adds alarm to a alarm_base timerqueue and if necessary sets
- * an hrtimer to run.
+ * Adds alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
{
+ if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
+ timerqueue_del(&base->timerqueue, &alarm->node);
+
timerqueue_add(&base->timerqueue, &alarm->node);
alarm->state |= ALARMTIMER_STATE_ENQUEUED;
-
- if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
- hrtimer_try_to_cancel(&base->timer);
- hrtimer_start(&base->timer, alarm->node.expires,
- HRTIMER_MODE_ABS);
- }
}
/**
- * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
+ * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
* @base: pointer to the base where the timer is running
* @alarm: pointer to alarm being removed
*
- * Removes alarm to a alarm_base timerqueue and if necessary sets
- * a new timer to run.
+ * Removes alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
-static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
+static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
{
- struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
-
if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
return;
timerqueue_del(&base->timerqueue, &alarm->node);
alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
-
- if (next == &alarm->node) {
- hrtimer_try_to_cancel(&base->timer);
- next = timerqueue_getnext(&base->timerqueue);
- if (!next)
- return;
- hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
- }
}
@@ -182,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
- struct alarm_base *base = container_of(timer, struct alarm_base, timer);
- struct timerqueue_node *next;
+ struct alarm *alarm = container_of(timer, struct alarm, timer);
+ struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- ktime_t now;
int ret = HRTIMER_NORESTART;
int restart = ALARMTIMER_NORESTART;
spin_lock_irqsave(&base->lock, flags);
- now = base->gettime();
- while ((next = timerqueue_getnext(&base->timerqueue))) {
- struct alarm *alarm;
- ktime_t expired = next->expires;
-
- if (expired.tv64 > now.tv64)
- break;
-
- alarm = container_of(next, struct alarm, node);
-
- timerqueue_del(&base->timerqueue, &alarm->node);
- alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
-
- alarm->state |= ALARMTIMER_STATE_CALLBACK;
- spin_unlock_irqrestore(&base->lock, flags);
- if (alarm->function)
- restart = alarm->function(alarm, now);
- spin_lock_irqsave(&base->lock, flags);
- alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
+ alarmtimer_dequeue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
- if (restart != ALARMTIMER_NORESTART) {
- timerqueue_add(&base->timerqueue, &alarm->node);
- alarm->state |= ALARMTIMER_STATE_ENQUEUED;
- }
- }
+ if (alarm->function)
+ restart = alarm->function(alarm, base->gettime());
- if (next) {
- hrtimer_set_expires(&base->timer, next->expires);
+ spin_lock_irqsave(&base->lock, flags);
+ if (restart != ALARMTIMER_NORESTART) {
+ hrtimer_set_expires(&alarm->timer, alarm->node.expires);
+ alarmtimer_enqueue(base, alarm);
ret = HRTIMER_RESTART;
}
spin_unlock_irqrestore(&base->lock, flags);
@@ -226,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
+ktime_t alarm_expires_remaining(const struct alarm *alarm)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ return ktime_sub(alarm->node.expires, base->gettime());
+}
+EXPORT_SYMBOL_GPL(alarm_expires_remaining);
+
#ifdef CONFIG_RTC_CLASS
/**
* alarmtimer_suspend - Suspend time callback
@@ -244,6 +224,7 @@ static int alarmtimer_suspend(struct device *dev)
unsigned long flags;
struct rtc_device *rtc;
int i;
+ int ret;
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
@@ -273,8 +254,10 @@ static int alarmtimer_suspend(struct device *dev)
if (min.tv64 == 0)
return 0;
- /* XXX - Should we enforce a minimum sleep time? */
- WARN_ON(min.tv64 < NSEC_PER_SEC);
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
/* Setup an rtc timer to fire that far in the future */
rtc_timer_cancel(rtc, &rtctimer);
@@ -282,9 +265,11 @@ static int alarmtimer_suspend(struct device *dev)
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
- rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
-
- return 0;
+ /* Set alarm, if in the past reject suspend briefly to handle */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, MSEC_PER_SEC);
+ return ret;
}
#else
static int alarmtimer_suspend(struct device *dev)
@@ -318,28 +303,62 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
timerqueue_init(&alarm->node);
+ hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
+ alarm->timer.function = alarmtimer_fired;
alarm->function = function;
alarm->type = type;
alarm->state = ALARMTIMER_STATE_INACTIVE;
}
+EXPORT_SYMBOL_GPL(alarm_init);
/**
- * alarm_start - Sets an alarm to fire
+ * alarm_start - Sets an absolute alarm to fire
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
-void alarm_start(struct alarm *alarm, ktime_t start)
+int alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
+ int ret;
spin_lock_irqsave(&base->lock, flags);
- if (alarmtimer_active(alarm))
- alarmtimer_remove(base, alarm);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
+ ret = hrtimer_start(&alarm->timer, alarm->node.expires,
+ HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
+ return ret;
}
+EXPORT_SYMBOL_GPL(alarm_start);
+
+/**
+ * alarm_start_relative - Sets a relative alarm to fire
+ * @alarm: ptr to alarm to set
+ * @start: time relative to now to run the alarm
+ */
+int alarm_start_relative(struct alarm *alarm, ktime_t start)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+
+ start = ktime_add(start, base->gettime());
+ return alarm_start(alarm, start);
+}
+EXPORT_SYMBOL_GPL(alarm_start_relative);
+
+void alarm_restart(struct alarm *alarm)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+
+ spin_lock_irqsave(&base->lock, flags);
+ hrtimer_set_expires(&alarm->timer, alarm->node.expires);
+ hrtimer_restart(&alarm->timer);
+ alarmtimer_enqueue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+EXPORT_SYMBOL_GPL(alarm_restart);
/**
* alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -352,21 +371,16 @@ int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- int ret = -1;
- spin_lock_irqsave(&base->lock, flags);
-
- if (alarmtimer_callback_running(alarm))
- goto out;
+ int ret;
- if (alarmtimer_is_queued(alarm)) {
- alarmtimer_remove(base, alarm);
- ret = 1;
- } else
- ret = 0;
-out:
+ spin_lock_irqsave(&base->lock, flags);
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
+EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
/**
@@ -384,6 +398,7 @@ int alarm_cancel(struct alarm *alarm)
cpu_relax();
}
}
+EXPORT_SYMBOL_GPL(alarm_cancel);
u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
@@ -416,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
alarm->node.expires = ktime_add(alarm->node.expires, interval);
return overrun;
}
+EXPORT_SYMBOL_GPL(alarm_forward);
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ return alarm_forward(alarm, base->gettime(), interval);
+}
+EXPORT_SYMBOL_GPL(alarm_forward_now);
/**
@@ -468,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EINVAL;
return hrtimer_get_res(baseid, tp);
}
@@ -485,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EINVAL;
*tp = ktime_to_timespec(base->gettime());
return 0;
@@ -563,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
struct itimerspec *new_setting,
struct itimerspec *old_setting)
{
+ ktime_t exp;
+
if (!rtcdev)
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (old_setting)
alarm_timer_get(timr, old_setting);
@@ -575,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
- alarm_start(&timr->it.alarm.alarmtimer,
- timespec_to_ktime(new_setting->it_value));
+ exp = timespec_to_ktime(new_setting->it_value);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now;
+
+ now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ alarm_start(&timr->it.alarm.alarmtimer, exp);
return 0;
}
@@ -708,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -783,6 +821,8 @@ static int __init alarmtimer_init(void)
.nsleep = alarm_timer_nsleep,
};
+ alarmtimer_rtc_timer_init();
+
posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
@@ -794,10 +834,6 @@ static int __init alarmtimer_init(void)
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
- hrtimer_init(&alarm_bases[i].timer,
- alarm_bases[i].base_clockid,
- HRTIMER_MODE_ABS);
- alarm_bases[i].timer.function = alarmtimer_fired;
}
error = alarmtimer_rtc_interface_setup();
@@ -813,6 +849,7 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
+ ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c..9c94c19f130 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -15,44 +15,82 @@
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/device.h>
#include "tick-internal.h"
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
static LIST_HEAD(clockevents_released);
-
-/* Notification for clock events */
-static RAW_NOTIFIER_HEAD(clockevents_chain);
-
/* Protection for the above */
static DEFINE_RAW_SPINLOCK(clockevents_lock);
+/* Protection for unbind operations */
+static DEFINE_MUTEX(clockevents_mutex);
-/**
- * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
- * @latch: value to convert
- * @evt: pointer to clock event device descriptor
- *
- * Math helper, returns latch value converted to nanoseconds (bound checked)
- */
-u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+struct ce_unbind {
+ struct clock_event_device *ce;
+ int res;
+};
+
+static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
+ bool ismax)
{
u64 clc = (u64) latch << evt->shift;
+ u64 rnd;
if (unlikely(!evt->mult)) {
evt->mult = 1;
WARN_ON(1);
}
+ rnd = (u64) evt->mult - 1;
+
+ /*
+ * Upper bound sanity check. If the backwards conversion is
+ * not equal latch, we know that the above shift overflowed.
+ */
+ if ((clc >> evt->shift) != (u64)latch)
+ clc = ~0ULL;
+
+ /*
+ * Scaled math oddities:
+ *
+ * For mult <= (1 << shift) we can safely add mult - 1 to
+ * prevent integer rounding loss. So the backwards conversion
+ * from nsec to device ticks will be correct.
+ *
+ * For mult > (1 << shift), i.e. device frequency is > 1GHz we
+ * need to be careful. Adding mult - 1 will result in a value
+ * which when converted back to device ticks can be larger
+ * than latch by up to (mult - 1) >> shift. For the min_delta
+ * calculation we still want to apply this in order to stay
+ * above the minimum device ticks limit. For the upper limit
+ * we would end up with a latch value larger than the upper
+ * limit of the device, so we omit the add to stay below the
+ * device upper boundary.
+ *
+ * Also omit the add if it would overflow the u64 boundary.
+ */
+ if ((~0ULL - clc > rnd) &&
+ (!ismax || evt->mult <= (1U << evt->shift)))
+ clc += rnd;
do_div(clc, evt->mult);
- if (clc < 1000)
- clc = 1000;
- if (clc > KTIME_MAX)
- clc = KTIME_MAX;
- return clc;
+ /* Deltas less than 1usec are pointless noise */
+ return clc > 1000 ? clc : 1000;
+}
+
+/**
+ * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked)
+ */
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+{
+ return cev_delta2ns(latch, evt, false);
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
@@ -108,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
{
/* Nothing to do if we already reached the limit */
if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
- printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+ printk_deferred(KERN_WARNING
+ "CE: Reprogramming failure. Giving up\n");
dev->next_event.tv64 = KTIME_MAX;
return -ETIME;
}
@@ -121,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
if (dev->min_delta_ns > MIN_DELTA_LIMIT)
dev->min_delta_ns = MIN_DELTA_LIMIT;
- printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
- dev->name ? dev->name : "?",
- (unsigned long long) dev->min_delta_ns);
+ printk_deferred(KERN_WARNING
+ "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
return 0;
}
@@ -232,46 +272,106 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
return (rc && force) ? clockevents_program_min_delta(dev) : rc;
}
-/**
- * clockevents_register_notifier - register a clock events change listener
+/*
+ * Called after a notify add to make devices available which were
+ * released from the notifier call.
*/
-int clockevents_register_notifier(struct notifier_block *nb)
+static void clockevents_notify_released(void)
{
- unsigned long flags;
- int ret;
+ struct clock_event_device *dev;
- raw_spin_lock_irqsave(&clockevents_lock, flags);
- ret = raw_notifier_chain_register(&clockevents_chain, nb);
- raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+ while (!list_empty(&clockevents_released)) {
+ dev = list_entry(clockevents_released.next,
+ struct clock_event_device, list);
+ list_del(&dev->list);
+ list_add(&dev->list, &clockevent_devices);
+ tick_check_new_device(dev);
+ }
+}
- return ret;
+/*
+ * Try to install a replacement clock event device
+ */
+static int clockevents_replace(struct clock_event_device *ced)
+{
+ struct clock_event_device *dev, *newdev = NULL;
+
+ list_for_each_entry(dev, &clockevent_devices, list) {
+ if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
+ continue;
+
+ if (!tick_check_replacement(newdev, dev))
+ continue;
+
+ if (!try_module_get(dev->owner))
+ continue;
+
+ if (newdev)
+ module_put(newdev->owner);
+ newdev = dev;
+ }
+ if (newdev) {
+ tick_install_replacement(newdev);
+ list_del_init(&ced->list);
+ }
+ return newdev ? 0 : -EBUSY;
}
/*
- * Notify about a clock event change. Called with clockevents_lock
- * held.
+ * Called with clockevents_mutex and clockevents_lock held
*/
-static void clockevents_do_notify(unsigned long reason, void *dev)
+static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
- raw_notifier_call_chain(&clockevents_chain, reason, dev);
+ /* Fast track. Device is unused */
+ if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
+ list_del_init(&ced->list);
+ return 0;
+ }
+
+ return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;
}
/*
- * Called after a notify add to make devices available which were
- * released from the notifier call.
+ * SMP function call to unbind a device
*/
-static void clockevents_notify_released(void)
+static void __clockevents_unbind(void *arg)
{
- struct clock_event_device *dev;
+ struct ce_unbind *cu = arg;
+ int res;
+
+ raw_spin_lock(&clockevents_lock);
+ res = __clockevents_try_unbind(cu->ce, smp_processor_id());
+ if (res == -EAGAIN)
+ res = clockevents_replace(cu->ce);
+ cu->res = res;
+ raw_spin_unlock(&clockevents_lock);
+}
- while (!list_empty(&clockevents_released)) {
- dev = list_entry(clockevents_released.next,
- struct clock_event_device, list);
- list_del(&dev->list);
- list_add(&dev->list, &clockevent_devices);
- clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
- }
+/*
+ * Issues smp function call to unbind a per cpu device. Called with
+ * clockevents_mutex held.
+ */
+static int clockevents_unbind(struct clock_event_device *ced, int cpu)
+{
+ struct ce_unbind cu = { .ce = ced, .res = -ENODEV };
+
+ smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
+ return cu.res;
+}
+
+/*
+ * Unbind a clockevents device.
+ */
+int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
+{
+ int ret;
+
+ mutex_lock(&clockevents_mutex);
+ ret = clockevents_unbind(ced, cpu);
+ mutex_unlock(&clockevents_mutex);
+ return ret;
}
+EXPORT_SYMBOL_GPL(clockevents_unbind);
/**
* clockevents_register_device - register a clock event device
@@ -290,15 +390,14 @@ void clockevents_register_device(struct clock_event_device *dev)
raw_spin_lock_irqsave(&clockevents_lock, flags);
list_add(&dev->list, &clockevent_devices);
- clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ tick_check_new_device(dev);
clockevents_notify_released();
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
-static void clockevents_config(struct clock_event_device *dev,
- u32 freq)
+void clockevents_config(struct clock_event_device *dev, u32 freq)
{
u64 sec;
@@ -318,8 +417,8 @@ static void clockevents_config(struct clock_event_device *dev,
sec = 600;
clockevents_calc_mult_shift(dev, freq, sec);
- dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
- dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
+ dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
+ dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
}
/**
@@ -340,6 +439,20 @@ void clockevents_config_and_register(struct clock_event_device *dev,
clockevents_config(dev, freq);
clockevents_register_device(dev);
}
+EXPORT_SYMBOL_GPL(clockevents_config_and_register);
+
+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ clockevents_config(dev, freq);
+
+ if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+ return clockevents_program_event(dev, dev->next_event, false);
+
+ if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+
+ return 0;
+}
/**
* clockevents_update_freq - Update frequency and reprogram a clock event device.
@@ -348,17 +461,22 @@ void clockevents_config_and_register(struct clock_event_device *dev,
*
* Reconfigure and reprogram a clock event device in oneshot
* mode. Must be called on the cpu for which the device delivers per
- * cpu timer events with interrupts disabled! Returns 0 on success,
- * -ETIME when the event is in the past.
+ * cpu timer events. If called for the broadcast device the core takes
+ * care of serialization.
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
*/
int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
- clockevents_config(dev, freq);
-
- if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
- return 0;
+ unsigned long flags;
+ int ret;
- return clockevents_program_event(dev, dev->next_event, false);
+ local_irq_save(flags);
+ ret = tick_broadcast_update_freq(dev, freq);
+ if (ret == -ENODEV)
+ ret = __clockevents_update_freq(dev, freq);
+ local_irq_restore(flags);
+ return ret;
}
/*
@@ -386,6 +504,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
* released list and do a notify add later.
*/
if (old) {
+ module_put(old->owner);
clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
@@ -398,21 +517,72 @@ void clockevents_exchange_device(struct clock_event_device *old,
local_irq_restore(flags);
}
+/**
+ * clockevents_suspend - suspend clock devices
+ */
+void clockevents_suspend(void)
+{
+ struct clock_event_device *dev;
+
+ list_for_each_entry_reverse(dev, &clockevent_devices, list)
+ if (dev->suspend)
+ dev->suspend(dev);
+}
+
+/**
+ * clockevents_resume - resume clock devices
+ */
+void clockevents_resume(void)
+{
+ struct clock_event_device *dev;
+
+ list_for_each_entry(dev, &clockevent_devices, list)
+ if (dev->resume)
+ dev->resume(dev);
+}
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS
/**
* clockevents_notify - notification about relevant events
+ * Returns 0 on success, any other value on error
*/
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
- int cpu;
+ int cpu, ret = 0;
raw_spin_lock_irqsave(&clockevents_lock, flags);
- clockevents_do_notify(reason, arg);
switch (reason) {
+ case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+ case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+ case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ tick_broadcast_on_off(reason, arg);
+ break;
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
+ case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
+ ret = tick_broadcast_oneshot_control(reason);
+ break;
+
+ case CLOCK_EVT_NOTIFY_CPU_DYING:
+ tick_handover_do_timer(arg);
+ break;
+
+ case CLOCK_EVT_NOTIFY_SUSPEND:
+ tick_suspend();
+ tick_suspend_broadcast();
+ break;
+
+ case CLOCK_EVT_NOTIFY_RESUME:
+ tick_resume();
+ break;
+
case CLOCK_EVT_NOTIFY_CPU_DEAD:
+ tick_shutdown_broadcast_oneshot(arg);
+ tick_shutdown_broadcast(arg);
+ tick_shutdown(arg);
/*
* Unregister the clock event devices which were
* released from the users in the notify chain.
@@ -436,6 +606,126 @@ void clockevents_notify(unsigned long reason, void *arg)
break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(clockevents_notify);
+
+#ifdef CONFIG_SYSFS
+struct bus_type clockevents_subsys = {
+ .name = "clockevents",
+ .dev_name = "clockevent",
+};
+
+static DEFINE_PER_CPU(struct device, tick_percpu_dev);
+static struct tick_device *tick_get_tick_dev(struct device *dev);
+
+static ssize_t sysfs_show_current_tick_dev(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct tick_device *td;
+ ssize_t count = 0;
+
+ raw_spin_lock_irq(&clockevents_lock);
+ td = tick_get_tick_dev(dev);
+ if (td && td->evtdev)
+ count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
+ raw_spin_unlock_irq(&clockevents_lock);
+ return count;
+}
+static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
+
+/* We don't support the abomination of removable broadcast devices */
+static ssize_t sysfs_unbind_tick_dev(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ char name[CS_NAME_LEN];
+ ssize_t ret = sysfs_get_uname(buf, name, count);
+ struct clock_event_device *ce;
+
+ if (ret < 0)
+ return ret;
+
+ ret = -ENODEV;
+ mutex_lock(&clockevents_mutex);
+ raw_spin_lock_irq(&clockevents_lock);
+ list_for_each_entry(ce, &clockevent_devices, list) {
+ if (!strcmp(ce->name, name)) {
+ ret = __clockevents_try_unbind(ce, dev->id);
+ break;
+ }
+ }
+ raw_spin_unlock_irq(&clockevents_lock);
+ /*
+ * We hold clockevents_mutex, so ce can't go away
+ */
+ if (ret == -EAGAIN)
+ ret = clockevents_unbind(ce, dev->id);
+ mutex_unlock(&clockevents_mutex);
+ return ret ? ret : count;
+}
+static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static struct device tick_bc_dev = {
+ .init_name = "broadcast",
+ .id = 0,
+ .bus = &clockevents_subsys,
+};
+
+static struct tick_device *tick_get_tick_dev(struct device *dev)
+{
+ return dev == &tick_bc_dev ? tick_get_broadcast_device() :
+ &per_cpu(tick_cpu_device, dev->id);
+}
+
+static __init int tick_broadcast_init_sysfs(void)
+{
+ int err = device_register(&tick_bc_dev);
+
+ if (!err)
+ err = device_create_file(&tick_bc_dev, &dev_attr_current_device);
+ return err;
+}
+#else
+static struct tick_device *tick_get_tick_dev(struct device *dev)
+{
+ return &per_cpu(tick_cpu_device, dev->id);
+}
+static inline int tick_broadcast_init_sysfs(void) { return 0; }
#endif
+
+static int __init tick_init_sysfs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct device *dev = &per_cpu(tick_percpu_dev, cpu);
+ int err;
+
+ dev->id = cpu;
+ dev->bus = &clockevents_subsys;
+ err = device_register(dev);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_current_device);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_unbind_device);
+ if (err)
+ return err;
+ }
+ return tick_broadcast_init_sysfs();
+}
+
+static int __init clockevents_init_sysfs(void)
+{
+ int err = subsys_system_register(&clockevents_subsys, NULL);
+
+ if (!err)
+ err = tick_init_sysfs();
+ return err;
+}
+device_initcall(clockevents_init_sysfs);
+#endif /* SYSFS */
+
+#endif /* GENERIC_CLOCK_EVENTS */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab2..ba3e502c955 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,8 @@
#include <linux/tick.h>
#include <linux/kthread.h>
+#include "tick-internal.h"
+
void timecounter_init(struct timecounter *tc,
const struct cyclecounter *cc,
u64 start_tstamp)
@@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
static struct clocksource *curr_clocksource;
static LIST_HEAD(clocksource_list);
static DEFINE_MUTEX(clocksource_mutex);
-static char override_name[32];
+static char override_name[CS_NAME_LEN];
static int finished_booting;
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
+static void clocksource_select(void);
static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
@@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data)
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+ /* Mark it valid for high-res. */
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+
/*
- * We just marked the clocksource as highres-capable,
- * notify the rest of the system as well so that we
- * transition into high-res mode:
+ * clocksource_done_booting() will sort it if
+ * finished_booting is not set yet.
*/
- tick_clock_notify();
+ if (!finished_booting)
+ continue;
+
+ /*
+ * If this is not the current clocksource let
+ * the watchdog thread reselect it. Due to the
+ * change to high res this clocksource might
+ * be preferred now. If it is the current
+ * clocksource let the tick code know about
+ * that change.
+ */
+ if (cs != curr_clocksource) {
+ cs->flags |= CLOCK_SOURCE_RESELECT;
+ schedule_work(&watchdog_work);
+ } else {
+ tick_clock_notify();
+ }
}
}
@@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
- struct clocksource *tmp;
unsigned long flags;
spin_lock_irqsave(&watchdog_lock, flags);
- if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
- /* cs is a watched clocksource. */
- list_del_init(&cs->wd_list);
- } else if (cs == watchdog) {
- /* Reset watchdog cycles */
- clocksource_reset_watchdog();
- /* Current watchdog is removed. Find an alternative. */
- watchdog = NULL;
- list_for_each_entry(tmp, &clocksource_list, list) {
- if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
- continue;
- if (!watchdog || tmp->rating > watchdog->rating)
- watchdog = tmp;
+ if (cs != watchdog) {
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+ /* cs is a watched clocksource. */
+ list_del_init(&cs->wd_list);
+ /* Check if the watchdog timer needs to be stopped. */
+ clocksource_stop_watchdog();
}
}
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
- /* Check if the watchdog timer needs to be stopped. */
- clocksource_stop_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static int clocksource_watchdog_kthread(void *data)
+static int __clocksource_watchdog_kthread(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
LIST_HEAD(unstable);
+ int select = 0;
- mutex_lock(&clocksource_mutex);
spin_lock_irqsave(&watchdog_lock, flags);
- list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
+ list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
list_del_init(&cs->wd_list);
list_add(&cs->wd_list, &unstable);
+ select = 1;
}
+ if (cs->flags & CLOCK_SOURCE_RESELECT) {
+ cs->flags &= ~CLOCK_SOURCE_RESELECT;
+ select = 1;
+ }
+ }
/* Check if the watchdog timer needs to be stopped. */
clocksource_stop_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)
list_del_init(&cs->wd_list);
__clocksource_change_rating(cs, 0);
}
+ return select;
+}
+
+static int clocksource_watchdog_kthread(void *data)
+{
+ mutex_lock(&clocksource_mutex);
+ if (__clocksource_watchdog_kthread())
+ clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
+static bool clocksource_is_watchdog(struct clocksource *cs)
+{
+ return cs == watchdog;
+}
+
#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -449,7 +477,9 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
-static inline int clocksource_watchdog_kthread(void *data) { return 0; }
+static inline int __clocksource_watchdog_kthread(void) { return 0; }
+static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
+void clocksource_mark_unstable(struct clocksource *cs) { }
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
@@ -500,7 +530,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
{
u64 ret;
/*
- * We won't try to correct for more then 11% adjustments (110,000 ppm),
+ * We won't try to correct for more than 11% adjustments (110,000 ppm),
*/
ret = (u64)cs->mult * 11;
do_div(ret,100);
@@ -508,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
}
/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
- *
+ * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
+ * @mult: cycle to nanosecond multiplier
+ * @shift: cycle to nanosecond divisor (power of two)
+ * @maxadj: maximum adjustment value to mult (~11%)
+ * @mask: bitmask for two's complement subtraction of non 64 bit counters
*/
-static u64 clocksource_max_deferment(struct clocksource *cs)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
* cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
+ * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
* which is equivalent to the below.
- * max_cycles < (2^63)/(cs->mult + cs->maxadj)
- * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
- * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
- * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
+ * max_cycles < (2^63)/(mult + maxadj)
+ * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
+ * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
+ * max_cycles < 2^(63 - log2(mult + maxadj))
+ * max_cycles < 1 << (63 - log2(mult + maxadj))
* Please note that we add 1 to the result of the log2 to account for
* any rounding errors, ensure the above inequality is satisfied and
* no overflow will occur.
*/
- max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
+ max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
/*
* The actual maximum number of cycles we can defer the clocksource is
- * determined by the minimum of max_cycles and cs->mask.
+ * determined by the minimum of max_cycles and mask.
* Note: Here we subtract the maxadj to make sure we don't sleep for
* too long if there's a large negative adjustment.
*/
- max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
- max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
- cs->shift);
+ max_cycles = min(max_cycles, mask);
+ max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+
+ return max_nsecs;
+}
+
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs: Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+ u64 max_nsecs;
+ max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
+ cs->mask);
/*
* To ensure that the clocksource does not wrap whilst we are idle,
* limit the time the clocksource can be deferred by 12.5%. Please
@@ -553,24 +598,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
-/**
- * clocksource_select - Select the best clocksource available
- *
- * Private function. Must hold clocksource_mutex when called.
- *
- * Select the clocksource with the best rating, or the clocksource,
- * which is selected by userspace override.
- */
-static void clocksource_select(void)
+static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
- struct clocksource *best, *cs;
+ struct clocksource *cs;
if (!finished_booting || list_empty(&clocksource_list))
+ return NULL;
+
+ /*
+ * We pick the clocksource with the highest rating. If oneshot
+ * mode is active, we pick the highres valid clocksource with
+ * the best rating.
+ */
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (skipcur && cs == curr_clocksource)
+ continue;
+ if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+ continue;
+ return cs;
+ }
+ return NULL;
+}
+
+static void __clocksource_select(bool skipcur)
+{
+ bool oneshot = tick_oneshot_mode_active();
+ struct clocksource *best, *cs;
+
+ /* Find the best suitable clocksource */
+ best = clocksource_find_best(oneshot, skipcur);
+ if (!best)
return;
- /* First clocksource on the list has the best rating. */
- best = list_first_entry(&clocksource_list, struct clocksource, list);
+
/* Check for the override clocksource. */
list_for_each_entry(cs, &clocksource_list, list) {
+ if (skipcur && cs == curr_clocksource)
+ continue;
if (strcmp(cs->name, override_name) != 0)
continue;
/*
@@ -578,8 +641,7 @@ static void clocksource_select(void)
* capable clocksource if the tick code is in oneshot
* mode (highres or nohz)
*/
- if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
- tick_oneshot_mode_active()) {
+ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
/* Override clocksource cannot be used. */
printk(KERN_WARNING "Override clocksource %s is not "
"HRT compatible. Cannot switch while in "
@@ -590,16 +652,35 @@ static void clocksource_select(void)
best = cs;
break;
}
- if (curr_clocksource != best) {
- printk(KERN_INFO "Switching to clocksource %s\n", best->name);
+
+ if (curr_clocksource != best && !timekeeping_notify(best)) {
+ pr_info("Switched to clocksource %s\n", best->name);
curr_clocksource = best;
- timekeeping_notify(curr_clocksource);
}
}
+/**
+ * clocksource_select - Select the best clocksource available
+ *
+ * Private function. Must hold clocksource_mutex when called.
+ *
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
+ */
+static void clocksource_select(void)
+{
+ return __clocksource_select(false);
+}
+
+static void clocksource_select_fallback(void)
+{
+ return __clocksource_select(true);
+}
+
#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
static inline void clocksource_select(void) { }
+static inline void clocksource_select_fallback(void) { }
#endif
@@ -614,16 +695,11 @@ static int __init clocksource_done_booting(void)
{
mutex_lock(&clocksource_mutex);
curr_clocksource = clocksource_default_clock();
- mutex_unlock(&clocksource_mutex);
-
finished_booting = 1;
-
/*
* Run the watchdog first to eliminate unstable clock sources
*/
- clocksource_watchdog_kthread(NULL);
-
- mutex_lock(&clocksource_mutex);
+ __clocksource_watchdog_kthread();
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
@@ -756,7 +832,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
list_del(&cs->list);
cs->rating = rating;
clocksource_enqueue(cs);
- clocksource_select();
}
/**
@@ -768,21 +843,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
{
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
+ clocksource_select();
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
+/*
+ * Unbind clocksource @cs. Called with clocksource_mutex held
+ */
+static int clocksource_unbind(struct clocksource *cs)
+{
+ /*
+ * I really can't convince myself to support this on hardware
+ * designed by lobotomized monkeys.
+ */
+ if (clocksource_is_watchdog(cs))
+ return -EBUSY;
+
+ if (cs == curr_clocksource) {
+ /* Select and try to install a replacement clock source */
+ clocksource_select_fallback();
+ if (curr_clocksource == cs)
+ return -EBUSY;
+ }
+ clocksource_dequeue_watchdog(cs);
+ list_del_init(&cs->list);
+ return 0;
+}
+
/**
* clocksource_unregister - remove a registered clocksource
* @cs: clocksource to be unregistered
*/
-void clocksource_unregister(struct clocksource *cs)
+int clocksource_unregister(struct clocksource *cs)
{
+ int ret = 0;
+
mutex_lock(&clocksource_mutex);
- clocksource_dequeue_watchdog(cs);
- list_del(&cs->list);
- clocksource_select();
+ if (!list_empty(&cs->list))
+ ret = clocksource_unbind(cs);
mutex_unlock(&clocksource_mutex);
+ return ret;
}
EXPORT_SYMBOL(clocksource_unregister);
@@ -808,6 +909,23 @@ sysfs_show_current_clocksources(struct device *dev,
return count;
}
+ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
+{
+ size_t ret = cnt;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (!cnt || cnt >= CS_NAME_LEN)
+ return -EINVAL;
+
+ /* strip of \n: */
+ if (buf[cnt-1] == '\n')
+ cnt--;
+ if (cnt > 0)
+ memcpy(dst, buf, cnt);
+ dst[cnt] = 0;
+ return ret;
+}
+
/**
* sysfs_override_clocksource - interface for manually overriding clocksource
* @dev: unused
@@ -822,22 +940,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- size_t ret = count;
-
- /* strings from sysfs write are not 0 terminated! */
- if (count >= sizeof(override_name))
- return -EINVAL;
-
- /* strip of \n: */
- if (buf[count-1] == '\n')
- count--;
+ ssize_t ret;
mutex_lock(&clocksource_mutex);
- if (count > 0)
- memcpy(override_name, buf, count);
- override_name[count] = 0;
- clocksource_select();
+ ret = sysfs_get_uname(buf, override_name, count);
+ if (ret >= 0)
+ clocksource_select();
mutex_unlock(&clocksource_mutex);
@@ -845,6 +954,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
}
/**
+ * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
+ * @dev: unused
+ * @attr: unused
+ * @buf: unused
+ * @count: length of buffer
+ *
+ * Takes input from sysfs interface for manually unbinding a clocksource.
+ */
+static ssize_t sysfs_unbind_clocksource(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct clocksource *cs;
+ char name[CS_NAME_LEN];
+ ssize_t ret;
+
+ ret = sysfs_get_uname(buf, name, count);
+ if (ret < 0)
+ return ret;
+
+ ret = -ENODEV;
+ mutex_lock(&clocksource_mutex);
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (strcmp(cs->name, name))
+ continue;
+ ret = clocksource_unbind(cs);
+ break;
+ }
+ mutex_unlock(&clocksource_mutex);
+
+ return ret ? ret : count;
+}
+
+/**
* sysfs_show_available_clocksources - sysfs interface for listing clocksource
* @dev: unused
* @attr: unused
@@ -886,6 +1029,8 @@ sysfs_show_available_clocksources(struct device *dev,
static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
sysfs_override_clocksource);
+static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
+
static DEVICE_ATTR(available_clocksource, 0444,
sysfs_show_available_clocksources, NULL);
@@ -910,6 +1055,9 @@ static int __init init_clocksource_sysfs(void)
&device_clocksource,
&dev_attr_current_clocksource);
if (!error)
+ error = device_create_file(&device_clocksource,
+ &dev_attr_unbind_clocksource);
+ if (!error)
error = device_create_file(
&device_clocksource,
&dev_attr_available_clocksource);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e040..a6a5bf53e86 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
-#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
* conversion, the .shift value could be zero. However
@@ -51,14 +51,20 @@
* HZ shrinks, so values greater than 8 overflow 32bits when
* HZ=100.
*/
+#if HZ < 34
+#define JIFFIES_SHIFT 6
+#elif HZ < 67
+#define JIFFIES_SHIFT 7
+#else
#define JIFFIES_SHIFT 8
+#endif
static cycle_t jiffies_read(struct clocksource *cs)
{
return (cycle_t) jiffies;
}
-struct clocksource clocksource_jiffies = {
+static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
@@ -67,6 +73,8 @@ struct clocksource clocksource_jiffies = {
.shift = JIFFIES_SHIFT,
};
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
@@ -74,9 +82,9 @@ u64 get_jiffies_64(void)
u64 ret;
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqbegin(&jiffies_lock);
ret = jiffies_64;
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqretry(&jiffies_lock, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
@@ -95,3 +103,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
{
return &clocksource_jiffies;
}
+
+struct clocksource refined_jiffies;
+
+int register_refined_jiffies(long cycles_per_second)
+{
+ u64 nsec_per_tick, shift_hz;
+ long cycles_per_tick;
+
+
+
+ refined_jiffies = clocksource_jiffies;
+ refined_jiffies.name = "refined-jiffies";
+ refined_jiffies.rating++;
+
+ /* Calc cycles per tick */
+ cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
+ /* shift_hz stores hz<<8 for extra accuracy */
+ shift_hz = (u64)cycles_per_second << 8;
+ shift_hz += cycles_per_tick/2;
+ do_div(shift_hz, cycles_per_tick);
+ /* Calculate nsec_per_tick using shift_hz */
+ nsec_per_tick = (u64)NSEC_PER_SEC << 8;
+ nsec_per_tick += (u32)shift_hz/2;
+ do_div(nsec_per_tick, (u32)shift_hz);
+
+ refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
+
+ clocksource_register(&refined_jiffies);
+ return 0;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f6117a4c7cb..33db43a3951 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,24 +15,27 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/rtc.h>
#include "tick-internal.h"
+#include "ntp_internal.h"
/*
* NTP timekeeping variables:
+ *
+ * Note: All of the NTP state is protected by the timekeeping locks.
*/
+
/* USER_HZ period (usecs): */
unsigned long tick_usec = TICK_USEC;
-/* ACTHZ period (nsecs): */
+/* SHIFTED_HZ period (nsecs): */
unsigned long tick_nsec;
-u64 tick_length;
+static u64 tick_length;
static u64 tick_length_base;
-static struct hrtimer leap_timer;
-
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -49,10 +52,7 @@ static struct hrtimer leap_timer;
static int time_state = TIME_OK;
/* clock status bits: */
-int time_status = STA_UNSYNC;
-
-/* TAI offset (secs): */
-static long time_tai;
+static int time_status = STA_UNSYNC;
/* time adjustment (nsecs): */
static s64 time_offset;
@@ -132,8 +132,6 @@ static inline void pps_reset_freq_interval(void)
/**
* pps_clear - Clears the PPS state variables
- *
- * Must be called while holding a write on the xtime_lock
*/
static inline void pps_clear(void)
{
@@ -148,8 +146,6 @@ static inline void pps_clear(void)
/* Decrease pps_valid to indicate that another second has passed since
* the last PPS signal. When it reaches 0, indicate that PPS signal is
* missing.
- *
- * Must be called while holding a write on the xtime_lock
*/
static inline void pps_dec_valid(void)
{
@@ -169,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
static inline int is_error_status(int status)
{
- return (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ return (status & (STA_UNSYNC|STA_CLOCKERR))
/* PPS signal lost when either PPS time or
* PPS frequency synchronization requested
*/
- || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
- && !(time_status & STA_PPSSIGNAL))
+ || ((status & (STA_PPSFREQ|STA_PPSTIME))
+ && !(status & STA_PPSSIGNAL))
/* PPS jitter exceeded when
* PPS time synchronization requested */
- || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+ || ((status & (STA_PPSTIME|STA_PPSJITTER))
== (STA_PPSTIME|STA_PPSJITTER))
/* PPS wander exceeded or calibration error when
* PPS frequency synchronization requested
*/
- || ((time_status & STA_PPSFREQ)
- && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
+ || ((status & STA_PPSFREQ)
+ && (status & (STA_PPSWANDER|STA_PPSERROR)));
}
static inline void pps_fill_timex(struct timex *txc)
@@ -233,6 +229,17 @@ static inline void pps_fill_timex(struct timex *txc)
#endif /* CONFIG_NTP_PPS */
+
+/**
+ * ntp_synced - Returns 1 if the NTP status is not UNSYNC
+ *
+ */
+static inline int ntp_synced(void)
+{
+ return !(time_status & STA_UNSYNC);
+}
+
+
/*
* NTP methods:
*/
@@ -275,7 +282,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
time_status |= STA_MODE;
- return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
+ return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
}
static void ntp_update_offset(long offset)
@@ -330,8 +337,6 @@ static void ntp_update_offset(long offset)
/**
* ntp_clear - Clears the NTP state variables
- *
- * Must be called while holding a write on the xtime_lock
*/
void ntp_clear(void)
{
@@ -349,61 +354,70 @@ void ntp_clear(void)
pps_clear();
}
+
+u64 ntp_tick_length(void)
+{
+ return tick_length;
+}
+
+
/*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second.
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
*/
-static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+int second_overflow(unsigned long secs)
{
- enum hrtimer_restart res = HRTIMER_NORESTART;
-
- write_seqlock(&xtime_lock);
+ s64 delta;
+ int leap = 0;
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
switch (time_state) {
case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
break;
case TIME_INS:
- timekeeping_leap_insert(-1);
- time_state = TIME_OOP;
- printk(KERN_NOTICE
- "Clock: inserting leap second 23:59:60 UTC\n");
- hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
- res = HRTIMER_RESTART;
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if (secs % 86400 == 0) {
+ leap = -1;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ }
break;
case TIME_DEL:
- timekeeping_leap_insert(1);
- time_tai--;
- time_state = TIME_WAIT;
- printk(KERN_NOTICE
- "Clock: deleting leap second 23:59:59 UTC\n");
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if ((secs + 1) % 86400 == 0) {
+ leap = 1;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ }
break;
case TIME_OOP:
- time_tai++;
time_state = TIME_WAIT;
- /* fall through */
+ break;
+
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
break;
}
- write_sequnlock(&xtime_lock);
-
- return res;
-}
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- */
-void second_overflow(void)
-{
- s64 delta;
/* Bump the maxerror field */
time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,30 +437,29 @@ void second_overflow(void)
pps_dec_valid();
if (!time_adjust)
- return;
+ goto out;
if (time_adjust > MAX_TICKADJ) {
time_adjust -= MAX_TICKADJ;
tick_length += MAX_TICKADJ_SCALED;
- return;
+ goto out;
}
if (time_adjust < -MAX_TICKADJ) {
time_adjust += MAX_TICKADJ;
tick_length -= MAX_TICKADJ_SCALED;
- return;
+ goto out;
}
tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
<< NTP_SCALE_SHIFT;
time_adjust = 0;
-}
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
-
-/* Disable the cmos update - used by virtualization and embedded */
-int no_sync_cmos_clock __read_mostly;
+out:
+ return leap;
+}
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -462,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
* called as close as possible to 500 ms before the new second starts.
* This code is run on a timer. If the clock is set, that timer
* may not expire at the correct time. Thus, we adjust...
+ * We want the clock to be within a couple of ticks from the target.
*/
if (!ntp_synced()) {
/*
@@ -472,14 +486,26 @@ static void sync_cmos_clock(struct work_struct *work)
}
getnstimeofday(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
- fail = update_persistent_clock(now);
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
+ struct timespec adjust = now;
+
+ fail = -ENODEV;
+ if (persistent_clock_is_local)
+ adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+ fail = update_persistent_clock(adjust);
+#endif
+#ifdef CONFIG_RTC_SYSTOHC
+ if (fail == -ENODEV)
+ fail = rtc_set_ntp_time(adjust);
+#endif
+ }
next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
if (next.tv_nsec <= 0)
next.tv_nsec += NSEC_PER_SEC;
- if (!fail)
+ if (!fail || fail == -ENODEV)
next.tv_sec = 659;
else
next.tv_sec = 0;
@@ -488,40 +514,19 @@ static void sync_cmos_clock(struct work_struct *work)
next.tv_sec++;
next.tv_nsec -= NSEC_PER_SEC;
}
- schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+ queue_delayed_work(system_power_efficient_wq,
+ &sync_cmos_work, timespec_to_jiffies(&next));
}
-static void notify_cmos_timer(void)
+void ntp_notify_cmos_timer(void)
{
- if (!no_sync_cmos_clock)
- schedule_delayed_work(&sync_cmos_work, 0);
+ queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
}
#else
-static inline void notify_cmos_timer(void) { }
+void ntp_notify_cmos_timer(void) { }
#endif
-/*
- * Start the leap seconds timer:
- */
-static inline void ntp_start_leap_timer(struct timespec *ts)
-{
- long now = ts->tv_sec;
-
- if (time_status & STA_INS) {
- time_state = TIME_INS;
- now += 86400 - now % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-
- return;
- }
-
- if (time_status & STA_DEL) {
- time_state = TIME_DEL;
- now += 86400 - (now + 1) % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
- }
-}
/*
* Propagate a new txc->status value into the NTP state:
@@ -545,29 +550,12 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
/* only set allowed bits */
time_status &= STA_RONLY;
time_status |= txc->status & ~STA_RONLY;
-
- switch (time_state) {
- case TIME_OK:
- ntp_start_leap_timer(ts);
- break;
- case TIME_INS:
- case TIME_DEL:
- time_state = TIME_OK;
- ntp_start_leap_timer(ts);
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- break;
- case TIME_OOP:
- hrtimer_restart(&leap_timer);
- break;
- }
}
-/*
- * Called with the xtime lock held, so we can access and modify
- * all the global NTP state:
- */
-static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
+
+
+static inline void process_adjtimex_modes(struct timex *txc,
+ struct timespec *ts,
+ s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
process_adj_status(txc, ts);
@@ -601,7 +589,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
}
if (txc->modes & ADJ_TAI && txc->constant > 0)
- time_tai = txc->constant;
+ *time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
ntp_update_offset(txc->offset);
@@ -613,16 +601,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
ntp_update_frequency();
}
-/*
- * adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
+
+
+/**
+ * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-int do_adjtimex(struct timex *txc)
+int ntp_validate_timex(struct timex *txc)
{
- struct timespec ts;
- int result;
-
- /* Validate the data before disabling interrupts */
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
@@ -634,7 +619,6 @@ int do_adjtimex(struct timex *txc)
/* In order to modify anything, you gotta be super-user! */
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
-
/*
* if the quartz is off by more than 10% then
* something is VERY wrong!
@@ -643,27 +627,22 @@ int do_adjtimex(struct timex *txc)
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
return -EINVAL;
-
- if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
- hrtimer_cancel(&leap_timer);
}
- if (txc->modes & ADJ_SETOFFSET) {
- struct timespec delta;
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
- if (!capable(CAP_SYS_TIME))
- return -EPERM;
- if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- result = timekeeping_inject_offset(&delta);
- if (result)
- return result;
- }
+ if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
+ return -EPERM;
+
+ return 0;
+}
- getnstimeofday(&ts);
- write_seqlock_irq(&xtime_lock);
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
+{
+ int result;
if (txc->modes & ADJ_ADJTIME) {
long save_adjust = time_adjust;
@@ -678,7 +657,7 @@ int do_adjtimex(struct timex *txc)
/* If there are input parameters, then process them: */
if (txc->modes)
- process_adjtimex_modes(txc, &ts);
+ process_adjtimex_modes(txc, ts, time_tai);
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
@@ -700,20 +679,16 @@ int do_adjtimex(struct timex *txc)
txc->precision = 1;
txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
txc->tick = tick_usec;
- txc->tai = time_tai;
+ txc->tai = *time_tai;
/* fill PPS status fields */
pps_fill_timex(txc);
- write_sequnlock_irq(&xtime_lock);
-
- txc->time.tv_sec = ts.tv_sec;
- txc->time.tv_usec = ts.tv_nsec;
+ txc->time.tv_sec = ts->tv_sec;
+ txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
- notify_cmos_timer();
-
return result;
}
@@ -811,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
time_status |= STA_PPSERROR;
pps_errcnt++;
pps_dec_freq_interval();
- pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
- freq_norm.sec);
+ printk_deferred(KERN_ERR
+ "hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
return 0;
}
@@ -825,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
pps_freq = ftemp;
if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
- pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSWANDER: change=%ld\n", delta);
time_status |= STA_PPSWANDER;
pps_stbcnt++;
pps_dec_freq_interval();
@@ -869,8 +846,9 @@ static void hardpps_update_phase(long error)
* the time offset is updated.
*/
if (jitter > (pps_jitter << PPS_POPCORN)) {
- pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
- jitter, (pps_jitter << PPS_POPCORN));
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (pps_jitter << PPS_POPCORN));
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
@@ -885,7 +863,7 @@ static void hardpps_update_phase(long error)
}
/*
- * hardpps() - discipline CPU clock oscillator to external PPS signal
+ * __hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS signal arrival in order to
* discipline the CPU clock oscillator to the PPS signal. It takes two
@@ -896,15 +874,12 @@ static void hardpps_update_phase(long error)
* This code is based on David Mills's reference nanokernel
* implementation. It was mostly rewritten but keeps the same idea.
*/
-void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
{
struct pps_normtime pts_norm, freq_norm;
- unsigned long flags;
pts_norm = pps_normalize_ts(*phase_ts);
- write_seqlock_irqsave(&xtime_lock, flags);
-
/* clear the error bits, they will be set again if needed */
time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -916,7 +891,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
* just start the frequency interval */
if (unlikely(pps_fbase.tv_sec == 0)) {
pps_fbase = *raw_ts;
- write_sequnlock_irqrestore(&xtime_lock, flags);
return;
}
@@ -931,8 +905,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
time_status |= STA_PPSJITTER;
/* restart the frequency calibration interval */
pps_fbase = *raw_ts;
- write_sequnlock_irqrestore(&xtime_lock, flags);
- pr_err("hardpps: PPSJITTER: bad pulse\n");
+ printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
return;
}
@@ -948,15 +921,15 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
hardpps_update_phase(pts_norm.nsec);
- write_sequnlock_irqrestore(&xtime_lock, flags);
}
-EXPORT_SYMBOL(hardpps);
-
#endif /* CONFIG_NTP_PPS */
static int __init ntp_tick_adj_setup(char *str)
{
- ntp_tick_adj = simple_strtol(str, NULL, 0);
+ int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
+
+ if (rc)
+ return rc;
ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
@@ -967,6 +940,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
ntp_clear();
- hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- leap_timer.function = ntp_leap_second;
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 00000000000..1950cb4ca2a
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_NTP_INTERNAL_H
+#define _LINUX_NTP_INTERNAL_H
+
+extern void ntp_init(void);
+extern void ntp_clear(void);
+/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
+extern u64 ntp_tick_length(void);
+extern int second_overflow(unsigned long secs);
+extern int ntp_validate_timex(struct timex *);
+extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
+extern void __hardpps(const struct timespec *, const struct timespec *);
+#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
new file mode 100644
index 00000000000..01d2d15aa66
--- /dev/null
+++ b/kernel/time/sched_clock.c
@@ -0,0 +1,217 @@
+/*
+ * sched_clock.c: support for extending counters to full 64-bit ns counter
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/ktime.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/syscore_ops.h>
+#include <linux/hrtimer.h>
+#include <linux/sched_clock.h>
+#include <linux/seqlock.h>
+#include <linux/bitops.h>
+
+struct clock_data {
+ ktime_t wrap_kt;
+ u64 epoch_ns;
+ u64 epoch_cyc;
+ seqcount_t seq;
+ unsigned long rate;
+ u32 mult;
+ u32 shift;
+ bool suspended;
+};
+
+static struct hrtimer sched_clock_timer;
+static int irqtime = -1;
+
+core_param(irqtime, irqtime, int, 0400);
+
+static struct clock_data cd = {
+ .mult = NSEC_PER_SEC / HZ,
+};
+
+static u64 __read_mostly sched_clock_mask;
+
+static u64 notrace jiffy_sched_clock_read(void)
+{
+ /*
+ * We don't need to use get_jiffies_64 on 32-bit arches here
+ * because we register with BITS_PER_LONG
+ */
+ return (u64)(jiffies - INITIAL_JIFFIES);
+}
+
+static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+
+static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+{
+ return (cyc * mult) >> shift;
+}
+
+unsigned long long notrace sched_clock(void)
+{
+ u64 epoch_ns;
+ u64 epoch_cyc;
+ u64 cyc;
+ unsigned long seq;
+
+ if (cd.suspended)
+ return cd.epoch_ns;
+
+ do {
+ seq = raw_read_seqcount_begin(&cd.seq);
+ epoch_cyc = cd.epoch_cyc;
+ epoch_ns = cd.epoch_ns;
+ } while (read_seqcount_retry(&cd.seq, seq));
+
+ cyc = read_sched_clock();
+ cyc = (cyc - epoch_cyc) & sched_clock_mask;
+ return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+}
+
+/*
+ * Atomically update the sched_clock epoch.
+ */
+static void notrace update_sched_clock(void)
+{
+ unsigned long flags;
+ u64 cyc;
+ u64 ns;
+
+ cyc = read_sched_clock();
+ ns = cd.epoch_ns +
+ cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
+ cd.mult, cd.shift);
+
+ raw_local_irq_save(flags);
+ raw_write_seqcount_begin(&cd.seq);
+ cd.epoch_ns = ns;
+ cd.epoch_cyc = cyc;
+ raw_write_seqcount_end(&cd.seq);
+ raw_local_irq_restore(flags);
+}
+
+static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
+{
+ update_sched_clock();
+ hrtimer_forward_now(hrt, cd.wrap_kt);
+ return HRTIMER_RESTART;
+}
+
+void __init sched_clock_register(u64 (*read)(void), int bits,
+ unsigned long rate)
+{
+ u64 res, wrap, new_mask, new_epoch, cyc, ns;
+ u32 new_mult, new_shift;
+ ktime_t new_wrap_kt;
+ unsigned long r;
+ char r_unit;
+
+ if (cd.rate > rate)
+ return;
+
+ WARN_ON(!irqs_disabled());
+
+ /* calculate the mult/shift to convert counter ticks to ns. */
+ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+
+ new_mask = CLOCKSOURCE_MASK(bits);
+
+ /* calculate how many ns until we wrap */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
+ new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+ /* update epoch for new counter and update epoch_ns from old counter*/
+ new_epoch = read();
+ cyc = read_sched_clock();
+ ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
+ cd.mult, cd.shift);
+
+ raw_write_seqcount_begin(&cd.seq);
+ read_sched_clock = read;
+ sched_clock_mask = new_mask;
+ cd.rate = rate;
+ cd.wrap_kt = new_wrap_kt;
+ cd.mult = new_mult;
+ cd.shift = new_shift;
+ cd.epoch_cyc = new_epoch;
+ cd.epoch_ns = ns;
+ raw_write_seqcount_end(&cd.seq);
+
+ r = rate;
+ if (r >= 4000000) {
+ r /= 1000000;
+ r_unit = 'M';
+ } else if (r >= 1000) {
+ r /= 1000;
+ r_unit = 'k';
+ } else
+ r_unit = ' ';
+
+ /* calculate the ns resolution of this counter */
+ res = cyc_to_ns(1ULL, new_mult, new_shift);
+
+ pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
+ bits, r, r_unit, res, wrap);
+
+ /* Enable IRQ time accounting if we have a fast enough sched_clock */
+ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
+ enable_sched_clock_irqtime();
+
+ pr_debug("Registered %pF as sched_clock source\n", read);
+}
+
+void __init sched_clock_postinit(void)
+{
+ /*
+ * If no sched_clock function has been provided at that point,
+ * make it the final one one.
+ */
+ if (read_sched_clock == jiffy_sched_clock_read)
+ sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
+
+ update_sched_clock();
+
+ /*
+ * Start the timer to keep sched_clock() properly updated and
+ * sets the initial epoch.
+ */
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ sched_clock_timer.function = sched_clock_poll;
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+}
+
+static int sched_clock_suspend(void)
+{
+ update_sched_clock();
+ hrtimer_cancel(&sched_clock_timer);
+ cd.suspended = true;
+ return 0;
+}
+
+static void sched_clock_resume(void)
+{
+ cd.epoch_cyc = read_sched_clock();
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ cd.suspended = false;
+}
+
+static struct syscore_ops sched_clock_ops = {
+ .suspend = sched_clock_suspend,
+ .resume = sched_clock_resume,
+};
+
+static int __init sched_clock_syscore_init(void)
+{
+ register_syscore_ops(&sched_clock_ops);
+ return 0;
+}
+device_initcall(sched_clock_syscore_init);
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 00000000000..eb682d5c697
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,106 @@
+/*
+ * linux/kernel/time/tick-broadcast-hrtimer.c
+ * This file emulates a local clock event device
+ * via a pseudo clock device.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/clockchips.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *bc)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+ /*
+ * We try to cancel the timer first. If the callback is on
+ * flight on some other cpu then we let it handle it. If we
+ * were able to cancel the timer nothing can rearm it as we
+ * own broadcast_lock.
+ *
+ * However we can also be called from the event handler of
+ * ce_broadcast_hrtimer itself when it expires. We cannot
+ * restart the timer because we are in the callback, but we
+ * can set the expiry time and let the callback return
+ * HRTIMER_RESTART.
+ */
+ if (hrtimer_try_to_cancel(&bctimer) >= 0) {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+ /* Bind the "device" to the cpu */
+ bc->bound_on = smp_processor_id();
+ } else if (bc->bound_on == smp_processor_id()) {
+ hrtimer_set_expires(&bctimer, expires);
+ }
+ return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+ .set_mode = bc_set_mode,
+ .set_next_ktime = bc_set_next,
+ .features = CLOCK_EVT_FEAT_ONESHOT |
+ CLOCK_EVT_FEAT_KTIME |
+ CLOCK_EVT_FEAT_HRTIMER,
+ .rating = 0,
+ .bound_on = -1,
+ .min_delta_ns = 1,
+ .max_delta_ns = KTIME_MAX,
+ .min_delta_ticks = 1,
+ .max_delta_ticks = ULONG_MAX,
+ .mult = 1,
+ .shift = 0,
+ .cpumask = cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+ ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+
+ if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+ return HRTIMER_NORESTART;
+
+ return HRTIMER_RESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ bctimer.function = bc_handler;
+ clockevents_register_device(&ce_broadcast_hrtimer);
+}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index fd4a7b1625a..64c5990fd50 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,8 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
#include "tick-internal.h"
@@ -27,9 +29,9 @@
*/
static struct tick_device tick_broadcast_device;
-/* FIXME: Use cpumask_var_t. */
-static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
-static DECLARE_BITMAP(tmpmask, NR_CPUS);
+static cpumask_var_t tick_broadcast_mask;
+static cpumask_var_t tick_broadcast_on;
+static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
static int tick_broadcast_force;
@@ -49,7 +51,7 @@ struct tick_device *tick_get_broadcast_device(void)
struct cpumask *tick_get_broadcast_mask(void)
{
- return to_cpumask(tick_broadcast_mask);
+ return tick_broadcast_mask;
}
/*
@@ -64,18 +66,50 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
/*
* Check, if the device can be utilized as broadcast device:
*/
-int tick_check_broadcast_device(struct clock_event_device *dev)
+static bool tick_check_broadcast_device(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
{
- if ((tick_broadcast_device.evtdev &&
- tick_broadcast_device.evtdev->rating >= dev->rating) ||
- (dev->features & CLOCK_EVT_FEAT_C3STOP))
- return 0;
+ if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
+ (newdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return false;
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
- clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+ return !curdev || newdev->rating > curdev->rating;
+}
+
+/*
+ * Conditionally install/replace broadcast device
+ */
+void tick_install_broadcast_device(struct clock_event_device *dev)
+{
+ struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
+ if (!tick_check_broadcast_device(cur, dev))
+ return;
+
+ if (!try_module_get(dev->owner))
+ return;
+
+ clockevents_exchange_device(cur, dev);
+ if (cur)
+ cur->event_handler = clockevents_handle_noop;
tick_broadcast_device.evtdev = dev;
- if (!cpumask_empty(tick_get_broadcast_mask()))
+ if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(dev);
- return 1;
+ /*
+ * Inform all cpus about this. We might be in a situation
+ * where we did not switch to oneshot mode because the per cpu
+ * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
+ * of a oneshot capable broadcast device. Without that
+ * notification the systems stays stuck in periodic mode
+ * forever.
+ */
+ if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_clock_notify();
}
/*
@@ -86,14 +120,44 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
return (dev && tick_broadcast_device.evtdev == dev);
}
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ int ret = -ENODEV;
+
+ if (tick_is_broadcast_device(dev)) {
+ raw_spin_lock(&tick_broadcast_lock);
+ ret = __clockevents_update_freq(dev, freq);
+ raw_spin_unlock(&tick_broadcast_lock);
+ }
+ return ret;
+}
+
+
+static void err_broadcast(const struct cpumask *mask)
+{
+ pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
+}
+
+static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
+{
+ if (!dev->broadcast)
+ dev->broadcast = tick_broadcast;
+ if (!dev->broadcast) {
+ pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
+ dev->name);
+ dev->broadcast = err_broadcast;
+ }
+}
+
/*
* Check, if the device is disfunctional and a place holder, which
* needs to be handled by the broadcast device.
*/
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
unsigned long flags;
- int ret = 0;
+ int ret;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -105,26 +169,87 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
*/
if (!tick_device_is_functional(dev)) {
dev->event_handler = tick_handle_periodic;
- cpumask_set_cpu(cpu, tick_get_broadcast_mask());
- tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
+ tick_device_setup_broadcast_func(dev);
+ cpumask_set_cpu(cpu, tick_broadcast_mask);
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
ret = 1;
} else {
/*
- * When the new device is not affected by the stop
- * feature and the cpu is marked in the broadcast mask
- * then clear the broadcast bit.
+ * Clear the broadcast bit for this cpu if the
+ * device is not power state affected.
*/
- if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
- int cpu = smp_processor_id();
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ else
+ tick_device_setup_broadcast_func(dev);
+
+ /*
+ * Clear the broadcast bit if the CPU is not in
+ * periodic broadcast on state.
+ */
+ if (!cpumask_test_cpu(cpu, tick_broadcast_on))
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
- cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ switch (tick_broadcast_device.mode) {
+ case TICKDEV_MODE_ONESHOT:
+ /*
+ * If the system is in oneshot mode we can
+ * unconditionally clear the oneshot mask bit,
+ * because the CPU is running and therefore
+ * not in an idle state which causes the power
+ * state affected device to stop. Let the
+ * caller initialize the device.
+ */
tick_broadcast_clear_oneshot(cpu);
+ ret = 0;
+ break;
+
+ case TICKDEV_MODE_PERIODIC:
+ /*
+ * If the system is in periodic mode, check
+ * whether the broadcast device can be
+ * switched off now.
+ */
+ if (cpumask_empty(tick_broadcast_mask) && bc)
+ clockevents_shutdown(bc);
+ /*
+ * If we kept the cpu in the broadcast mask,
+ * tell the caller to leave the per cpu device
+ * in shutdown state. The periodic interrupt
+ * is delivered by the broadcast device.
+ */
+ ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+ break;
+ default:
+ /* Nothing to do */
+ ret = 0;
+ break;
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
return ret;
}
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+int tick_receive_broadcast(void)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ struct clock_event_device *evt = td->evtdev;
+
+ if (!evt)
+ return -ENODEV;
+
+ if (!evt->event_handler)
+ return -EINVAL;
+
+ evt->event_handler(evt);
+ return 0;
+}
+#endif
+
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
*/
@@ -160,13 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask)
*/
static void tick_do_periodic_broadcast(void)
{
- raw_spin_lock(&tick_broadcast_lock);
-
- cpumask_and(to_cpumask(tmpmask),
- cpu_online_mask, tick_get_broadcast_mask());
- tick_do_broadcast(to_cpumask(tmpmask));
-
- raw_spin_unlock(&tick_broadcast_lock);
+ cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
+ tick_do_broadcast(tmpmask);
}
/*
@@ -176,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
ktime_t next;
+ raw_spin_lock(&tick_broadcast_lock);
+
tick_do_periodic_broadcast();
/*
* The device is in periodic mode. No reprogramming necessary:
*/
if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- return;
+ goto unlock;
/*
* Setup the next period for devices, which do not have
@@ -195,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
next = ktime_add(next, tick_period);
if (!clockevents_program_event(dev, next, false))
- return;
+ goto unlock;
tick_do_periodic_broadcast();
}
+unlock:
+ raw_spin_unlock(&tick_broadcast_lock);
}
/*
@@ -227,13 +351,13 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
if (!tick_device_is_functional(dev))
goto out;
- bc_stopped = cpumask_empty(tick_get_broadcast_mask());
+ bc_stopped = cpumask_empty(tick_broadcast_mask);
switch (*reason) {
case CLOCK_EVT_NOTIFY_BROADCAST_ON:
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
- cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+ cpumask_set_cpu(cpu, tick_broadcast_on);
+ if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
@@ -242,9 +366,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
tick_broadcast_force = 1;
break;
case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- if (!tick_broadcast_force &&
- cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
- cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ if (tick_broadcast_force)
+ break;
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
+ if (!tick_device_is_functional(dev))
+ break;
+ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
tick_setup_periodic(dev, 0);
@@ -252,7 +379,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
break;
}
- if (cpumask_empty(tick_get_broadcast_mask())) {
+ if (cpumask_empty(tick_broadcast_mask)) {
if (!bc_stopped)
clockevents_shutdown(bc);
} else if (bc_stopped) {
@@ -301,10 +428,11 @@ void tick_shutdown_broadcast(unsigned int *cpup)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
- cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
- if (bc && cpumask_empty(tick_get_broadcast_mask()))
+ if (bc && cpumask_empty(tick_broadcast_mask))
clockevents_shutdown(bc);
}
@@ -340,13 +468,14 @@ int tick_resume_broadcast(void)
switch (tick_broadcast_device.mode) {
case TICKDEV_MODE_PERIODIC:
- if (!cpumask_empty(tick_get_broadcast_mask()))
+ if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(bc);
broadcast = cpumask_test_cpu(smp_processor_id(),
- tick_get_broadcast_mask());
+ tick_broadcast_mask);
break;
case TICKDEV_MODE_ONESHOT:
- broadcast = tick_resume_broadcast_oneshot(bc);
+ if (!cpumask_empty(tick_broadcast_mask))
+ broadcast = tick_resume_broadcast_oneshot(bc);
break;
}
}
@@ -358,22 +487,58 @@ int tick_resume_broadcast(void)
#ifdef CONFIG_TICK_ONESHOT
-/* FIXME: use cpumask_var_t. */
-static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+static cpumask_var_t tick_broadcast_oneshot_mask;
+static cpumask_var_t tick_broadcast_pending_mask;
+static cpumask_var_t tick_broadcast_force_mask;
/*
* Exposed for debugging: see timer_list.c
*/
struct cpumask *tick_get_broadcast_oneshot_mask(void)
{
- return to_cpumask(tick_broadcast_oneshot_mask);
+ return tick_broadcast_oneshot_mask;
+}
+
+/*
+ * Called before going idle with interrupts disabled. Checks whether a
+ * broadcast event from the other core is about to happen. We detected
+ * that in tick_broadcast_oneshot_control(). The callsite can use this
+ * to avoid a deep idle transition as we are about to get the
+ * broadcast IPI right away.
+ */
+int tick_check_broadcast_expired(void)
+{
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
}
-static int tick_broadcast_set_event(ktime_t expires, int force)
+/*
+ * Set broadcast interrupt affinity
+ */
+static void tick_broadcast_set_affinity(struct clock_event_device *bc,
+ const struct cpumask *cpumask)
{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
+ return;
+
+ if (cpumask_equal(bc->cpumask, cpumask))
+ return;
- return clockevents_program_event(bc, expires, force);
+ bc->cpumask = cpumask;
+ irq_set_affinity(bc->irq, bc->cpumask);
+}
+
+static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+ ktime_t expires, int force)
+{
+ int ret;
+
+ if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+
+ ret = clockevents_program_event(bc, expires, force);
+ if (!ret)
+ tick_broadcast_set_affinity(bc, cpumask_of(cpu));
+ return ret;
}
int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -386,12 +551,20 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
* Called from irq_enter() when idle was interrupted to reenable the
* per cpu device.
*/
-void tick_check_oneshot_broadcast(int cpu)
+void tick_check_oneshot_broadcast_this_cpu(void)
{
- if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+ /*
+ * We might be in the middle of switching over from
+ * periodic to oneshot. If the CPU has not yet
+ * switched over, leave the device alone.
+ */
+ if (td->mode == TICKDEV_MODE_ONESHOT) {
+ clockevents_set_mode(td->evtdev,
+ CLOCK_EVT_MODE_ONESHOT);
+ }
}
}
@@ -402,27 +575,52 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
{
struct tick_device *td;
ktime_t now, next_event;
- int cpu;
+ int cpu, next_cpu = 0;
raw_spin_lock(&tick_broadcast_lock);
again:
dev->next_event.tv64 = KTIME_MAX;
next_event.tv64 = KTIME_MAX;
- cpumask_clear(to_cpumask(tmpmask));
+ cpumask_clear(tmpmask);
now = ktime_get();
/* Find all expired events */
- for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
+ for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
td = &per_cpu(tick_cpu_device, cpu);
- if (td->evtdev->next_event.tv64 <= now.tv64)
- cpumask_set_cpu(cpu, to_cpumask(tmpmask));
- else if (td->evtdev->next_event.tv64 < next_event.tv64)
+ if (td->evtdev->next_event.tv64 <= now.tv64) {
+ cpumask_set_cpu(cpu, tmpmask);
+ /*
+ * Mark the remote cpu in the pending mask, so
+ * it can avoid reprogramming the cpu local
+ * timer in tick_broadcast_oneshot_control().
+ */
+ cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
+ } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
next_event.tv64 = td->evtdev->next_event.tv64;
+ next_cpu = cpu;
+ }
}
/*
+ * Remove the current cpu from the pending mask. The event is
+ * delivered immediately in tick_do_broadcast() !
+ */
+ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
+
+ /* Take care of enforced broadcast requests */
+ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
+ cpumask_clear(tick_broadcast_force_mask);
+
+ /*
+ * Sanity check. Catch the case where we try to broadcast to
+ * offline cpus.
+ */
+ if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
+ cpumask_and(tmpmask, tmpmask, cpu_online_mask);
+
+ /*
* Wakeup the cpus which have an expired event.
*/
- tick_do_broadcast(to_cpumask(tmpmask));
+ tick_do_broadcast(tmpmask);
/*
* Two reasons for reprogram:
@@ -439,29 +637,67 @@ again:
* Rearm the broadcast device. If event expired,
* repeat the above
*/
- if (tick_broadcast_set_event(next_event, 0))
+ if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
goto again;
}
raw_spin_unlock(&tick_broadcast_lock);
}
+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+ if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return 0;
+ if (bc->next_event.tv64 == KTIME_MAX)
+ return 0;
+ return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+ struct clock_event_device *dev)
+{
+ /*
+ * For hrtimer based broadcasting we cannot shutdown the cpu
+ * local device if our own event is the first one to expire or
+ * if we own the broadcast timer.
+ */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+ if (broadcast_needs_cpu(bc, smp_processor_id()))
+ return;
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ return;
+ }
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+}
+
+static void broadcast_move_bc(int deadcpu)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ if (!bc || !broadcast_needs_cpu(bc, deadcpu))
+ return;
+ /* This moves the broadcast assignment to this cpu */
+ clockevents_program_event(bc, bc->next_event, 1);
+}
+
/*
* Powerstate information: The system enters/leaves a state, where
* affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
*/
-void tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(unsigned long reason)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
unsigned long flags;
- int cpu;
+ ktime_t now;
+ int cpu, ret = 0;
/*
* Periodic mode does not care about the enter/exit of power
* states
*/
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- return;
+ return 0;
/*
* We are called with preemtion disabled from the depth of the
@@ -472,28 +708,105 @@ void tick_broadcast_oneshot_control(unsigned long reason)
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return;
+ return 0;
bc = tick_broadcast_device.evtdev;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
- if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
- cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
- if (dev->next_event.tv64 < bc->next_event.tv64)
- tick_broadcast_set_event(dev->next_event, 1);
+ if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+ broadcast_shutdown_local(bc, dev);
+ /*
+ * We only reprogram the broadcast timer if we
+ * did not mark ourself in the force mask and
+ * if the cpu local event is earlier than the
+ * broadcast event. If the current CPU is in
+ * the force mask, then we are going to be
+ * woken by the IPI right away.
+ */
+ if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
+ dev->next_event.tv64 < bc->next_event.tv64)
+ tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
}
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle and we remove the
+ * CPU from the broadcast mask. We don't have to go
+ * through the EXIT path as the local timer is not
+ * shutdown.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret)
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
- if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
- cpumask_clear_cpu(cpu,
- tick_get_broadcast_oneshot_mask());
+ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
- if (dev->next_event.tv64 != KTIME_MAX)
- tick_program_event(dev->next_event, 1);
+ /*
+ * The cpu which was handling the broadcast
+ * timer marked this cpu in the broadcast
+ * pending mask and fired the broadcast
+ * IPI. So we are going to handle the expired
+ * event anyway via the broadcast IPI
+ * handler. No need to reprogram the timer
+ * with an already expired event.
+ */
+ if (cpumask_test_and_clear_cpu(cpu,
+ tick_broadcast_pending_mask))
+ goto out;
+
+ /*
+ * Bail out if there is no next event.
+ */
+ if (dev->next_event.tv64 == KTIME_MAX)
+ goto out;
+ /*
+ * If the pending bit is not set, then we are
+ * either the CPU handling the broadcast
+ * interrupt or we got woken by something else.
+ *
+ * We are not longer in the broadcast mask, so
+ * if the cpu local expiry time is already
+ * reached, we would reprogram the cpu local
+ * timer with an already expired event.
+ *
+ * This can lead to a ping-pong when we return
+ * to idle and therefor rearm the broadcast
+ * timer before the cpu local timer was able
+ * to fire. This happens because the forced
+ * reprogramming makes sure that the event
+ * will happen in the future and depending on
+ * the min_delta setting this might be far
+ * enough out that the ping-pong starts.
+ *
+ * If the cpu local next_event has expired
+ * then we know that the broadcast timer
+ * next_event has expired as well and
+ * broadcast is about to be handled. So we
+ * avoid reprogramming and enforce that the
+ * broadcast handler, which did not run yet,
+ * will invoke the cpu local handler.
+ *
+ * We cannot call the handler directly from
+ * here, because we might be in a NOHZ phase
+ * and we did not go through the irq_enter()
+ * nohz fixups.
+ */
+ now = ktime_get();
+ if (dev->next_event.tv64 <= now.tv64) {
+ cpumask_set_cpu(cpu, tick_broadcast_force_mask);
+ goto out;
+ }
+ /*
+ * We got woken by something else. Reprogram
+ * the cpu local timer device.
+ */
+ tick_program_event(dev->next_event, 1);
}
}
+out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ return ret;
}
/*
@@ -503,7 +816,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
*/
static void tick_broadcast_clear_oneshot(int cpu)
{
- cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
}
static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -531,10 +845,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
bc->event_handler = tick_handle_oneshot_broadcast;
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
-
- /* Take the do_timer update */
- tick_do_timer_cpu = cpu;
/*
* We must be careful here. There might be other CPUs
@@ -542,16 +852,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
* oneshot_mask bits for those and program the
* broadcast device to fire.
*/
- cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
- cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
- cpumask_or(tick_get_broadcast_oneshot_mask(),
- tick_get_broadcast_oneshot_mask(),
- to_cpumask(tmpmask));
-
- if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
- tick_broadcast_init_next_event(to_cpumask(tmpmask),
+ cpumask_copy(tmpmask, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tmpmask);
+ cpumask_or(tick_broadcast_oneshot_mask,
+ tick_broadcast_oneshot_mask, tmpmask);
+
+ if (was_periodic && !cpumask_empty(tmpmask)) {
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ tick_broadcast_init_next_event(tmpmask,
tick_next_period);
- tick_broadcast_set_event(tick_next_period, 1);
+ tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
} else
bc->next_event.tv64 = KTIME_MAX;
} else {
@@ -580,6 +890,7 @@ void tick_broadcast_switch_to_oneshot(void)
bc = tick_broadcast_device.evtdev;
if (bc)
tick_broadcast_setup_oneshot(bc);
+
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -595,10 +906,14 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
/*
- * Clear the broadcast mask flag for the dead cpu, but do not
- * stop the broadcast device!
+ * Clear the broadcast masks for the dead cpu, but do not stop
+ * the broadcast device!
*/
- cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
+
+ broadcast_move_bc(cpu);
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -622,3 +937,15 @@ bool tick_broadcast_oneshot_available(void)
}
#endif
+
+void __init tick_broadcast_init(void)
+{
+ zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
+ zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
+#ifdef CONFIG_TICK_ONESHOT
+ zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
+#endif
+}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e..0a0608edeb2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,6 +18,7 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/module.h>
#include <asm/irq_regs.h>
@@ -32,8 +33,22 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
+
+/*
+ * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
+ * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
+ * variable has two functions:
+ *
+ * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
+ * timekeeping lock all at once. Only the CPU which is assigned to do the
+ * update is handling it.
+ *
+ * 2) Hand off the duty in the NOHZ idle case by setting the value to
+ * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
+ * at it will take over and keep the time keeping alive. The handover
+ * procedure also covers cpu hotplug.
+ */
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
-static DEFINE_RAW_SPINLOCK(tick_device_lock);
/*
* Debugging: see timer_list.c
@@ -63,13 +78,14 @@ int tick_is_oneshot_available(void)
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
- write_seqlock(&xtime_lock);
+ write_seqlock(&jiffies_lock);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
- write_sequnlock(&xtime_lock);
+ write_sequnlock(&jiffies_lock);
+ update_wall_time();
}
update_process_times(user_mode(get_irq_regs()));
@@ -82,18 +98,19 @@ static void tick_periodic(int cpu)
void tick_handle_periodic(struct clock_event_device *dev)
{
int cpu = smp_processor_id();
- ktime_t next;
+ ktime_t next = dev->next_event;
tick_periodic(cpu);
if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
return;
- /*
- * Setup the next period for devices, which do not have
- * periodic mode:
- */
- next = ktime_add(dev->next_event, tick_period);
for (;;) {
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode:
+ */
+ next = ktime_add(next, tick_period);
+
if (!clockevents_program_event(dev, next, false))
return;
/*
@@ -102,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev)
* to be sure we're using a real hardware clocksource.
* Otherwise we could get trapped in an infinite
* loop, as the tick_periodic() increments jiffies,
- * when then will increment time, posibly causing
+ * which then will increment time, possibly causing
* the loop to trigger again and again.
*/
if (timekeeping_valid_for_hres())
tick_periodic(cpu);
- next = ktime_add(next, tick_period);
}
}
@@ -130,9 +146,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
ktime_t next;
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqbegin(&jiffies_lock);
next = tick_next_period;
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqretry(&jiffies_lock, seq));
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -163,7 +179,10 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- tick_do_timer_cpu = cpu;
+ if (!tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
+ else
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
tick_next_period = ktime_get();
tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
}
@@ -191,7 +210,8 @@ static void tick_setup_device(struct tick_device *td,
* When global broadcasting is active, check if the current
* device is registered as a placeholder for broadcast mode.
* This allows us to handle this x86 misfeature in a generic
- * way.
+ * way. This function also returns !=0 when we keep the
+ * current active broadcast state for this CPU.
*/
if (tick_device_uses_broadcast(newdev, cpu))
return;
@@ -202,17 +222,75 @@ static void tick_setup_device(struct tick_device *td,
tick_setup_oneshot(newdev, handler, next_event);
}
+void tick_install_replacement(struct clock_event_device *newdev)
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ int cpu = smp_processor_id();
+
+ clockevents_exchange_device(td->evtdev, newdev);
+ tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
+ if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_oneshot_notify();
+}
+
+static bool tick_check_percpu(struct clock_event_device *curdev,
+ struct clock_event_device *newdev, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, newdev->cpumask))
+ return false;
+ if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
+ return true;
+ /* Check if irq affinity can be set */
+ if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
+ return false;
+ /* Prefer an existing cpu local device */
+ if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
+ return false;
+ return true;
+}
+
+static bool tick_check_preferred(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ /* Prefer oneshot capable device */
+ if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
+ if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+ if (tick_oneshot_mode_active())
+ return false;
+ }
+
+ /*
+ * Use the higher rated one, but prefer a CPU local device with a lower
+ * rating than a non-CPU local device
+ */
+ return !curdev ||
+ newdev->rating > curdev->rating ||
+ !cpumask_equal(curdev->cpumask, newdev->cpumask);
+}
+
/*
- * Check, if the new registered device should be used.
+ * Check whether the new device is a better fit than curdev. curdev
+ * can be NULL !
*/
-static int tick_check_new_device(struct clock_event_device *newdev)
+bool tick_check_replacement(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
+ return false;
+
+ return tick_check_preferred(curdev, newdev);
+}
+
+/*
+ * Check, if the new registered device should be used. Called with
+ * clockevents_lock held and interrupts disabled.
+ */
+void tick_check_new_device(struct clock_event_device *newdev)
{
struct clock_event_device *curdev;
struct tick_device *td;
- int cpu, ret = NOTIFY_OK;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&tick_device_lock, flags);
+ int cpu;
cpu = smp_processor_id();
if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -222,40 +300,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)
curdev = td->evtdev;
/* cpu local device ? */
- if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
-
- /*
- * If the cpu affinity of the device interrupt can not
- * be set, ignore it.
- */
- if (!irq_can_set_affinity(newdev->irq))
- goto out_bc;
+ if (!tick_check_percpu(curdev, newdev, cpu))
+ goto out_bc;
- /*
- * If we have a cpu local device already, do not replace it
- * by a non cpu local device
- */
- if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
- goto out_bc;
- }
+ /* Preference decision */
+ if (!tick_check_preferred(curdev, newdev))
+ goto out_bc;
- /*
- * If we have an active device, then check the rating and the oneshot
- * feature.
- */
- if (curdev) {
- /*
- * Prefer one shot capable devices !
- */
- if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
- !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
- goto out_bc;
- /*
- * Check the rating
- */
- if (curdev->rating >= newdev->rating)
- goto out_bc;
- }
+ if (!try_module_get(newdev->owner))
+ return;
/*
* Replace the eventually existing device by the new
@@ -270,20 +323,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)
tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
tick_oneshot_notify();
-
- raw_spin_unlock_irqrestore(&tick_device_lock, flags);
- return NOTIFY_STOP;
+ return;
out_bc:
/*
* Can the new device be used as a broadcast device ?
*/
- if (tick_check_broadcast_device(newdev))
- ret = NOTIFY_STOP;
-
- raw_spin_unlock_irqrestore(&tick_device_lock, flags);
-
- return ret;
+ tick_install_broadcast_device(newdev);
}
/*
@@ -291,7 +337,7 @@ out_bc:
*
* Called with interrupts disabled.
*/
-static void tick_handover_do_timer(int *cpup)
+void tick_handover_do_timer(int *cpup)
{
if (*cpup == tick_do_timer_cpu) {
int cpu = cpumask_first(cpu_online_mask);
@@ -308,13 +354,11 @@ static void tick_handover_do_timer(int *cpup)
* access the hardware device itself.
* We just set the mode and remove it from the lists.
*/
-static void tick_shutdown(unsigned int *cpup)
+void tick_shutdown(unsigned int *cpup)
{
struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
struct clock_event_device *dev = td->evtdev;
- unsigned long flags;
- raw_spin_lock_irqsave(&tick_device_lock, flags);
td->mode = TICKDEV_MODE_PERIODIC;
if (dev) {
/*
@@ -323,28 +367,23 @@ static void tick_shutdown(unsigned int *cpup)
*/
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
+ dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
- raw_spin_unlock_irqrestore(&tick_device_lock, flags);
}
-static void tick_suspend(void)
+void tick_suspend(void)
{
struct tick_device *td = &__get_cpu_var(tick_cpu_device);
- unsigned long flags;
- raw_spin_lock_irqsave(&tick_device_lock, flags);
clockevents_shutdown(td->evtdev);
- raw_spin_unlock_irqrestore(&tick_device_lock, flags);
}
-static void tick_resume(void)
+void tick_resume(void)
{
struct tick_device *td = &__get_cpu_var(tick_cpu_device);
- unsigned long flags;
int broadcast = tick_resume_broadcast();
- raw_spin_lock_irqsave(&tick_device_lock, flags);
clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
if (!broadcast) {
@@ -353,67 +392,12 @@ static void tick_resume(void)
else
tick_resume_oneshot();
}
- raw_spin_unlock_irqrestore(&tick_device_lock, flags);
-}
-
-/*
- * Notification about clock event devices
- */
-static int tick_notify(struct notifier_block *nb, unsigned long reason,
- void *dev)
-{
- switch (reason) {
-
- case CLOCK_EVT_NOTIFY_ADD:
- return tick_check_new_device(dev);
-
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- tick_broadcast_on_off(reason, dev);
- break;
-
- case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
- case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
- tick_broadcast_oneshot_control(reason);
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DYING:
- tick_handover_do_timer(dev);
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DEAD:
- tick_shutdown_broadcast_oneshot(dev);
- tick_shutdown_broadcast(dev);
- tick_shutdown(dev);
- break;
-
- case CLOCK_EVT_NOTIFY_SUSPEND:
- tick_suspend();
- tick_suspend_broadcast();
- break;
-
- case CLOCK_EVT_NOTIFY_RESUME:
- tick_resume();
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
}
-static struct notifier_block tick_notifier = {
- .notifier_call = tick_notify,
-};
-
/**
* tick_init - initialize the tick control
- *
- * Register the notifier with the clockevents framework
*/
void __init tick_init(void)
{
- clockevents_register_notifier(&tick_notifier);
+ tick_broadcast_init();
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e265b901fe..7ab92b19965 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,10 @@
#include <linux/hrtimer.h>
#include <linux/tick.h>
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN 32
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
#define TICK_DO_TIMER_NONE -1
@@ -16,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
+extern void tick_check_new_device(struct clock_event_device *dev);
+extern void tick_handover_do_timer(int *cpup);
+extern void tick_shutdown(unsigned int *cpup);
+extern void tick_suspend(void);
+extern void tick_resume(void);
+extern bool tick_check_replacement(struct clock_event_device *curdev,
+ struct clock_event_device *newdev);
+extern void tick_install_replacement(struct clock_event_device *dev);
extern void clockevents_shutdown(struct clock_event_device *dev);
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+
/*
* NO_HZ / high resolution timer shared code
*/
@@ -32,23 +46,23 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
extern void tick_resume_oneshot(void);
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern int tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
extern int tick_broadcast_oneshot_active(void);
-extern void tick_check_oneshot_broadcast(int cpu);
+extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
BUG();
}
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
static inline bool tick_broadcast_oneshot_available(void) { return true; }
# endif /* !BROADCAST */
@@ -73,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
BUG();
}
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
@@ -88,21 +102,21 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }
*/
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern int tick_check_broadcast_device(struct clock_event_device *dev);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
extern void tick_shutdown_broadcast(unsigned int *cpup);
extern void tick_suspend_broadcast(void);
extern int tick_resume_broadcast(void);
-
+extern void tick_broadcast_init(void);
extern void
tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
#else /* !BROADCAST */
-static inline int tick_check_broadcast_device(struct clock_event_device *dev)
+static inline void tick_install_broadcast_device(struct clock_event_device *dev)
{
- return 0;
}
static inline int tick_is_broadcast_device(struct clock_event_device *dev)
@@ -119,6 +133,9 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
static inline void tick_suspend_broadcast(void) { }
static inline int tick_resume_broadcast(void) { return 0; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
+ u32 freq) { return -ENODEV; }
/*
* Set the periodic handler in non broadcast mode
@@ -138,7 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}
+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
+
#endif
extern void do_timer(unsigned long ticks);
-extern seqlock_t xtime_lock;
+extern void update_wall_time(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7656642e4b8..6558b7ac112 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,18 +20,24 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/posix-timers.h>
+#include <linux/perf_event.h>
+#include <linux/context_tracking.h>
#include <asm/irq_regs.h>
#include "tick-internal.h"
+#include <trace/events/timer.h>
+
/*
* Per cpu nohz control structure
*/
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
- * The time, when the last jiffy update happened. Protected by xtime_lock.
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
*/
static ktime_t last_jiffies_update;
@@ -49,14 +55,14 @@ static void tick_do_update_jiffies64(ktime_t now)
ktime_t delta;
/*
- * Do a quick check without holding xtime_lock:
+ * Do a quick check without holding jiffies_lock:
*/
delta = ktime_sub(now, last_jiffies_update);
if (delta.tv64 < tick_period.tv64)
return;
- /* Reevalute with xtime_lock held */
- write_seqlock(&xtime_lock);
+ /* Reevalute with jiffies_lock held */
+ write_seqlock(&jiffies_lock);
delta = ktime_sub(now, last_jiffies_update);
if (delta.tv64 >= tick_period.tv64) {
@@ -78,8 +84,12 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ } else {
+ write_sequnlock(&jiffies_lock);
+ return;
}
- write_sequnlock(&xtime_lock);
+ write_sequnlock(&jiffies_lock);
+ update_wall_time();
}
/*
@@ -89,24 +99,274 @@ static ktime_t tick_init_jiffy_update(void)
{
ktime_t period;
- write_seqlock(&xtime_lock);
+ write_seqlock(&jiffies_lock);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update.tv64 == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
- write_sequnlock(&xtime_lock);
+ write_sequnlock(&jiffies_lock);
return period;
}
+
+static void tick_sched_do_timer(ktime_t now)
+{
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * jiffies_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ && !tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
+#endif
+
+ /* Check, if the jiffies need an update */
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
+}
+
+static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on complete idle SMP systems while
+ * waiting on the login prompt. We also increment the "start of
+ * idle" jiffy stamp so the idle accounting adjustment we do
+ * when we go busy again does not account too much ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ if (is_idle_task(current))
+ ts->idle_jiffies++;
+ }
+#endif
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+cpumask_var_t tick_nohz_full_mask;
+bool tick_nohz_full_running;
+
+static bool can_stop_full_tick(void)
+{
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!sched_can_stop_tick()) {
+ trace_tick_stop(0, "more than 1 task in runqueue\n");
+ return false;
+ }
+
+ if (!posix_cpu_timers_can_stop_tick(current)) {
+ trace_tick_stop(0, "posix timers running\n");
+ return false;
+ }
+
+ if (!perf_event_can_stop_tick()) {
+ trace_tick_stop(0, "perf events running\n");
+ return false;
+ }
+
+ /* sched_clock_tick() needs us? */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ /*
+ * TODO: kick full dynticks CPUs when
+ * sched_clock_stable is set.
+ */
+ if (!sched_clock_stable()) {
+ trace_tick_stop(0, "unstable sched clock\n");
+ /*
+ * Don't allow the user to think they can get
+ * full NO_HZ with this machine.
+ */
+ WARN_ONCE(tick_nohz_full_running,
+ "NO_HZ FULL will not work with unstable sched clock");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
+
+/*
+ * Re-evaluate the need for the tick on the current CPU
+ * and restart it if necessary.
+ */
+void __tick_nohz_full_check(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (tick_nohz_full_cpu(smp_processor_id())) {
+ if (ts->tick_stopped && !is_idle_task(current)) {
+ if (!can_stop_full_tick())
+ tick_nohz_restart_sched_tick(ts, ktime_get());
+ }
+ }
+}
+
+static void nohz_full_kick_work_func(struct irq_work *work)
+{
+ __tick_nohz_full_check();
+}
+
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
+ .func = nohz_full_kick_work_func,
+};
+
+/*
+ * Kick the current CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick(void)
+{
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+static void nohz_full_kick_ipi(void *info)
+{
+ __tick_nohz_full_check();
+}
+
+/*
+ * Kick all full dynticks CPUs in order to force these to re-evaluate
+ * their dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_all(void)
+{
+ if (!tick_nohz_full_running)
+ return;
+
+ preempt_disable();
+ smp_call_function_many(tick_nohz_full_mask,
+ nohz_full_kick_ipi, NULL, false);
+ tick_nohz_full_kick();
+ preempt_enable();
+}
+
+/*
+ * Re-evaluate the need for the tick as we switch the current task.
+ * It might need the tick due to per task/process properties:
+ * perf events, posix cpu timers, ...
+ */
+void __tick_nohz_task_switch(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ goto out;
+
+ if (tick_nohz_tick_stopped() && !can_stop_full_tick())
+ tick_nohz_full_kick();
+
+out:
+ local_irq_restore(flags);
+}
+
+/* Parse the boot-time nohz CPU list from the kernel parameters. */
+static int __init tick_nohz_full_setup(char *str)
+{
+ int cpu;
+
+ alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+ if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
+ pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ return 1;
+ }
+
+ cpu = smp_processor_id();
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
+ tick_nohz_full_running = true;
+
+ return 1;
+}
+__setup("nohz_full=", tick_nohz_full_setup);
+
+static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /*
+ * If we handle the timekeeping duty for full dynticks CPUs,
+ * we can't safely shutdown that CPU.
+ */
+ if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
+ return NOTIFY_BAD;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Worst case string length in chunks of CPU range seems 2 steps
+ * separations: 0,2,4,6,...
+ * This is NR_CPUS + sizeof('\0')
+ */
+static char __initdata nohz_full_buf[NR_CPUS + 1];
+
+static int tick_nohz_init_all(void)
+{
+ int err = -1;
+
+#ifdef CONFIG_NO_HZ_FULL_ALL
+ if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
+ pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
+ return err;
+ }
+ err = 0;
+ cpumask_setall(tick_nohz_full_mask);
+ cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+ tick_nohz_full_running = true;
+#endif
+ return err;
+}
+
+void __init tick_nohz_init(void)
+{
+ int cpu;
+
+ if (!tick_nohz_full_running) {
+ if (tick_nohz_init_all() < 0)
+ return;
+ }
+
+ for_each_cpu(cpu, tick_nohz_full_mask)
+ context_tracking_cpu_set(cpu);
+
+ cpu_notifier(tick_nohz_cpu_down_callback, 0);
+ cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
+ pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
+}
+#endif
+
/*
* NOHZ - aka dynamic tick functionality
*/
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* NO HZ enabled ?
*/
static int tick_nohz_enabled __read_mostly = 1;
-
+int tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -135,11 +395,9 @@ __setup("nohz=", setup_tick_nohz);
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
- int cpu = smp_processor_id();
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
unsigned long flags;
- ts->idle_waketime = now;
+ __this_cpu_write(tick_cpu_sched.idle_waketime, now);
local_irq_save(flags);
tick_do_update_jiffies64(now);
@@ -170,23 +428,17 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
}
-static void tick_nohz_stop_idle(int cpu, ktime_t now)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- update_ts_time_stats(cpu, ts, now, NULL);
+ update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
}
-static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now;
-
- now = ktime_get();
-
- update_ts_time_stats(cpu, ts, now, NULL);
+ ktime_t now = ktime_get();
ts->idle_entrytime = now;
ts->idle_active = 1;
@@ -213,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, idle;
- if (!tick_nohz_enabled)
+ if (!tick_nohz_active)
return -1;
now = ktime_get();
@@ -254,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, iowait;
- if (!tick_nohz_enabled)
+ if (!tick_nohz_active)
return -1;
now = ktime_get();
@@ -275,71 +527,43 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
+static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
+ ktime_t now, int cpu)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
- ktime_t last_update, expires, now;
+ ktime_t last_update, expires, ret = { .tv64 = 0 };
+ unsigned long rcu_delta_jiffies;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
u64 time_delta;
- int cpu;
-
- cpu = smp_processor_id();
- ts = &per_cpu(tick_cpu_sched, cpu);
-
- now = tick_nohz_start_idle(cpu, ts);
-
- /*
- * If this cpu is offline and it is the one which updates
- * jiffies, then give up the assignment and let it be taken by
- * the cpu which runs the tick timer next. If we don't drop
- * this here the jiffies might be stale and do_timer() never
- * invoked.
- */
- if (unlikely(!cpu_online(cpu))) {
- if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- }
-
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
- return;
-
- if (need_resched())
- return;
- if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
- static int ratelimit;
+ time_delta = timekeeping_max_deferment();
- if (ratelimit < 10) {
- printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
- return;
- }
-
- ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqbegin(&jiffies_lock);
last_update = last_jiffies_update;
last_jiffies = jiffies;
- time_delta = timekeeping_max_deferment();
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
- arch_needs_cpu(cpu)) {
+ if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+ arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
/* Get the next timer wheel timer */
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
+ if (rcu_delta_jiffies < delta_jiffies) {
+ next_jiffies = last_jiffies + rcu_delta_jiffies;
+ delta_jiffies = rcu_delta_jiffies;
+ }
}
+
/*
- * Do not stop the tick, if we are only one off
- * or if the cpu is required for rcu
+ * Do not stop the tick, if we are only one off (or less)
+ * or if the cpu is required for RCU:
*/
- if (!ts->tick_stopped && delta_jiffies == 1)
+ if (!ts->tick_stopped && delta_jiffies <= 1)
goto out;
/* Schedule the tick, if we are at least one jiffie off */
@@ -368,6 +592,13 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
time_delta = KTIME_MAX;
}
+#ifdef CONFIG_NO_HZ_FULL
+ if (!ts->inidle) {
+ time_delta = min(time_delta,
+ scheduler_tick_max_deferment());
+ }
+#endif
+
/*
* calculate the expiry time for the next timer wheel
* timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -396,6 +627,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
goto out;
+ ret = expires;
+
/*
* nohz_stop_sched_tick can be called several times before
* the nohz_restart_sched_tick is called. This happens when
@@ -404,18 +637,14 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- select_nohz_load_balancer(1);
+ nohz_balance_enter_idle(cpu);
+ calc_load_enter_idle();
- ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
- ts->idle_jiffies = last_jiffies;
+ trace_tick_stop(1, " ");
}
- ts->idle_sleeps++;
-
- /* Mark expires */
- ts->idle_expires = expires;
-
/*
* If the expiration time == KTIME_MAX, then
* in this case we simply stop the tick timer.
@@ -446,6 +675,102 @@ out:
ts->next_jiffies = next_jiffies;
ts->last_jiffies = last_jiffies;
ts->sleep_length = ktime_sub(dev->next_event, now);
+
+ return ret;
+}
+
+static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = smp_processor_id();
+
+ if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ return;
+
+ if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+ return;
+
+ if (!can_stop_full_tick())
+ return;
+
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+#endif
+}
+
+static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+{
+ /*
+ * If this cpu is offline and it is the one which updates
+ * jiffies, then give up the assignment and let it be taken by
+ * the cpu which runs the tick timer next. If we don't drop
+ * this here the jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (unlikely(!cpu_online(cpu))) {
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ return false;
+ }
+
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+ ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+ return false;
+ }
+
+ if (need_resched())
+ return false;
+
+ if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+ static int ratelimit;
+
+ if (ratelimit < 10 &&
+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
+ pr_warn("NOHZ: local_softirq_pending %02x\n",
+ (unsigned int) local_softirq_pending());
+ ratelimit++;
+ }
+ return false;
+ }
+
+ if (tick_nohz_full_enabled()) {
+ /*
+ * Keep the tick alive to guarantee timekeeping progression
+ * if there are full dynticks CPUs around
+ */
+ if (tick_do_timer_cpu == cpu)
+ return false;
+ /*
+ * Boot safety: make sure the timekeeping duty has been
+ * assigned before entering dyntick-idle mode,
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ return false;
+ }
+
+ return true;
+}
+
+static void __tick_nohz_idle_enter(struct tick_sched *ts)
+{
+ ktime_t now, expires;
+ int cpu = smp_processor_id();
+
+ now = tick_nohz_start_idle(ts);
+
+ if (can_stop_idle_tick(cpu, ts)) {
+ int was_stopped = ts->tick_stopped;
+
+ ts->idle_calls++;
+
+ expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+ if (expires.tv64 > 0LL) {
+ ts->idle_sleeps++;
+ ts->idle_expires = expires;
+ }
+
+ if (!was_stopped && ts->tick_stopped)
+ ts->idle_jiffies = ts->last_jiffies;
+ }
}
/**
@@ -477,16 +802,12 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = &__get_cpu_var(tick_cpu_sched);
- /*
- * set ts->inidle unconditionally. even if the system did not
- * switch to nohz mode the cpu frequency governers rely on the
- * update of the idle time accounting in tick_nohz_start_idle().
- */
ts->inidle = 1;
- tick_nohz_stop_sched_tick(ts);
+ __tick_nohz_idle_enter(ts);
local_irq_enable();
}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -500,10 +821,10 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
- if (!ts->inidle)
- return;
-
- tick_nohz_stop_sched_tick(ts);
+ if (ts->inidle)
+ __tick_nohz_idle_enter(ts);
+ else
+ tick_nohz_full_stop_tick(ts);
}
/**
@@ -521,7 +842,7 @@ ktime_t tick_nohz_get_sleep_length(void)
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
while (1) {
/* Forward the time to expire in the future */
@@ -538,49 +859,36 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_get_expires(&ts->sched_timer), 0))
break;
}
- /* Update jiffies and reread time */
- tick_do_update_jiffies64(now);
+ /* Reread time and update jiffies */
now = ktime_get();
+ tick_do_update_jiffies64(now);
}
}
-/**
- * tick_nohz_idle_exit - restart the idle tick from the idle task
- *
- * Restart the idle tick when the CPU is woken up from idle
- * This also exit the RCU extended quiescent state. The CPU
- * can use RCU again after this function is called.
- */
-void tick_nohz_idle_exit(void)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
- int cpu = smp_processor_id();
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- unsigned long ticks;
-#endif
- ktime_t now;
+ /* Update jiffies first */
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
- local_irq_disable();
+ calc_load_exit_idle();
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
- if (ts->idle_active || (ts->inidle && ts->tick_stopped))
- now = ktime_get();
+ tick_nohz_restart(ts, now);
+}
- if (ts->idle_active)
- tick_nohz_stop_idle(cpu, now);
+static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ unsigned long ticks;
- if (!ts->inidle || !ts->tick_stopped) {
- ts->inidle = 0;
- local_irq_enable();
+ if (vtime_accounting_enabled())
return;
- }
-
- ts->inidle = 0;
-
- /* Update jiffies first */
- select_nohz_load_balancer(0);
- tick_do_update_jiffies64(now);
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
* We stopped the tick in idle. Update process times would miss the
* time we slept as update_process_times does only a 1 tick
@@ -593,18 +901,40 @@ void tick_nohz_idle_exit(void)
if (ticks && ticks < LONG_MAX)
account_idle_ticks(ticks);
#endif
+}
- touch_softlockup_watchdog();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
+/**
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
+ *
+ * Restart the idle tick when the CPU is woken up from idle
+ * This also exit the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
+ */
+void tick_nohz_idle_exit(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t now;
- tick_nohz_restart(ts, now);
+ local_irq_disable();
+
+ WARN_ON_ONCE(!ts->inidle);
+
+ ts->inidle = 0;
+
+ if (ts->idle_active || ts->tick_stopped)
+ now = ktime_get();
+
+ if (ts->idle_active)
+ tick_nohz_stop_idle(ts, now);
+
+ if (ts->tick_stopped) {
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_ticks(ts);
+ }
local_irq_enable();
}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
@@ -619,40 +949,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
- int cpu = smp_processor_id();
ktime_t now = ktime_get();
dev->next_event.tv64 = KTIME_MAX;
- /*
- * Check if the do_timer duty was dropped. We don't care about
- * concurrency: This happens only when the cpu in charge went
- * into a long sleep. If two cpus happen to assign themself to
- * this duty, then the jiffies update is still serialized by
- * xtime_lock.
- */
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
- tick_do_timer_cpu = cpu;
-
- /* Check, if the jiffies need an update */
- if (tick_do_timer_cpu == cpu)
- tick_do_update_jiffies64(now);
-
- /*
- * When we are idle and the tick is stopped, we have to touch
- * the watchdog as we might not schedule for a really long
- * time. This happens on complete idle SMP systems while
- * waiting on the login prompt. We also increment the "start
- * of idle" jiffy stamp so the idle accounting adjustment we
- * do when we go busy again does not account too much ticks.
- */
- if (ts->tick_stopped) {
- touch_softlockup_watchdog();
- ts->idle_jiffies++;
- }
-
- update_process_times(user_mode(regs));
- profile_tick(CPU_PROFILING);
+ tick_sched_do_timer(now);
+ tick_sched_handle(ts, regs);
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
@@ -676,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
local_irq_enable();
return;
}
-
+ tick_nohz_active = 1;
ts->nohz_mode = NOHZ_MODE_LOWRES;
/*
@@ -707,12 +1009,10 @@ static void tick_nohz_switch_to_nohz(void)
* timer and do not touch the other magic bits which need to be done
* when idle is left.
*/
-static void tick_nohz_kick_tick(int cpu, ktime_t now)
+static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
{
#if 0
/* Switch back to 2.6.27 behaviour */
-
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t delta;
/*
@@ -727,36 +1027,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)
#endif
}
-static inline void tick_check_nohz(int cpu)
+static inline void tick_nohz_irq_enter(void)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now;
if (!ts->idle_active && !ts->tick_stopped)
return;
now = ktime_get();
if (ts->idle_active)
- tick_nohz_stop_idle(cpu, now);
+ tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped) {
tick_nohz_update_jiffies(now);
- tick_nohz_kick_tick(cpu, now);
+ tick_nohz_kick_tick(ts, now);
}
}
#else
static inline void tick_nohz_switch_to_nohz(void) { }
-static inline void tick_check_nohz(int cpu) { }
+static inline void tick_nohz_irq_enter(void) { }
-#endif /* NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from irq_enter to notify about the possible interruption of idle()
*/
-void tick_check_idle(int cpu)
+void tick_irq_enter(void)
{
- tick_check_oneshot_broadcast(cpu);
- tick_check_nohz(cpu);
+ tick_check_oneshot_broadcast_this_cpu();
+ tick_nohz_irq_enter();
}
/*
@@ -765,7 +1065,7 @@ void tick_check_idle(int cpu)
#ifdef CONFIG_HIGH_RES_TIMERS
/*
* We rearm the timer until we get disabled by the idle code.
- * Called with interrupts disabled and timer->base->cpu_base->lock held.
+ * Called with interrupts disabled.
*/
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
@@ -773,50 +1073,31 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
container_of(timer, struct tick_sched, sched_timer);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
- int cpu = smp_processor_id();
-#ifdef CONFIG_NO_HZ
- /*
- * Check if the do_timer duty was dropped. We don't care about
- * concurrency: This happens only when the cpu in charge went
- * into a long sleep. If two cpus happen to assign themself to
- * this duty, then the jiffies update is still serialized by
- * xtime_lock.
- */
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
- tick_do_timer_cpu = cpu;
-#endif
-
- /* Check, if the jiffies need an update */
- if (tick_do_timer_cpu == cpu)
- tick_do_update_jiffies64(now);
+ tick_sched_do_timer(now);
/*
* Do not call, when we are not in irq context and have
* no valid regs pointer
*/
- if (regs) {
- /*
- * When we are idle and the tick is stopped, we have to touch
- * the watchdog as we might not schedule for a really long
- * time. This happens on complete idle SMP systems while
- * waiting on the login prompt. We also increment the "start of
- * idle" jiffy stamp so the idle accounting adjustment we do
- * when we go busy again does not account too much ticks.
- */
- if (ts->tick_stopped) {
- touch_softlockup_watchdog();
- ts->idle_jiffies++;
- }
- update_process_times(user_mode(regs));
- profile_tick(CPU_PROFILING);
- }
+ if (regs)
+ tick_sched_handle(ts, regs);
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
}
+static int sched_skew_tick;
+
+static int __init skew_tick(char *str)
+{
+ get_option(&str, &sched_skew_tick);
+
+ return 0;
+}
+early_param("skew_tick", skew_tick);
+
/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
@@ -834,6 +1115,14 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+ /* Offset the tick to avert jiffies_lock contention. */
+ if (sched_skew_tick) {
+ u64 offset = ktime_to_ns(tick_period) >> 1;
+ do_div(offset, num_possible_cpus());
+ offset *= smp_processor_id();
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+ }
+
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
hrtimer_start_expires(&ts->sched_timer,
@@ -844,14 +1133,16 @@ void tick_setup_sched_timer(void)
now = ktime_get();
}
-#ifdef CONFIG_NO_HZ
- if (tick_nohz_enabled)
+#ifdef CONFIG_NO_HZ_COMMON
+ if (tick_nohz_enabled) {
ts->nohz_mode = NOHZ_MODE_HIGHRES;
+ tick_nohz_active = 1;
+ }
#endif
}
#endif /* HIGH_RES_TIMERS */
-#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -861,7 +1152,7 @@ void tick_cancel_sched_timer(int cpu)
hrtimer_cancel(&ts->sched_timer);
# endif
- ts->nohz_mode = NOHZ_MODE_INACTIVE;
+ memset(ts, 0, sizeof(*ts));
}
#endif
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
deleted file mode 100644
index a9ae369925c..00000000000
--- a/kernel/time/timecompare.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (C) 2009 Intel Corporation.
- * Author: Patrick Ohly <patrick.ohly@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/timecompare.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/math64.h>
-#include <linux/kernel.h>
-
-/*
- * fixed point arithmetic scale factor for skew
- *
- * Usually one would measure skew in ppb (parts per billion, 1e9), but
- * using a factor of 2 simplifies the math.
- */
-#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
-
-ktime_t timecompare_transform(struct timecompare *sync,
- u64 source_tstamp)
-{
- u64 nsec;
-
- nsec = source_tstamp + sync->offset;
- nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
- TIMECOMPARE_SKEW_RESOLUTION;
-
- return ns_to_ktime(nsec);
-}
-EXPORT_SYMBOL_GPL(timecompare_transform);
-
-int timecompare_offset(struct timecompare *sync,
- s64 *offset,
- u64 *source_tstamp)
-{
- u64 start_source = 0, end_source = 0;
- struct {
- s64 offset;
- s64 duration_target;
- } buffer[10], sample, *samples;
- int counter = 0, i;
- int used;
- int index;
- int num_samples = sync->num_samples;
-
- if (num_samples > ARRAY_SIZE(buffer)) {
- samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
- if (!samples) {
- samples = buffer;
- num_samples = ARRAY_SIZE(buffer);
- }
- } else {
- samples = buffer;
- }
-
- /* run until we have enough valid samples, but do not try forever */
- i = 0;
- counter = 0;
- while (1) {
- u64 ts;
- ktime_t start, end;
-
- start = sync->target();
- ts = timecounter_read(sync->source);
- end = sync->target();
-
- if (!i)
- start_source = ts;
-
- /* ignore negative durations */
- sample.duration_target = ktime_to_ns(ktime_sub(end, start));
- if (sample.duration_target >= 0) {
- /*
- * assume symetric delay to and from source:
- * average target time corresponds to measured
- * source time
- */
- sample.offset =
- (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
- ts;
-
- /* simple insertion sort based on duration */
- index = counter - 1;
- while (index >= 0) {
- if (samples[index].duration_target <
- sample.duration_target)
- break;
- samples[index + 1] = samples[index];
- index--;
- }
- samples[index + 1] = sample;
- counter++;
- }
-
- i++;
- if (counter >= num_samples || i >= 100000) {
- end_source = ts;
- break;
- }
- }
-
- *source_tstamp = (end_source + start_source) / 2;
-
- /* remove outliers by only using 75% of the samples */
- used = counter * 3 / 4;
- if (!used)
- used = counter;
- if (used) {
- /* calculate average */
- s64 off = 0;
- for (index = 0; index < used; index++)
- off += samples[index].offset;
- *offset = div_s64(off, used);
- }
-
- if (samples && samples != buffer)
- kfree(samples);
-
- return used;
-}
-EXPORT_SYMBOL_GPL(timecompare_offset);
-
-void __timecompare_update(struct timecompare *sync,
- u64 source_tstamp)
-{
- s64 offset;
- u64 average_time;
-
- if (!timecompare_offset(sync, &offset, &average_time))
- return;
-
- if (!sync->last_update) {
- sync->last_update = average_time;
- sync->offset = offset;
- sync->skew = 0;
- } else {
- s64 delta_nsec = average_time - sync->last_update;
-
- /* avoid division by negative or small deltas */
- if (delta_nsec >= 10000) {
- s64 delta_offset_nsec = offset - sync->offset;
- s64 skew; /* delta_offset_nsec *
- TIMECOMPARE_SKEW_RESOLUTION /
- delta_nsec */
- u64 divisor;
-
- /* div_s64() is limited to 32 bit divisor */
- skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
- divisor = delta_nsec;
- while (unlikely(divisor >= ((s64)1) << 32)) {
- /* divide both by 2; beware, right shift
- of negative value has undefined
- behavior and can only be used for
- the positive divisor */
- skew = div_s64(skew, 2);
- divisor >>= 1;
- }
- skew = div_s64(skew, divisor);
-
- /*
- * Calculate new overall skew as 4/16 the
- * old value and 12/16 the new one. This is
- * a rather arbitrary tradeoff between
- * only using the latest measurement (0/16 and
- * 16/16) and even more weight on past measurements.
- */
-#define TIMECOMPARE_NEW_SKEW_PER_16 12
- sync->skew =
- div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
- sync->skew +
- TIMECOMPARE_NEW_SKEW_PER_16 * skew,
- 16);
- sync->last_update = average_time;
- sync->offset = offset;
- }
- }
-}
-EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0c635818640..32d8d6aaedb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
@@ -20,40 +21,79 @@
#include <linux/time.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/compiler.h>
-/* Structure holding internal timekeeping values. */
-struct timekeeper {
- /* Current clocksource used for timekeeping. */
- struct clocksource *clock;
- /* The shift value of the current clocksource. */
- int shift;
-
- /* Number of clock cycles in one NTP interval. */
- cycle_t cycle_interval;
- /* Number of clock shifted nano seconds in one NTP interval. */
- u64 xtime_interval;
- /* shifted nano seconds left over when rounding cycle_interval */
- s64 xtime_remainder;
- /* Raw nano seconds accumulated per NTP interval. */
- u32 raw_interval;
-
- /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
- u64 xtime_nsec;
- /* Difference between accumulated time and NTP time in ntp
- * shifted nano seconds. */
- s64 ntp_error;
- /* Shift conversion between clock shifted nano seconds and
- * ntp shifted nano seconds. */
- int ntp_error_shift;
- /* NTP adjusted clock multiplier */
- u32 mult;
-};
+#include "tick-internal.h"
+#include "ntp_internal.h"
+#include "timekeeping_internal.h"
+
+#define TK_CLEAR_NTP (1 << 0)
+#define TK_MIRROR (1 << 1)
+#define TK_CLOCK_WAS_SET (1 << 2)
static struct timekeeper timekeeper;
+static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+static seqcount_t timekeeper_seq;
+static struct timekeeper shadow_timekeeper;
+
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
+/* Flag for if there is a persistent clock on this platform */
+bool __read_mostly persistent_clock_exist = false;
+
+static inline void tk_normalize_xtime(struct timekeeper *tk)
+{
+ while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
+ tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
+ tk->xtime_sec++;
+ }
+}
+
+static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
+{
+ tk->xtime_sec = ts->tv_sec;
+ tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
+}
+
+static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
+{
+ tk->xtime_sec += ts->tv_sec;
+ tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+ tk_normalize_xtime(tk);
+}
+
+static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
+{
+ struct timespec tmp;
+
+ /*
+ * Verify consistency of: offset_real = -wall_to_monotonic
+ * before modifying anything
+ */
+ set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
+ -tk->wall_to_monotonic.tv_nsec);
+ WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
+ tk->wall_to_monotonic = wtm;
+ set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
+ tk->offs_real = timespec_to_ktime(tmp);
+ tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
+}
+
+static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
+{
+ /* Verify consistency before modifying */
+ WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
+
+ tk->total_sleep_time = t;
+ tk->offs_boot = timespec_to_ktime(t);
+}
/**
- * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ * tk_setup_internals - Set up internals to use clocksource clock.
*
+ * @tk: The target timekeeper to setup.
* @clock: Pointer to clocksource.
*
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
@@ -61,13 +101,15 @@ static struct timekeeper timekeeper;
*
* Unless you're the timekeeping code, you should not be using this!
*/
-static void timekeeper_setup_internals(struct clocksource *clock)
+static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
cycle_t interval;
u64 tmp, ntpinterval;
+ struct clocksource *old_clock;
- timekeeper.clock = clock;
- clock->cycle_last = clock->read(clock);
+ old_clock = tk->clock;
+ tk->clock = clock;
+ tk->cycle_last = clock->cycle_last = clock->read(clock);
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -79,103 +121,144 @@ static void timekeeper_setup_internals(struct clocksource *clock)
tmp = 1;
interval = (cycle_t) tmp;
- timekeeper.cycle_interval = interval;
+ tk->cycle_interval = interval;
/* Go back from cycles -> shifted ns */
- timekeeper.xtime_interval = (u64) interval * clock->mult;
- timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
- timekeeper.raw_interval =
+ tk->xtime_interval = (u64) interval * clock->mult;
+ tk->xtime_remainder = ntpinterval - tk->xtime_interval;
+ tk->raw_interval =
((u64) interval * clock->mult) >> clock->shift;
- timekeeper.xtime_nsec = 0;
- timekeeper.shift = clock->shift;
+ /* if changing clocks, convert xtime_nsec shift units */
+ if (old_clock) {
+ int shift_change = clock->shift - old_clock->shift;
+ if (shift_change < 0)
+ tk->xtime_nsec >>= -shift_change;
+ else
+ tk->xtime_nsec <<= shift_change;
+ }
+ tk->shift = clock->shift;
- timekeeper.ntp_error = 0;
- timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+ tk->ntp_error = 0;
+ tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- timekeeper.mult = clock->mult;
+ tk->mult = clock->mult;
}
/* Timekeeper helper functions. */
-static inline s64 timekeeping_get_ns(void)
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+u32 (*arch_gettimeoffset)(void);
+
+u32 get_arch_timeoffset(void)
+{
+ if (likely(arch_gettimeoffset))
+ return arch_gettimeoffset();
+ return 0;
+}
+#else
+static inline u32 get_arch_timeoffset(void) { return 0; }
+#endif
+
+static inline s64 timekeeping_get_ns(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
+ s64 nsec;
/* read clocksource: */
- clock = timekeeper.clock;
+ clock = tk->clock;
cycle_now = clock->read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- /* return delta convert to nanoseconds using ntp adjusted mult. */
- return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
- timekeeper.shift);
+ nsec = cycle_delta * tk->mult + tk->xtime_nsec;
+ nsec >>= tk->shift;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + get_arch_timeoffset();
}
-static inline s64 timekeeping_get_ns_raw(void)
+static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
+ s64 nsec;
/* read clocksource: */
- clock = timekeeper.clock;
+ clock = tk->clock;
cycle_now = clock->read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- /* return delta convert to nanoseconds. */
- return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ /* convert delta to nanoseconds. */
+ nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + get_arch_timeoffset();
}
-/*
- * This read-write spinlock protects us from races in SMP while
- * playing with xtime.
- */
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
+static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
+{
+ raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
+}
-/*
- * The current time
- * wall_to_monotonic is what we need to add to xtime (or xtime corrected
- * for sub jiffie times) to get to monotonic time. Monotonic is pegged
- * at zero at system boot time, so wall_to_monotonic will be negative,
- * however, we will ALWAYS keep the tv_nsec part positive so we can use
- * the usual normalization.
- *
- * wall_to_monotonic is moved after resume from suspend for the monotonic
- * time not to jump. We need to add total_sleep_time to wall_to_monotonic
- * to get the real boot based time offset.
- *
- * - wall_to_monotonic is no longer the boot time, getboottime must be
- * used instead.
+/**
+ * pvclock_gtod_register_notifier - register a pvclock timedata update listener
*/
-static struct timespec xtime __attribute__ ((aligned (16)));
-static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-static struct timespec total_sleep_time;
+int pvclock_gtod_register_notifier(struct notifier_block *nb)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long flags;
+ int ret;
-/*
- * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
+ update_pvclock_gtod(tk, true);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
+
+/**
+ * pvclock_gtod_unregister_notifier - unregister a pvclock
+ * timedata update listener
*/
-static struct timespec raw_time;
+int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
-/* flag for if timekeeping is suspended */
-int __read_mostly timekeeping_suspended;
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-/* must hold xtime_lock */
-void timekeeping_leap_insert(int leapsecond)
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
+
+/* must hold timekeeper_lock */
+static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
- xtime.tv_sec += leapsecond;
- wall_to_monotonic.tv_sec -= leapsecond;
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ if (action & TK_CLEAR_NTP) {
+ tk->ntp_error = 0;
+ ntp_clear();
+ }
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+
+ if (action & TK_MIRROR)
+ memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
}
/**
@@ -185,74 +268,88 @@ void timekeeping_leap_insert(int leapsecond)
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void timekeeping_forward_now(void)
+static void timekeeping_forward_now(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
s64 nsec;
- clock = timekeeper.clock;
+ clock = tk->clock;
cycle_now = clock->read(clock);
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- clock->cycle_last = cycle_now;
+ tk->cycle_last = clock->cycle_last = cycle_now;
- nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
- timekeeper.shift);
+ tk->xtime_nsec += cycle_delta * tk->mult;
- /* If arch requires, add in gettimeoffset() */
- nsec += arch_gettimeoffset();
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
- timespec_add_ns(&xtime, nsec);
+ tk_normalize_xtime(tk);
nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
- timespec_add_ns(&raw_time, nsec);
+ timespec_add_ns(&tk->raw_time, nsec);
}
/**
- * getnstimeofday - Returns the time of day in a timespec
+ * __getnstimeofday - Returns the time of day in a timespec.
* @ts: pointer to the timespec to be set
*
- * Returns the time of day in a timespec.
+ * Updates the time of day in the timespec.
+ * Returns 0 on success, or -ve when suspended (timespec will be undefined).
*/
-void getnstimeofday(struct timespec *ts)
+int __getnstimeofday(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
- s64 nsecs;
-
- WARN_ON(timekeeping_suspended);
+ s64 nsecs = 0;
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
- *ts = xtime;
- nsecs = timekeeping_get_ns();
+ ts->tv_sec = tk->xtime_sec;
+ nsecs = timekeeping_get_ns(tk);
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
-
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+ ts->tv_nsec = 0;
timespec_add_ns(ts, nsecs);
+
+ /*
+ * Do not bail out early, in case there were callers still using
+ * the value, even in the face of the WARN_ON.
+ */
+ if (unlikely(timekeeping_suspended))
+ return -EAGAIN;
+ return 0;
}
+EXPORT_SYMBOL(__getnstimeofday);
+/**
+ * getnstimeofday - Returns the time of day in a timespec.
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec (WARN if suspended).
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ WARN_ON(__getnstimeofday(ts));
+}
EXPORT_SYMBOL(getnstimeofday);
ktime_t ktime_get(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned int seq;
s64 secs, nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&xtime_lock);
- secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
- nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
- nsecs += timekeeping_get_ns();
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
-
- } while (read_seqretry(&xtime_lock, seq));
+ seq = read_seqcount_begin(&timekeeper_seq);
+ secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
+
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
/*
* Use ktime_set/ktime_add_ns to create a proper ktime on
* 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -271,27 +368,71 @@ EXPORT_SYMBOL_GPL(ktime_get);
*/
void ktime_get_ts(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec tomono;
+ s64 nsec;
unsigned int seq;
- s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&xtime_lock);
- *ts = xtime;
- tomono = wall_to_monotonic;
- nsecs = timekeeping_get_ns();
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
+ seq = read_seqcount_begin(&timekeeper_seq);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(tk);
+ tomono = tk->wall_to_monotonic;
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec + nsecs);
+ ts->tv_sec += tomono.tv_sec;
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
+
+/**
+ * timekeeping_clocktai - Returns the TAI time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void timekeeping_clocktai(struct timespec *ts)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long seq;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&timekeeper_seq);
+
+ ts->tv_sec = tk->xtime_sec + tk->tai_offset;
+ nsecs = timekeeping_get_ns(tk);
+
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsecs);
+
+}
+EXPORT_SYMBOL(timekeeping_clocktai);
+
+
+/**
+ * ktime_get_clocktai - Returns the TAI time of day in a ktime
+ *
+ * Returns the time of day in a ktime.
+ */
+ktime_t ktime_get_clocktai(void)
+{
+ struct timespec ts;
+
+ timekeeping_clocktai(&ts);
+ return timespec_to_ktime(ts);
+}
+EXPORT_SYMBOL(ktime_get_clocktai);
+
#ifdef CONFIG_NTP_PPS
/**
@@ -305,28 +446,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
*/
void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs_raw, nsecs_real;
WARN_ON_ONCE(timekeeping_suspended);
do {
- u32 arch_offset;
-
- seq = read_seqbegin(&xtime_lock);
-
- *ts_raw = raw_time;
- *ts_real = xtime;
+ seq = read_seqcount_begin(&timekeeper_seq);
- nsecs_raw = timekeeping_get_ns_raw();
- nsecs_real = timekeeping_get_ns();
+ *ts_raw = tk->raw_time;
+ ts_real->tv_sec = tk->xtime_sec;
+ ts_real->tv_nsec = 0;
- /* If arch requires, add in gettimeoffset() */
- arch_offset = arch_gettimeoffset();
- nsecs_raw += arch_offset;
- nsecs_real += arch_offset;
+ nsecs_raw = timekeeping_get_ns_raw(tk);
+ nsecs_real = timekeeping_get_ns(tk);
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
timespec_add_ns(ts_raw, nsecs_raw);
timespec_add_ns(ts_real, nsecs_real);
@@ -349,8 +485,8 @@ void do_gettimeofday(struct timeval *tv)
tv->tv_sec = now.tv_sec;
tv->tv_usec = now.tv_nsec/1000;
}
-
EXPORT_SYMBOL(do_gettimeofday);
+
/**
* do_settimeofday - Sets the time of day
* @tv: pointer to the timespec variable containing the new time
@@ -359,39 +495,38 @@ EXPORT_SYMBOL(do_gettimeofday);
*/
int do_settimeofday(const struct timespec *tv)
{
- struct timespec ts_delta;
+ struct timekeeper *tk = &timekeeper;
+ struct timespec ts_delta, xt;
unsigned long flags;
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_valid_strict(tv))
return -EINVAL;
- write_seqlock_irqsave(&xtime_lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
- timekeeping_forward_now();
+ timekeeping_forward_now(tk);
- ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
- ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
- wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
+ xt = tk_xtime(tk);
+ ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
- xtime = *tv;
+ tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
- timekeeper.ntp_error = 0;
- ntp_clear();
+ tk_set_xtime(tk, tv);
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_sequnlock_irqrestore(&xtime_lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
return 0;
}
-
EXPORT_SYMBOL(do_settimeofday);
-
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
* @tv: pointer to the timespec variable containing the offset
@@ -400,33 +535,89 @@ EXPORT_SYMBOL(do_settimeofday);
*/
int timekeeping_inject_offset(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
+ struct timespec tmp;
+ int ret = 0;
if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- write_seqlock_irqsave(&xtime_lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
- timekeeping_forward_now();
+ timekeeping_forward_now(tk);
- xtime = timespec_add(xtime, *ts);
- wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
+ /* Make sure the proposed value is valid */
+ tmp = timespec_add(tk_xtime(tk), *ts);
+ if (!timespec_valid_strict(&tmp)) {
+ ret = -EINVAL;
+ goto error;
+ }
- timekeeper.ntp_error = 0;
- ntp_clear();
+ tk_xtime_add(tk, ts);
+ tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+error: /* even if we error out, we forwarded the time, so call update */
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_sequnlock_irqrestore(&xtime_lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/**
+ * timekeeping_get_tai_offset - Returns current TAI offset from UTC
+ *
+ */
+s32 timekeeping_get_tai_offset(void)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned int seq;
+ s32 ret;
+
+ do {
+ seq = read_seqcount_begin(&timekeeper_seq);
+ ret = tk->tai_offset;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
+ return ret;
+}
+
+/**
+ * __timekeeping_set_tai_offset - Lock free worker function
+ *
+ */
+static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
+{
+ tk->tai_offset = tai_offset;
+ tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
+}
+
+/**
+ * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
+ *
+ */
+void timekeeping_set_tai_offset(s32 tai_offset)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+ __timekeeping_set_tai_offset(tk, tai_offset);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ clock_was_set();
+}
+
/**
* change_clocksource - Swaps clocksources if a new one is available
*
@@ -434,17 +625,36 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
*/
static int change_clocksource(void *data)
{
+ struct timekeeper *tk = &timekeeper;
struct clocksource *new, *old;
+ unsigned long flags;
new = (struct clocksource *) data;
- timekeeping_forward_now();
- if (!new->enable || new->enable(new) == 0) {
- old = timekeeper.clock;
- timekeeper_setup_internals(new);
- if (old->disable)
- old->disable(old);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ timekeeping_forward_now(tk);
+ /*
+ * If the cs is in module, get a module reference. Succeeds
+ * for built-in code (owner == NULL) as well.
+ */
+ if (try_module_get(new->owner)) {
+ if (!new->enable || new->enable(new) == 0) {
+ old = tk->clock;
+ tk_setup_internals(tk, new);
+ if (old->disable)
+ old->disable(old);
+ module_put(old->owner);
+ } else {
+ module_put(new->owner);
+ }
}
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
return 0;
}
@@ -455,12 +665,15 @@ static int change_clocksource(void *data)
* This function is called from clocksource.c after a new, better clock
* source has been registered. The caller holds the clocksource_mutex.
*/
-void timekeeping_notify(struct clocksource *clock)
+int timekeeping_notify(struct clocksource *clock)
{
- if (timekeeper.clock == clock)
- return;
+ struct timekeeper *tk = &timekeeper;
+
+ if (tk->clock == clock)
+ return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
+ return tk->clock == clock ? 0 : -1;
}
/**
@@ -486,48 +699,57 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
*/
void getrawmonotonic(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs;
do {
- seq = read_seqbegin(&xtime_lock);
- nsecs = timekeeping_get_ns_raw();
- *ts = raw_time;
+ seq = read_seqcount_begin(&timekeeper_seq);
+ nsecs = timekeeping_get_ns_raw(tk);
+ *ts = tk->raw_time;
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
timespec_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getrawmonotonic);
-
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
int timekeeping_valid_for_hres(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
int ret;
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
- ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
return ret;
}
/**
* timekeeping_max_deferment - Returns max time the clocksource can be deferred
- *
- * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
- * ensure that the clocksource does not change!
*/
u64 timekeeping_max_deferment(void)
{
- return timekeeper.clock->max_idle_ns;
+ struct timekeeper *tk = &timekeeper;
+ unsigned long seq;
+ u64 ret;
+
+ do {
+ seq = read_seqcount_begin(&timekeeper_seq);
+
+ ret = tk->clock->max_idle_ns;
+
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
+ return ret;
}
/**
@@ -539,7 +761,7 @@ u64 timekeeping_max_deferment(void)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
@@ -554,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
@@ -565,35 +787,55 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
*/
void __init timekeeping_init(void)
{
+ struct timekeeper *tk = &timekeeper;
struct clocksource *clock;
unsigned long flags;
- struct timespec now, boot;
+ struct timespec now, boot, tmp;
read_persistent_clock(&now);
- read_boot_clock(&boot);
- write_seqlock_irqsave(&xtime_lock, flags);
+ if (!timespec_valid_strict(&now)) {
+ pr_warn("WARNING: Persistent clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ now.tv_sec = 0;
+ now.tv_nsec = 0;
+ } else if (now.tv_sec || now.tv_nsec)
+ persistent_clock_exist = true;
+
+ read_boot_clock(&boot);
+ if (!timespec_valid_strict(&boot)) {
+ pr_warn("WARNING: Boot clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ boot.tv_sec = 0;
+ boot.tv_nsec = 0;
+ }
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
ntp_init();
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
- timekeeper_setup_internals(clock);
-
- xtime.tv_sec = now.tv_sec;
- xtime.tv_nsec = now.tv_nsec;
- raw_time.tv_sec = 0;
- raw_time.tv_nsec = 0;
- if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
- boot.tv_sec = xtime.tv_sec;
- boot.tv_nsec = xtime.tv_nsec;
- }
- set_normalized_timespec(&wall_to_monotonic,
- -boot.tv_sec, -boot.tv_nsec);
- total_sleep_time.tv_sec = 0;
- total_sleep_time.tv_nsec = 0;
- write_sequnlock_irqrestore(&xtime_lock, flags);
+ tk_setup_internals(tk, clock);
+
+ tk_set_xtime(tk, &now);
+ tk->raw_time.tv_sec = 0;
+ tk->raw_time.tv_nsec = 0;
+ if (boot.tv_sec == 0 && boot.tv_nsec == 0)
+ boot = tk_xtime(tk);
+
+ set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
+ tk_set_wall_to_mono(tk, tmp);
+
+ tmp.tv_sec = 0;
+ tmp.tv_nsec = 0;
+ tk_set_sleep_time(tk, tmp);
+
+ memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
+
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
/* time in seconds when suspend began */
@@ -606,20 +848,21 @@ static struct timespec timekeeping_suspend_time;
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables.
*/
-static void __timekeeping_inject_sleeptime(struct timespec *delta)
+static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
+ struct timespec *delta)
{
- if (!timespec_valid(delta)) {
- printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
- "sleep delta value!\n");
+ if (!timespec_valid_strict(delta)) {
+ printk_deferred(KERN_WARNING
+ "__timekeeping_inject_sleeptime: Invalid "
+ "sleep delta value!\n");
return;
}
-
- xtime = timespec_add(xtime, *delta);
- wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
- total_sleep_time = timespec_add(total_sleep_time, *delta);
+ tk_xtime_add(tk, delta);
+ tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
+ tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
+ tk_debug_account_sleep_time(delta);
}
-
/**
* timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
* @delta: pointer to a timespec delta value
@@ -632,31 +875,32 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
*/
void timekeeping_inject_sleeptime(struct timespec *delta)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
- struct timespec ts;
- /* Make sure we don't set the clock twice */
- read_persistent_clock(&ts);
- if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
+ /*
+ * Make sure we don't set the clock twice, as timekeeping_resume()
+ * already did it
+ */
+ if (has_persistent_clock())
return;
- write_seqlock_irqsave(&xtime_lock, flags);
- timekeeping_forward_now();
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ timekeeping_forward_now(tk);
- __timekeeping_inject_sleeptime(delta);
+ __timekeeping_inject_sleeptime(tk, delta);
- timekeeper.ntp_error = 0;
- ntp_clear();
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_sequnlock_irqrestore(&xtime_lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
}
-
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
*
@@ -666,24 +910,73 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
*/
static void timekeeping_resume(void)
{
+ struct timekeeper *tk = &timekeeper;
+ struct clocksource *clock = tk->clock;
unsigned long flags;
- struct timespec ts;
+ struct timespec ts_new, ts_delta;
+ cycle_t cycle_now, cycle_delta;
+ bool suspendtime_found = false;
- read_persistent_clock(&ts);
+ read_persistent_clock(&ts_new);
+ clockevents_resume();
clocksource_resume();
- write_seqlock_irqsave(&xtime_lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ /*
+ * After system resumes, we need to calculate the suspended time and
+ * compensate it for the OS time. There are 3 sources that could be
+ * used: Nonstop clocksource during suspend, persistent clock and rtc
+ * device.
+ *
+ * One specific platform may have 1 or 2 or all of them, and the
+ * preference will be:
+ * suspend-nonstop clocksource -> persistent clock -> rtc
+ * The less preferred source will only be tried if there is no better
+ * usable source. The rtc part is handled separately in rtc core code.
+ */
+ cycle_now = clock->read(clock);
+ if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
+ cycle_now > clock->cycle_last) {
+ u64 num, max = ULLONG_MAX;
+ u32 mult = clock->mult;
+ u32 shift = clock->shift;
+ s64 nsec = 0;
+
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
- ts = timespec_sub(ts, timekeeping_suspend_time);
- __timekeeping_inject_sleeptime(&ts);
+ /*
+ * "cycle_delta * mutl" may cause 64 bits overflow, if the
+ * suspended time is too long. In that case we need do the
+ * 64 bits math carefully
+ */
+ do_div(max, mult);
+ if (cycle_delta > max) {
+ num = div64_u64(cycle_delta, max);
+ nsec = (((u64) max * mult) >> shift) * num;
+ cycle_delta -= num * max;
+ }
+ nsec += ((u64) cycle_delta * mult) >> shift;
+
+ ts_delta = ns_to_timespec(nsec);
+ suspendtime_found = true;
+ } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
+ ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
+ suspendtime_found = true;
}
- /* re-base the last cycle value */
- timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
- timekeeper.ntp_error = 0;
+
+ if (suspendtime_found)
+ __timekeeping_inject_sleeptime(tk, &ts_delta);
+
+ /* Re-base the last cycle value */
+ tk->cycle_last = clock->cycle_last = cycle_now;
+ tk->ntp_error = 0;
timekeeping_suspended = 0;
- write_sequnlock_irqrestore(&xtime_lock, flags);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
touch_softlockup_watchdog();
@@ -695,14 +988,24 @@ static void timekeeping_resume(void)
static int timekeeping_suspend(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
struct timespec delta, delta_delta;
static struct timespec old_delta;
read_persistent_clock(&timekeeping_suspend_time);
- write_seqlock_irqsave(&xtime_lock, flags);
- timekeeping_forward_now();
+ /*
+ * On some systems the persistent_clock can not be detected at
+ * timekeeping_init by its return value, so if we see a valid
+ * value returned, update the persistent_clock_exists flag.
+ */
+ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
+ persistent_clock_exist = true;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+ timekeeping_forward_now(tk);
timekeeping_suspended = 1;
/*
@@ -711,7 +1014,7 @@ static int timekeeping_suspend(void)
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
- delta = timespec_sub(xtime, timekeeping_suspend_time);
+ delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
delta_delta = timespec_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
@@ -724,10 +1027,14 @@ static int timekeeping_suspend(void)
timekeeping_suspend_time =
timespec_add(timekeeping_suspend_time, delta_delta);
}
- write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ timekeeping_update(tk, TK_MIRROR);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
clocksource_suspend();
+ clockevents_suspend();
return 0;
}
@@ -750,7 +1057,8 @@ device_initcall(timekeeping_init_ops);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
+ s64 error, s64 *interval,
s64 *offset)
{
s64 tick_error, i;
@@ -766,7 +1074,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
* here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
- error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = abs(error2);
for (look_ahead = 0; error2 > 0; look_ahead++)
error2 >>= 2;
@@ -775,8 +1083,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
- tick_error -= timekeeper.xtime_interval >> 1;
+ tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
+ tick_error -= tk->xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;
/* Finally calculate the adjustment shift value. */
@@ -801,13 +1109,13 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
* this is optimized for the most common adjustments of -1,0,1,
* for other values we can do a bit more work.
*/
-static void timekeeping_adjust(s64 offset)
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- s64 error, interval = timekeeper.cycle_interval;
+ s64 error, interval = tk->cycle_interval;
int adj;
/*
- * The point of this is to check if the error is greater then half
+ * The point of this is to check if the error is greater than half
* an interval.
*
* First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -815,52 +1123,46 @@ static void timekeeping_adjust(s64 offset)
* Note we subtract one in the shift, so that error is really error*2.
* This "saves" dividing(shifting) interval twice, but keeps the
* (error > interval) comparison as still measuring if error is
- * larger then half an interval.
+ * larger than half an interval.
*
* Note: It does not "save" on aggravation when reading the code.
*/
- error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
+ error = tk->ntp_error >> (tk->ntp_error_shift - 1);
if (error > interval) {
/*
* We now divide error by 4(via shift), which checks if
- * the error is greater then twice the interval.
+ * the error is greater than twice the interval.
* If it is greater, we need a bigadjust, if its smaller,
* we can adjust by 1.
*/
error >>= 2;
- /*
- * XXX - In update_wall_time, we round up to the next
- * nanosecond, and store the amount rounded up into
- * the error. This causes the likely below to be unlikely.
- *
- * The proper fix is to avoid rounding up by using
- * the high precision timekeeper.xtime_nsec instead of
- * xtime.tv_nsec everywhere. Fixing this will take some
- * time.
- */
if (likely(error <= interval))
adj = 1;
else
- adj = timekeeping_bigadjust(error, &interval, &offset);
- } else if (error < -interval) {
- /* See comment above, this is just switched for the negative */
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else
- adj = timekeeping_bigadjust(error, &interval, &offset);
- } else /* No adjustment needed */
- return;
+ adj = timekeeping_bigadjust(tk, error, &interval, &offset);
+ } else {
+ if (error < -interval) {
+ /* See comment above, this is just switched for the negative */
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else {
+ adj = timekeeping_bigadjust(tk, error, &interval, &offset);
+ }
+ } else {
+ goto out_adjust;
+ }
+ }
- WARN_ONCE(timekeeper.clock->maxadj &&
- (timekeeper.mult + adj > timekeeper.clock->mult +
- timekeeper.clock->maxadj),
- "Adjusting %s more then 11%% (%ld vs %ld)\n",
- timekeeper.clock->name, (long)timekeeper.mult + adj,
- (long)timekeeper.clock->mult +
- timekeeper.clock->maxadj);
+ if (unlikely(tk->clock->maxadj &&
+ (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
+ printk_deferred_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
+ tk->clock->name, (long)tk->mult + adj,
+ (long)tk->clock->mult + tk->clock->maxadj);
+ }
/*
* So the following can be confusing.
*
@@ -910,13 +1212,72 @@ static void timekeeping_adjust(s64 offset)
*
* XXX - TODO: Doc ntp_error calculation.
*/
- timekeeper.mult += adj;
- timekeeper.xtime_interval += interval;
- timekeeper.xtime_nsec -= offset;
- timekeeper.ntp_error -= (interval - offset) <<
- timekeeper.ntp_error_shift;
+ tk->mult += adj;
+ tk->xtime_interval += interval;
+ tk->xtime_nsec -= offset;
+ tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+
+out_adjust:
+ /*
+ * It may be possible that when we entered this function, xtime_nsec
+ * was very small. Further, if we're slightly speeding the clocksource
+ * in the code above, its possible the required corrective factor to
+ * xtime_nsec could cause it to underflow.
+ *
+ * Now, since we already accumulated the second, cannot simply roll
+ * the accumulated second back, since the NTP subsystem has been
+ * notified via second_overflow. So instead we push xtime_nsec forward
+ * by the amount we underflowed, and add that amount into the error.
+ *
+ * We'll correct this error next time through this function, when
+ * xtime_nsec is not as small.
+ */
+ if (unlikely((s64)tk->xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->xtime_nsec;
+ tk->xtime_nsec = 0;
+ tk->ntp_error += neg << tk->ntp_error_shift;
+ }
+
}
+/**
+ * accumulate_nsecs_to_secs - Accumulates nsecs into secs
+ *
+ * Helper function that accumulates a the nsecs greater then a second
+ * from the xtime_nsec field to the xtime_secs field.
+ * It also calls into the NTP code to handle leapsecond processing.
+ *
+ */
+static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+ unsigned int clock_set = 0;
+
+ while (tk->xtime_nsec >= nsecps) {
+ int leap;
+
+ tk->xtime_nsec -= nsecps;
+ tk->xtime_sec++;
+
+ /* Figure out if its a leap sec and apply if needed */
+ leap = second_overflow(tk->xtime_sec);
+ if (unlikely(leap)) {
+ struct timespec ts;
+
+ tk->xtime_sec += leap;
+
+ ts.tv_sec = leap;
+ ts.tv_nsec = 0;
+ tk_set_wall_to_mono(tk,
+ timespec_sub(tk->wall_to_monotonic, ts));
+
+ __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
+
+ clock_set = TK_CLOCK_WAS_SET;
+ }
+ }
+ return clock_set;
+}
/**
* logarithmic_accumulation - shifted accumulation of cycles
@@ -927,137 +1288,157 @@ static void timekeeping_adjust(s64 offset)
*
* Returns the unconsumed cycles.
*/
-static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
+ u32 shift,
+ unsigned int *clock_set)
{
- u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+ cycle_t interval = tk->cycle_interval << shift;
u64 raw_nsecs;
/* If the offset is smaller then a shifted interval, do nothing */
- if (offset < timekeeper.cycle_interval<<shift)
+ if (offset < interval)
return offset;
/* Accumulate one shifted interval */
- offset -= timekeeper.cycle_interval << shift;
- timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
-
- timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
- while (timekeeper.xtime_nsec >= nsecps) {
- timekeeper.xtime_nsec -= nsecps;
- xtime.tv_sec++;
- second_overflow();
- }
+ offset -= interval;
+ tk->cycle_last += interval;
+
+ tk->xtime_nsec += tk->xtime_interval << shift;
+ *clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = timekeeper.raw_interval << shift;
- raw_nsecs += raw_time.tv_nsec;
+ raw_nsecs = (u64)tk->raw_interval << shift;
+ raw_nsecs += tk->raw_time.tv_nsec;
if (raw_nsecs >= NSEC_PER_SEC) {
u64 raw_secs = raw_nsecs;
raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- raw_time.tv_sec += raw_secs;
+ tk->raw_time.tv_sec += raw_secs;
}
- raw_time.tv_nsec = raw_nsecs;
+ tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
- timekeeper.ntp_error += tick_length << shift;
- timekeeper.ntp_error -=
- (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
- (timekeeper.ntp_error_shift + shift);
+ tk->ntp_error += ntp_tick_length() << shift;
+ tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
+ (tk->ntp_error_shift + shift);
return offset;
}
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+static inline void old_vsyscall_fixup(struct timekeeper *tk)
+{
+ s64 remainder;
+
+ /*
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistnecies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
+ * users are removed, this can be killed.
+ */
+ remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
+ tk->xtime_nsec -= remainder;
+ tk->xtime_nsec += 1ULL << tk->shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
+}
+#else
+#define old_vsyscall_fixup(tk)
+#endif
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
- * Called from the timer interrupt, must hold a write on xtime_lock.
*/
-static void update_wall_time(void)
+void update_wall_time(void)
{
struct clocksource *clock;
+ struct timekeeper *real_tk = &timekeeper;
+ struct timekeeper *tk = &shadow_timekeeper;
cycle_t offset;
int shift = 0, maxshift;
+ unsigned int clock_set = 0;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
- return;
+ goto out;
- clock = timekeeper.clock;
+ clock = real_tk->clock;
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
- offset = timekeeper.cycle_interval;
+ offset = real_tk->cycle_interval;
#else
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
- timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
+
+ /* Check if there's really nothing to do */
+ if (offset < real_tk->cycle_interval)
+ goto out;
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
* we calculate the largest doubling multiple of cycle_intervals
- * that is smaller then the offset. We then accumulate that
+ * that is smaller than the offset. We then accumulate that
* chunk in one go, and then try to consume the next smaller
* doubled multiple.
*/
- shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+ shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
- /* Bound shift to one less then what overflows tick_length */
- maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
- while (offset >= timekeeper.cycle_interval) {
- offset = logarithmic_accumulation(offset, shift);
- if(offset < timekeeper.cycle_interval<<shift)
+ while (offset >= tk->cycle_interval) {
+ offset = logarithmic_accumulation(tk, offset, shift,
+ &clock_set);
+ if (offset < tk->cycle_interval<<shift)
shift--;
}
/* correct the clock when NTP error is too big */
- timekeeping_adjust(offset);
+ timekeeping_adjust(tk, offset);
/*
- * Since in the loop above, we accumulate any amount of time
- * in xtime_nsec over a second into xtime.tv_sec, its possible for
- * xtime_nsec to be fairly small after the loop. Further, if we're
- * slightly speeding the clocksource up in timekeeping_adjust(),
- * its possible the required corrective factor to xtime_nsec could
- * cause it to underflow.
- *
- * Now, we cannot simply roll the accumulated second back, since
- * the NTP subsystem has been notified via second_overflow. So
- * instead we push xtime_nsec forward by the amount we underflowed,
- * and add that amount into the error.
- *
- * We'll correct this error next time through this function, when
- * xtime_nsec is not as small.
+ * XXX This can be killed once everyone converts
+ * to the new update_vsyscall.
*/
- if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
- s64 neg = -(s64)timekeeper.xtime_nsec;
- timekeeper.xtime_nsec = 0;
- timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
- }
-
+ old_vsyscall_fixup(tk);
/*
- * Store full nanoseconds into xtime after rounding it up and
- * add the remainder to the error difference.
+ * Finally, make sure that after the rounding
+ * xtime_nsec isn't larger than NSEC_PER_SEC
*/
- xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
- timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
- timekeeper.ntp_error += timekeeper.xtime_nsec <<
- timekeeper.ntp_error_shift;
+ clock_set |= accumulate_nsecs_to_secs(tk);
+ write_seqcount_begin(&timekeeper_seq);
+ /* Update clock->cycle_last with the new value */
+ clock->cycle_last = tk->cycle_last;
/*
- * Finally, make sure that after the rounding
- * xtime.tv_nsec isn't larger then NSEC_PER_SEC
+ * Update the real timekeeper.
+ *
+ * We could avoid this memcpy by switching pointers, but that
+ * requires changes to all other timekeeper usage sites as
+ * well, i.e. move the timekeeper pointer getter into the
+ * spinlocked/seqcount protected sections. And we trade this
+ * memcpy under the timekeeper_seq against one before we start
+ * updating.
*/
- if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
- xtime.tv_nsec -= NSEC_PER_SEC;
- xtime.tv_sec++;
- second_overflow();
- }
-
- /* check to see if there is a new clocksource to use */
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ memcpy(real_tk, tk, sizeof(*tk));
+ timekeeping_update(real_tk, clock_set);
+ write_seqcount_end(&timekeeper_seq);
+out:
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (clock_set)
+ /* Have to call _delayed version, since in irq context*/
+ clock_was_set_delayed();
}
/**
@@ -1073,16 +1454,18 @@ static void update_wall_time(void)
*/
void getboottime(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec boottime = {
- .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
- .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
+ .tv_sec = tk->wall_to_monotonic.tv_sec +
+ tk->total_sleep_time.tv_sec,
+ .tv_nsec = tk->wall_to_monotonic.tv_nsec +
+ tk->total_sleep_time.tv_nsec
};
set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
}
EXPORT_SYMBOL_GPL(getboottime);
-
/**
* get_monotonic_boottime - Returns monotonic time since boot
* @ts: pointer to the timespec to be set
@@ -1094,23 +1477,25 @@ EXPORT_SYMBOL_GPL(getboottime);
*/
void get_monotonic_boottime(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec tomono, sleep;
+ s64 nsec;
unsigned int seq;
- s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&xtime_lock);
- *ts = xtime;
- tomono = wall_to_monotonic;
- sleep = total_sleep_time;
- nsecs = timekeeping_get_ns();
+ seq = read_seqcount_begin(&timekeeper_seq);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(tk);
+ tomono = tk->wall_to_monotonic;
+ sleep = tk->total_sleep_time;
- } while (read_seqretry(&xtime_lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
- ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+ ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
}
EXPORT_SYMBOL_GPL(get_monotonic_boottime);
@@ -1137,31 +1522,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
*/
void monotonic_to_bootbased(struct timespec *ts)
{
- *ts = timespec_add(*ts, total_sleep_time);
+ struct timekeeper *tk = &timekeeper;
+
+ *ts = timespec_add(*ts, tk->total_sleep_time);
}
EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
unsigned long get_seconds(void)
{
- return xtime.tv_sec;
+ struct timekeeper *tk = &timekeeper;
+
+ return tk->xtime_sec;
}
EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- return xtime;
+ struct timekeeper *tk = &timekeeper;
+
+ return tk_xtime(tk);
}
struct timespec current_kernel_time(void)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec now;
unsigned long seq;
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
- now = xtime;
- } while (read_seqretry(&xtime_lock, seq));
+ now = tk_xtime(tk);
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
return now;
}
@@ -1169,15 +1561,16 @@ EXPORT_SYMBOL(current_kernel_time);
struct timespec get_monotonic_coarse(void)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec now, mono;
unsigned long seq;
do {
- seq = read_seqbegin(&xtime_lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
- now = xtime;
- mono = wall_to_monotonic;
- } while (read_seqretry(&xtime_lock, seq));
+ now = tk_xtime(tk);
+ mono = tk->wall_to_monotonic;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
@@ -1185,14 +1578,11 @@ struct timespec get_monotonic_coarse(void)
}
/*
- * The 64-bit jiffies value is not atomic - you MUST NOT read it
- * without sampling the sequence number in xtime_lock.
- * jiffies is defined in the linker script...
+ * Must hold jiffies_lock
*/
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
- update_wall_time();
calc_global_load(ticks);
}
@@ -1206,30 +1596,138 @@ void do_timer(unsigned long ticks)
void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
struct timespec *wtom, struct timespec *sleep)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
do {
- seq = read_seqbegin(&xtime_lock);
- *xtim = xtime;
- *wtom = wall_to_monotonic;
- *sleep = total_sleep_time;
- } while (read_seqretry(&xtime_lock, seq));
+ seq = read_seqcount_begin(&timekeeper_seq);
+ *xtim = tk_xtime(tk);
+ *wtom = tk->wall_to_monotonic;
+ *sleep = tk->total_sleep_time;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * ktime_get_update_offsets - hrtimer helper
+ * @offs_real: pointer to storage for monotonic -> realtime offset
+ * @offs_boot: pointer to storage for monotonic -> boottime offset
+ * @offs_tai: pointer to storage for monotonic -> clock tai offset
+ *
+ * Returns current monotonic time and updates the offsets
+ * Called from hrtimer_interrupt() or retrigger_next_event()
+ */
+ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
+ ktime_t *offs_tai)
+{
+ struct timekeeper *tk = &timekeeper;
+ ktime_t now;
+ unsigned int seq;
+ u64 secs, nsecs;
+
+ do {
+ seq = read_seqcount_begin(&timekeeper_seq);
+
+ secs = tk->xtime_sec;
+ nsecs = timekeeping_get_ns(tk);
+
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
+ now = ktime_add_ns(ktime_set(secs, 0), nsecs);
+ now = ktime_sub(now, *offs_real);
+ return now;
+}
+#endif
+
/**
* ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
*/
ktime_t ktime_get_monotonic_offset(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
struct timespec wtom;
do {
- seq = read_seqbegin(&xtime_lock);
- wtom = wall_to_monotonic;
- } while (read_seqretry(&xtime_lock, seq));
+ seq = read_seqcount_begin(&timekeeper_seq);
+ wtom = tk->wall_to_monotonic;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
return timespec_to_ktime(wtom);
}
+EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
+
+/**
+ * do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ */
+int do_adjtimex(struct timex *txc)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long flags;
+ struct timespec ts;
+ s32 orig_tai, tai;
+ int ret;
+
+ /* Validate the data before disabling interrupts */
+ ret = ntp_validate_timex(txc);
+ if (ret)
+ return ret;
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec delta;
+ delta.tv_sec = txc->time.tv_sec;
+ delta.tv_nsec = txc->time.tv_usec;
+ if (!(txc->modes & ADJ_NANO))
+ delta.tv_nsec *= 1000;
+ ret = timekeeping_inject_offset(&delta);
+ if (ret)
+ return ret;
+ }
+
+ getnstimeofday(&ts);
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ orig_tai = tai = tk->tai_offset;
+ ret = __do_adjtimex(txc, &ts, &tai);
+
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tk, tai);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ }
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ if (tai != orig_tai)
+ clock_was_set();
+
+ ntp_notify_cmos_timer();
+
+ return ret;
+}
+
+#ifdef CONFIG_NTP_PPS
+/**
+ * hardpps() - Accessor function to NTP __hardpps function
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ __hardpps(phase_ts, raw_ts);
+
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+#endif
/**
* xtime_update() - advances the timekeeping infrastructure
@@ -1239,7 +1737,8 @@ ktime_t ktime_get_monotonic_offset(void)
*/
void xtime_update(unsigned long ticks)
{
- write_seqlock(&xtime_lock);
+ write_seqlock(&jiffies_lock);
do_timer(ticks);
- write_sequnlock(&xtime_lock);
+ write_sequnlock(&jiffies_lock);
+ update_wall_time();
}
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
new file mode 100644
index 00000000000..4d54f97558d
--- /dev/null
+++ b/kernel/time/timekeeping_debug.c
@@ -0,0 +1,74 @@
+/*
+ * debugfs file to track time spent in suspend
+ *
+ * Copyright (c) 2011, Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+
+#include "timekeeping_internal.h"
+
+static unsigned int sleep_time_bin[32] = {0};
+
+static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
+{
+ unsigned int bin;
+ seq_puts(s, " time (secs) count\n");
+ seq_puts(s, "------------------------------\n");
+ for (bin = 0; bin < 32; bin++) {
+ if (sleep_time_bin[bin] == 0)
+ continue;
+ seq_printf(s, "%10u - %-10u %4u\n",
+ bin ? 1 << (bin - 1) : 0, 1 << bin,
+ sleep_time_bin[bin]);
+ }
+ return 0;
+}
+
+static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, tk_debug_show_sleep_time, NULL);
+}
+
+static const struct file_operations tk_debug_sleep_time_fops = {
+ .open = tk_debug_sleep_time_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init tk_debug_sleep_time_init(void)
+{
+ struct dentry *d;
+
+ d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
+ &tk_debug_sleep_time_fops);
+ if (!d) {
+ pr_err("Failed to create sleep_time debug file\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+late_initcall(tk_debug_sleep_time_init);
+
+void tk_debug_account_sleep_time(struct timespec *t)
+{
+ sleep_time_bin[fls(t->tv_sec)]++;
+}
+
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
new file mode 100644
index 00000000000..13323ea08ff
--- /dev/null
+++ b/kernel/time/timekeeping_internal.h
@@ -0,0 +1,14 @@
+#ifndef _TIMEKEEPING_INTERNAL_H
+#define _TIMEKEEPING_INTERNAL_H
+/*
+ * timekeeping debug functions
+ */
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_FS
+extern void tk_debug_account_sleep_time(struct timespec *t);
+#else
+#define tk_debug_account_sleep_time(x)
+#endif
+
+#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3258455549f..61ed862cdd3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@
#include <asm/uaccess.h>
+
+struct timer_list_iter {
+ int cpu;
+ bool second_pass;
+ u64 now;
+};
+
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- SEQ_printf(m, "\n");
SEQ_printf(m, "cpu: %d\n", cpu);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
SEQ_printf(m, " clock %d:\n", i);
@@ -167,7 +173,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
P(nohz_mode);
- P_ns(idle_tick);
+ P_ns(last_tick);
P(tick_stopped);
P(idle_jiffies);
P(idle_calls);
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
#undef P
#undef P_ns
+ SEQ_printf(m, "\n");
}
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
- SEQ_printf(m, "\n");
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+ SEQ_printf(m, "\n");
}
-static void timer_list_show_tickdevices(struct seq_file *m)
+static void timer_list_show_tickdevices_header(struct seq_file *m)
{
- int cpu;
-
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,47 +251,111 @@ static void timer_list_show_tickdevices(struct seq_file *m)
#endif
SEQ_printf(m, "\n");
#endif
- for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
- SEQ_printf(m, "\n");
}
-#else
-static void timer_list_show_tickdevices(struct seq_file *m) { }
#endif
+static inline void timer_list_header(struct seq_file *m, u64 now)
+{
+ SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ SEQ_printf(m, "\n");
+}
+
static int timer_list_show(struct seq_file *m, void *v)
{
+ struct timer_list_iter *iter = v;
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, iter->now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Timer List Version: v0.6\n");
- SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
- SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ timer_list_header(NULL, now);
for_each_online_cpu(cpu)
- print_cpu(m, cpu, now);
+ print_cpu(NULL, cpu, now);
- SEQ_printf(m, "\n");
- timer_list_show_tickdevices(m);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ timer_list_show_tickdevices_header(NULL);
+ for_each_online_cpu(cpu)
+ print_tickdevice(NULL, tick_get_device(cpu), cpu);
+#endif
+ return;
+}
- return 0;
+static void *move_iter(struct timer_list_iter *iter, loff_t offset)
+{
+ for (; offset; offset--) {
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ if (iter->cpu >= nr_cpu_ids) {
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
+#else
+ return NULL;
+#endif
+ }
+ }
+ return iter;
}
-void sysrq_timer_list_show(void)
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset)
+ iter->now = ktime_to_ns(ktime_get());
+ iter->cpu = -1;
+ iter->second_pass = false;
+ return move_iter(iter, *offset);
+}
+
+static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+ ++*offset;
+ return move_iter(iter, 1);
+}
+
+static void timer_list_stop(struct seq_file *seq, void *v)
{
- timer_list_show(NULL, NULL);
}
+static const struct seq_operations timer_list_sops = {
+ .start = timer_list_start,
+ .next = timer_list_next,
+ .stop = timer_list_stop,
+ .show = timer_list_show,
+};
+
static int timer_list_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, timer_list_show, NULL);
+ return seq_open_private(filp, &timer_list_sops,
+ sizeof(struct timer_list_iter));
}
static const struct file_operations timer_list_fops = {
.open = timer_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release_private,
};
static int __init init_timer_list_procfs(void)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b55..1fb08f21302 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
period = ktime_to_timespec(time);
ms = period.tv_nsec / 1000000;
- seq_puts(m, "Timer Stats Version: v0.2\n");
+ seq_puts(m, "Timer Stats Version: v0.3\n");
seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n",
- atomic_read(&overflow_count));
+ seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
+ seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
- if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
seq_printf(m, "%4luD, %5d %-16s ",
entry->count, entry->pid, entry->comm);
} else {
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 00000000000..511bdf2cafd
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
+scale=0
+
+define gcd(a,b) {
+ auto t;
+ while (b) {
+ t = b;
+ b = a % b;
+ a = t;
+ }
+ return a;
+}
+
+/* Division by reciprocal multiplication. */
+define fmul(b,n,d) {
+ return (2^b*n+d-1)/d;
+}
+
+/* Adjustment factor when a ceiling value is used. Use as:
+ (imul * n) + (fmulxx * n + fadjxx) >> xx) */
+define fadj(b,n,d) {
+ auto v;
+ d = d/gcd(n,d);
+ v = 2^b*(d-1)/d;
+ return v;
+}
+
+/* Compute the appropriate mul/adj values as well as a shift count,
+ which brings the mul value into the range 2^b-1 <= x < 2^b. Such
+ a shift value will be correct in the signed integer range and off
+ by at most one in the upper half of the unsigned range. */
+define fmuls(b,n,d) {
+ auto s, m;
+ for (s = 0; 1; s++) {
+ m = fmul(s,n,d);
+ if (m >= 2^(b-1))
+ return s;
+ }
+ return 0;
+}
+
+define timeconst(hz) {
+ print "/* Automatically generated by kernel/timeconst.bc */\n"
+ print "/* Time conversion constants for HZ == ", hz, " */\n"
+ print "\n"
+
+ print "#ifndef KERNEL_TIMECONST_H\n"
+ print "#define KERNEL_TIMECONST_H\n\n"
+
+ print "#include <linux/param.h>\n"
+ print "#include <linux/types.h>\n\n"
+
+ print "#if HZ != ", hz, "\n"
+ print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#endif\n\n"
+
+ if (hz < 2) {
+ print "#error Totally bogus HZ value!\n"
+ } else {
+ s=fmuls(32,1000,hz)
+ obase=16
+ print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
+ print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000)
+ obase=16
+ print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
+ print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
+ obase=10
+ print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000)
+ print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
+ print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
+ print "\n"
+
+ s=fmuls(32,1000000,hz)
+ obase=16
+ print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
+ print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_USEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000000)
+ obase=16
+ print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
+ print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
+ obase=10
+ print "#define USEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000000)
+ print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
+ print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+ print "\n"
+
+ print "#endif /* KERNEL_TIMECONST_H */\n"
+ }
+ halt
+}
+
+timeconst(hz)
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index eb51d76e058..00000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,378 +0,0 @@
-#!/usr/bin/perl
-# -----------------------------------------------------------------------
-#
-# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
-#
-# This file is part of the Linux kernel, and is made available under
-# the terms of the GNU General Public License version 2 or (at your
-# option) any later version; incorporated herein by reference.
-#
-# -----------------------------------------------------------------------
-#
-
-#
-# Usage: timeconst.pl HZ > timeconst.h
-#
-
-# Precomputed values for systems without Math::BigInt
-# Generated by:
-# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
-%canned_values = (
- 24 => [
- '0xa6aaaaab','0x2aaaaaa',26,
- 125,3,
- '0xc49ba5e4','0x1fbe76c8b4',37,
- 3,125,
- '0xa2c2aaab','0xaaaa',16,
- 125000,3,
- '0xc9539b89','0x7fffbce4217d',47,
- 3,125000,
- ], 32 => [
- '0xfa000000','0x6000000',27,
- 125,4,
- '0x83126e98','0xfdf3b645a',36,
- 4,125,
- '0xf4240000','0x0',17,
- 31250,1,
- '0x8637bd06','0x3fff79c842fa',46,
- 1,31250,
- ], 48 => [
- '0xa6aaaaab','0x6aaaaaa',27,
- 125,6,
- '0xc49ba5e4','0xfdf3b645a',36,
- 6,125,
- '0xa2c2aaab','0x15555',17,
- 62500,3,
- '0xc9539b89','0x3fffbce4217d',46,
- 3,62500,
- ], 64 => [
- '0xfa000000','0xe000000',28,
- 125,8,
- '0x83126e98','0x7ef9db22d',35,
- 8,125,
- '0xf4240000','0x0',18,
- 15625,1,
- '0x8637bd06','0x1fff79c842fa',45,
- 1,15625,
- ], 100 => [
- '0xa0000000','0x0',28,
- 10,1,
- '0xcccccccd','0x733333333',35,
- 1,10,
- '0x9c400000','0x0',18,
- 10000,1,
- '0xd1b71759','0x1fff2e48e8a7',45,
- 1,10000,
- ], 122 => [
- '0x8325c53f','0xfbcda3a',28,
- 500,61,
- '0xf9db22d1','0x7fbe76c8b',35,
- 61,500,
- '0x8012e2a0','0x3ef36',18,
- 500000,61,
- '0xffda4053','0x1ffffbce4217',45,
- 61,500000,
- ], 128 => [
- '0xfa000000','0x1e000000',29,
- 125,16,
- '0x83126e98','0x3f7ced916',34,
- 16,125,
- '0xf4240000','0x40000',19,
- 15625,2,
- '0x8637bd06','0xfffbce4217d',44,
- 2,15625,
- ], 200 => [
- '0xa0000000','0x0',29,
- 5,1,
- '0xcccccccd','0x333333333',34,
- 1,5,
- '0x9c400000','0x0',19,
- 5000,1,
- '0xd1b71759','0xfff2e48e8a7',44,
- 1,5000,
- ], 250 => [
- '0x80000000','0x0',29,
- 4,1,
- '0x80000000','0x180000000',33,
- 1,4,
- '0xfa000000','0x0',20,
- 4000,1,
- '0x83126e98','0x7ff7ced9168',43,
- 1,4000,
- ], 256 => [
- '0xfa000000','0x3e000000',30,
- 125,32,
- '0x83126e98','0x1fbe76c8b',33,
- 32,125,
- '0xf4240000','0xc0000',20,
- 15625,4,
- '0x8637bd06','0x7ffde7210be',43,
- 4,15625,
- ], 300 => [
- '0xd5555556','0x2aaaaaaa',30,
- 10,3,
- '0x9999999a','0x1cccccccc',33,
- 3,10,
- '0xd0555556','0xaaaaa',20,
- 10000,3,
- '0x9d495183','0x7ffcb923a29',43,
- 3,10000,
- ], 512 => [
- '0xfa000000','0x7e000000',31,
- 125,64,
- '0x83126e98','0xfdf3b645',32,
- 64,125,
- '0xf4240000','0x1c0000',21,
- 15625,8,
- '0x8637bd06','0x3ffef39085f',42,
- 8,15625,
- ], 1000 => [
- '0x80000000','0x0',31,
- 1,1,
- '0x80000000','0x0',31,
- 1,1,
- '0xfa000000','0x0',22,
- 1000,1,
- '0x83126e98','0x1ff7ced9168',41,
- 1,1000,
- ], 1024 => [
- '0xfa000000','0xfe000000',32,
- 125,128,
- '0x83126e98','0x7ef9db22',31,
- 128,125,
- '0xf4240000','0x3c0000',22,
- 15625,16,
- '0x8637bd06','0x1fff79c842f',41,
- 16,15625,
- ], 1200 => [
- '0xd5555556','0xd5555555',32,
- 5,6,
- '0x9999999a','0x66666666',31,
- 6,5,
- '0xd0555556','0x2aaaaa',22,
- 2500,3,
- '0x9d495183','0x1ffcb923a29',41,
- 3,2500,
- ]
-);
-
-$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
-
-sub bint($)
-{
- my($x) = @_;
- return Math::BigInt->new($x);
-}
-
-#
-# Constants for division by reciprocal multiplication.
-# (bits, numerator, denominator)
-#
-sub fmul($$$)
-{
- my ($b,$n,$d) = @_;
-
- $n = bint($n);
- $d = bint($d);
-
- return scalar (($n << $b)+$d-bint(1))/$d;
-}
-
-sub fadj($$$)
-{
- my($b,$n,$d) = @_;
-
- $n = bint($n);
- $d = bint($d);
-
- $d = $d/bgcd($n, $d);
- return scalar (($d-bint(1)) << $b)/$d;
-}
-
-sub fmuls($$$) {
- my($b,$n,$d) = @_;
- my($s,$m);
- my($thres) = bint(1) << ($b-1);
-
- $n = bint($n);
- $d = bint($d);
-
- for ($s = 0; 1; $s++) {
- $m = fmul($s,$n,$d);
- return $s if ($m >= $thres);
- }
- return 0;
-}
-
-# Generate a hex value if the result fits in 64 bits;
-# otherwise skip.
-sub bignum_hex($) {
- my($x) = @_;
- my $s = $x->as_hex();
-
- return (length($s) > 18) ? undef : $s;
-}
-
-# Provides mul, adj, and shr factors for a specific
-# (bit, time, hz) combination
-sub muladj($$$) {
- my($b, $t, $hz) = @_;
- my $s = fmuls($b, $t, $hz);
- my $m = fmul($s, $t, $hz);
- my $a = fadj($s, $t, $hz);
- return (bignum_hex($m), bignum_hex($a), $s);
-}
-
-# Provides numerator, denominator values
-sub numden($$) {
- my($n, $d) = @_;
- my $g = bgcd($n, $d);
- return ($n/$g, $d/$g);
-}
-
-# All values for a specific (time, hz) combo
-sub conversions($$) {
- my ($t, $hz) = @_;
- my @val = ();
-
- # HZ_TO_xx
- push(@val, muladj(32, $t, $hz));
- push(@val, numden($t, $hz));
-
- # xx_TO_HZ
- push(@val, muladj(32, $hz, $t));
- push(@val, numden($hz, $t));
-
- return @val;
-}
-
-sub compute_values($) {
- my($hz) = @_;
- my @val = ();
- my $s, $m, $a, $g;
-
- if (!$has_bigint) {
- die "$0: HZ == $hz not canned and ".
- "Math::BigInt not available\n";
- }
-
- # MSEC conversions
- push(@val, conversions(1000, $hz));
-
- # USEC conversions
- push(@val, conversions(1000000, $hz));
-
- return @val;
-}
-
-sub outputval($$)
-{
- my($name, $val) = @_;
- my $csuf;
-
- if (defined($val)) {
- if ($name !~ /SHR/) {
- $val = "U64_C($val)";
- }
- printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
- }
-}
-
-sub output($@)
-{
- my($hz, @val) = @_;
- my $pfx, $bit, $suf, $s, $m, $a;
-
- print "/* Automatically generated by kernel/timeconst.pl */\n";
- print "/* Conversion constants for HZ == $hz */\n";
- print "\n";
- print "#ifndef KERNEL_TIMECONST_H\n";
- print "#define KERNEL_TIMECONST_H\n";
- print "\n";
-
- print "#include <linux/param.h>\n";
- print "#include <linux/types.h>\n";
-
- print "\n";
- print "#if HZ != $hz\n";
- print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
- print "#endif\n";
- print "\n";
-
- foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
- 'HZ_TO_USEC','USEC_TO_HZ') {
- foreach $bit (32) {
- foreach $suf ('MUL', 'ADJ', 'SHR') {
- outputval("${pfx}_$suf$bit", shift(@val));
- }
- }
- foreach $suf ('NUM', 'DEN') {
- outputval("${pfx}_$suf", shift(@val));
- }
- }
-
- print "\n";
- print "#endif /* KERNEL_TIMECONST_H */\n";
-}
-
-# Pretty-print Perl values
-sub perlvals(@) {
- my $v;
- my @l = ();
-
- foreach $v (@_) {
- if (!defined($v)) {
- push(@l, 'undef');
- } elsif ($v =~ /^0x/) {
- push(@l, "\'".$v."\'");
- } else {
- push(@l, $v.'');
- }
- }
- return join(',', @l);
-}
-
-($hz) = @ARGV;
-
-# Use this to generate the %canned_values structure
-if ($hz eq '--can') {
- shift(@ARGV);
- @hzlist = sort {$a <=> $b} (@ARGV);
-
- print "# Precomputed values for systems without Math::BigInt\n";
- print "# Generated by:\n";
- print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
- print "\%canned_values = (\n";
- my $pf = "\t";
- foreach $hz (@hzlist) {
- my @values = compute_values($hz);
- print "$pf$hz => [\n";
- while (scalar(@values)) {
- my $bit;
- foreach $bit (32) {
- my $m = shift(@values);
- my $a = shift(@values);
- my $s = shift(@values);
- print "\t\t", perlvals($m,$a,$s), ",\n";
- }
- my $n = shift(@values);
- my $d = shift(@values);
- print "\t\t", perlvals($n,$d), ",\n";
- }
- print "\t]";
- $pf = ', ';
- }
- print "\n);\n";
-} else {
- $hz += 0; # Force to number
- if ($hz < 1) {
- die "Usage: $0 HZ\n";
- }
-
- @val = @{$canned_values{$hz}};
- if (!defined(@val)) {
- @val = compute_values($hz);
- }
- output($hz, @val);
-}
-exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888..3bb01a323b2 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
/*
* linux/kernel/timer.c
*
- * Kernel internal timers, basic process system calls
+ * Kernel internal timers
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
@@ -39,7 +39,9 @@
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
#include <linux/slab.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -50,7 +52,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
@@ -63,6 +65,7 @@ EXPORT_SYMBOL(jiffies_64);
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
+#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
struct list_head vec[TVN_SIZE];
@@ -77,6 +80,8 @@ struct tvec_base {
struct timer_list *running_timer;
unsigned long timer_jiffies;
unsigned long next_timer;
+ unsigned long active_timers;
+ unsigned long all_timers;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -91,24 +96,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
- return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+ return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
}
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
{
- return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+ return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
}
-static inline void timer_set_deferrable(struct timer_list *timer)
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
{
- timer->base = TBASE_MAKE_DEFERRED(timer->base);
+ return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
}
static inline void
timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
{
- timer->base = (struct tvec_base *)((unsigned long)(new_base) |
- tbase_get_deferrable(timer->base));
+ unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
+
+ timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
}
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -144,9 +150,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
/* now that we have rounded, subtract the extra skew again */
j -= cpu * 3;
- if (j <= jiffies) /* rounding ate our timeout entirely; */
- return original;
- return j;
+ /*
+ * Make sure j is still in the future. Otherwise return the
+ * unmodified value.
+ */
+ return time_is_after_jiffies(j) ? j : original;
}
/**
@@ -330,7 +338,22 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+/*
+ * If the list is empty, catch up ->timer_jiffies to the current time.
+ * The caller must hold the tvec_base lock. Returns true if the list
+ * was empty and therefore ->timer_jiffies was updated.
+ */
+static bool catchup_timer_jiffies(struct tvec_base *base)
+{
+ if (!base->all_timers) {
+ base->timer_jiffies = jiffies;
+ return true;
+ }
+ return false;
+}
+
+static void
+__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
@@ -356,11 +379,12 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
} else {
int i;
- /* If the timeout is larger than 0xffffffff on 64-bit
- * architectures then we use the maximum timeout:
+ /* If the timeout is larger than MAX_TVAL (on 64-bit
+ * architectures or with CONFIG_BASE_SMALL=1) then we
+ * use the maximum timeout.
*/
- if (idx > 0xffffffffUL) {
- idx = 0xffffffffUL;
+ if (idx > MAX_TVAL) {
+ idx = MAX_TVAL;
expires = idx + base->timer_jiffies;
}
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
@@ -372,6 +396,21 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+{
+ (void)catchup_timer_jiffies(base);
+ __internal_add_timer(base, timer);
+ /*
+ * Update base->active_timers and base->next_timer
+ */
+ if (!tbase_get_deferrable(timer->base)) {
+ if (!base->active_timers++ ||
+ time_before(timer->expires, base->next_timer))
+ base->next_timer = timer->expires;
+ }
+ base->all_timers++;
+}
+
#ifdef CONFIG_TIMER_STATS
void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
{
@@ -548,16 +587,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
debug_object_assert_init(timer, &timer_debug_descr);
}
-static void __init_timer(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key);
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
+void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
- __init_timer(timer, name, key);
+ do_init_timer(timer, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
@@ -598,12 +635,13 @@ static inline void debug_assert_init(struct timer_list *timer)
debug_timer_assert_init(timer);
}
-static void __init_timer(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
{
+ struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
+
timer->entry.next = NULL;
- timer->base = __raw_get_cpu_var(tvec_bases);
+ timer->base = (void *)((unsigned long)base | flags);
timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
@@ -613,22 +651,10 @@ static void __init_timer(struct timer_list *timer,
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
-void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key,
- void (*function)(unsigned long),
- unsigned long data)
-{
- timer->function = function;
- timer->data = data;
- init_timer_on_stack_key(timer, name, key);
- timer_set_deferrable(timer);
-}
-EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
-
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @flags: timer flags
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
@@ -636,26 +662,15 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
+void init_timer_key(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
{
debug_init(timer);
- __init_timer(timer, name, key);
+ do_init_timer(timer, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
-void init_timer_deferrable_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
-{
- init_timer_key(timer, name, key);
- timer_set_deferrable(timer);
-}
-EXPORT_SYMBOL(init_timer_deferrable_key);
-
-static inline void detach_timer(struct timer_list *timer,
- int clear_pending)
+static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
struct list_head *entry = &timer->entry;
@@ -667,6 +682,33 @@ static inline void detach_timer(struct timer_list *timer,
entry->prev = LIST_POISON2;
}
+static inline void
+detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+{
+ detach_timer(timer, true);
+ if (!tbase_get_deferrable(timer->base))
+ base->active_timers--;
+ base->all_timers--;
+ (void)catchup_timer_jiffies(base);
+}
+
+static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+ bool clear_pending)
+{
+ if (!timer_pending(timer))
+ return 0;
+
+ detach_timer(timer, clear_pending);
+ if (!tbase_get_deferrable(timer->base)) {
+ base->active_timers--;
+ if (timer->expires == base->next_timer)
+ base->next_timer = base->timer_jiffies;
+ }
+ base->all_timers--;
+ (void)catchup_timer_jiffies(base);
+ return 1;
+}
+
/*
* We are using hashed locking: holding per_cpu(tvec_bases).lock
* means that all timers which are tied to this base via timer->base are
@@ -712,25 +754,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
base = lock_timer_base(timer, &flags);
- if (timer_pending(timer)) {
- detach_timer(timer, 0);
- if (timer->expires == base->next_timer &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = base->timer_jiffies;
- ret = 1;
- } else {
- if (pending_only)
- goto out_unlock;
- }
+ ret = detach_if_pending(timer, base, false);
+ if (!ret && pending_only)
+ goto out_unlock;
debug_activate(timer, expires);
- cpu = smp_processor_id();
-
-#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
- cpu = get_nohz_timer_target();
-#endif
+ cpu = get_nohz_timer_target(pinned);
new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) {
@@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
- if (time_before(timer->expires, base->next_timer) &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = timer->expires;
internal_add_timer(base, timer);
out_unlock:
@@ -811,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
bit = find_last_bit(&mask, BITS_PER_LONG);
- mask = (1 << bit) - 1;
+ mask = (1UL << bit) - 1;
expires_limit = expires_limit & ~(mask);
@@ -861,7 +888,13 @@ EXPORT_SYMBOL(mod_timer);
*
* mod_timer_pinned() is a way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
- * and not allow the timer to be migrated to a different CPU.
+ * and to ensure that the timer is scheduled on the current CPU.
+ *
+ * Note that this does not prevent the timer from being migrated
+ * when the current CPU goes offline. If this is a problem for
+ * you, use CPU-hotplug notifiers to handle it correctly, for
+ * example, cancelling the timer when the corresponding CPU goes
+ * offline.
*
* mod_timer_pinned(timer, expires) is equivalent to:
*
@@ -914,19 +947,23 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
debug_activate(timer, timer->expires);
- if (time_before(timer->expires, base->next_timer) &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = timer->expires;
internal_add_timer(base, timer);
/*
- * Check whether the other CPU is idle and needs to be
- * triggered to reevaluate the timer wheel when nohz is
- * active. We are protected against the other CPU fiddling
+ * Check whether the other CPU is in dynticks mode and needs
+ * to be triggered to reevaluate the timer wheel.
+ * We are protected against the other CPU fiddling
* with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to idle can not evaluate
- * the timer wheel.
+ * makes sure that a CPU on the way to stop its tick can not
+ * evaluate the timer wheel.
+ *
+ * Spare the IPI for deferrable timers on idle targets though.
+ * The next busy ticks will take care of it. Except full dynticks
+ * require special care against races with idle_cpu(), lets deal
+ * with that later.
*/
- wake_up_idle_cpu(cpu);
+ if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
+ wake_up_nohz_cpu(cpu);
+
spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
@@ -953,13 +990,7 @@ int del_timer(struct timer_list *timer)
timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
- if (timer_pending(timer)) {
- detach_timer(timer, 1);
- if (timer->expires == base->next_timer &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = base->timer_jiffies;
- ret = 1;
- }
+ ret = detach_if_pending(timer, base, true);
spin_unlock_irqrestore(&base->lock, flags);
}
@@ -984,19 +1015,10 @@ int try_to_del_timer_sync(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
- if (base->running_timer == timer)
- goto out;
-
- timer_stats_timer_clear_start_info(timer);
- ret = 0;
- if (timer_pending(timer)) {
- detach_timer(timer, 1);
- if (timer->expires == base->next_timer &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = base->timer_jiffies;
- ret = 1;
+ if (base->running_timer != timer) {
+ timer_stats_timer_clear_start_info(timer);
+ ret = detach_if_pending(timer, base, true);
}
-out:
spin_unlock_irqrestore(&base->lock, flags);
return ret;
@@ -1014,14 +1036,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*
* Synchronization rules: Callers must prevent restarting of the timer,
* otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
- * completion of the timer's handler. The timer's handler must not call
- * add_timer_on(). Upon exit the timer is not queued and the handler is
- * not running on any CPU.
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's
+ * handler. The timer's handler must not call add_timer_on(). Upon exit the
+ * timer is not queued and the handler is not running on any CPU.
*
- * Note: You must not hold locks that are held in interrupt context
- * while calling this function. Even if the lock has nothing to do
- * with the timer in question. Here's why:
+ * Note: For !irqsafe timers, you must not hold locks that are held in
+ * interrupt context while calling this function. Even if the lock has
+ * nothing to do with the timer in question. Here's why:
*
* CPU0 CPU1
* ---- ----
@@ -1058,7 +1080,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq());
+ WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -1083,7 +1105,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
*/
list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
BUG_ON(tbase_get_base(timer->base) != base);
- internal_add_timer(base, timer);
+ /* No accounting, while moving them */
+ __internal_add_timer(base, timer);
}
return index;
@@ -1092,7 +1115,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
unsigned long data)
{
- int preempt_count = preempt_count();
+ int count = preempt_count();
#ifdef CONFIG_LOCKDEP
/*
@@ -1102,7 +1125,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
* warnings as well as problems when looking into
* timer->lockdep_map, make a copy and use that here.
*/
- struct lockdep_map lockdep_map = timer->lockdep_map;
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
/*
* Couple the lock chain with the lock chain at
@@ -1117,16 +1142,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
lock_map_release(&lockdep_map);
- if (preempt_count != preempt_count()) {
+ if (count != preempt_count()) {
WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
- fn, preempt_count, preempt_count());
+ fn, count, preempt_count());
/*
* Restore the preempt count. That gives us a decent
* chance to survive and extract information. If the
* callback kept a lock held, bad luck, but not worse
* than the BUG() we had.
*/
- preempt_count() = preempt_count;
+ preempt_count_set(count);
}
}
@@ -1144,6 +1169,10 @@ static inline void __run_timers(struct tvec_base *base)
struct timer_list *timer;
spin_lock_irq(&base->lock);
+ if (catchup_timer_jiffies(base)) {
+ spin_unlock_irq(&base->lock);
+ return;
+ }
while (time_after_eq(jiffies, base->timer_jiffies)) {
struct list_head work_list;
struct list_head *head = &work_list;
@@ -1158,30 +1187,38 @@ static inline void __run_timers(struct tvec_base *base)
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
- list_replace_init(base->tv1.vec + index, &work_list);
+ list_replace_init(base->tv1.vec + index, head);
while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
+ bool irqsafe;
timer = list_first_entry(head, struct timer_list,entry);
fn = timer->function;
data = timer->data;
+ irqsafe = tbase_get_irqsafe(timer->base);
timer_stats_account_timer(timer);
base->running_timer = timer;
- detach_timer(timer, 1);
-
- spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
- spin_lock_irq(&base->lock);
+ detach_expired_timer(timer, base);
+
+ if (irqsafe) {
+ spin_unlock(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock(&base->lock);
+ } else {
+ spin_unlock_irq(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock_irq(&base->lock);
+ }
}
}
base->running_timer = NULL;
spin_unlock_irq(&base->lock);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a CPU is idle.
@@ -1308,18 +1345,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
unsigned long get_next_timer_interrupt(unsigned long now)
{
struct tvec_base *base = __this_cpu_read(tvec_bases);
- unsigned long expires;
+ unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
/*
* Pretend that there is no timer pending if the cpu is offline.
* Possible pending timers will be migrated later to an active cpu.
*/
if (cpu_is_offline(smp_processor_id()))
- return now + NEXT_TIMER_MAX_DELTA;
+ return expires;
+
spin_lock(&base->lock);
- if (time_before_eq(base->next_timer, base->timer_jiffies))
- base->next_timer = __next_timer_interrupt(base);
- expires = base->next_timer;
+ if (base->active_timers) {
+ if (time_before_eq(base->next_timer, base->timer_jiffies))
+ base->next_timer = __next_timer_interrupt(base);
+ expires = base->next_timer;
+ }
spin_unlock(&base->lock);
if (time_before_eq(expires, now))
@@ -1342,7 +1382,6 @@ void update_process_times(int user_tick)
account_process_tick(p, user_tick);
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
- printk_tick();
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_run();
@@ -1386,70 +1425,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
#endif
-#ifndef __alpha__
-
-/*
- * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
- * should be moved into arch/i386 instead?
- */
-
-/**
- * sys_getpid - return the thread group id of the current process
- *
- * Note, despite the name, this returns the tgid not the pid. The tgid and
- * the pid are identical unless CLONE_THREAD was specified on clone() in
- * which case the tgid is the same in all threads of the same group.
- *
- * This is SMP safe as current->tgid does not change.
- */
-SYSCALL_DEFINE0(getpid)
-{
- return task_tgid_vnr(current);
-}
-
-/*
- * Accessing ->real_parent is not SMP-safe, it could
- * change from under us. However, we can use a stale
- * value of ->real_parent under rcu_read_lock(), see
- * release_task()->call_rcu(delayed_put_task_struct).
- */
-SYSCALL_DEFINE0(getppid)
-{
- int pid;
-
- rcu_read_lock();
- pid = task_tgid_vnr(rcu_dereference(current->real_parent));
- rcu_read_unlock();
-
- return pid;
-}
-
-SYSCALL_DEFINE0(getuid)
-{
- /* Only we change this so SMP safe */
- return current_uid();
-}
-
-SYSCALL_DEFINE0(geteuid)
-{
- /* Only we change this so SMP safe */
- return current_euid();
-}
-
-SYSCALL_DEFINE0(getgid)
-{
- /* Only we change this so SMP safe */
- return current_gid();
-}
-
-SYSCALL_DEFINE0(getegid)
-{
- /* Only we change this so SMP safe */
- return current_egid();
-}
-
-#endif
-
static void process_timeout(unsigned long __data)
{
wake_up_process((struct task_struct *)__data);
@@ -1557,96 +1532,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-/* Thread ID - the internal kernel "pid" */
-SYSCALL_DEFINE0(gettid)
-{
- return task_pid_vnr(current);
-}
-
-/**
- * do_sysinfo - fill in sysinfo struct
- * @info: pointer to buffer to fill
- */
-int do_sysinfo(struct sysinfo *info)
-{
- unsigned long mem_total, sav_total;
- unsigned int mem_unit, bitcount;
- struct timespec tp;
-
- memset(info, 0, sizeof(struct sysinfo));
-
- ktime_get_ts(&tp);
- monotonic_to_bootbased(&tp);
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
-
- get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
-
- info->procs = nr_threads;
-
- si_meminfo(info);
- si_swapinfo(info);
-
- /*
- * If the sum of all the available memory (i.e. ram + swap)
- * is less than can be stored in a 32 bit unsigned long then
- * we can be binary compatible with 2.2.x kernels. If not,
- * well, in that case 2.2.x was broken anyways...
- *
- * -Erik Andersen <andersee@debian.org>
- */
-
- mem_total = info->totalram + info->totalswap;
- if (mem_total < info->totalram || mem_total < info->totalswap)
- goto out;
- bitcount = 0;
- mem_unit = info->mem_unit;
- while (mem_unit > 1) {
- bitcount++;
- mem_unit >>= 1;
- sav_total = mem_total;
- mem_total <<= 1;
- if (mem_total < sav_total)
- goto out;
- }
-
- /*
- * If mem_total did not overflow, multiply all memory values by
- * info->mem_unit and set it to 1. This leaves things compatible
- * with 2.2.x, and also retains compatibility with earlier 2.4.x
- * kernels...
- */
-
- info->mem_unit = 1;
- info->totalram <<= bitcount;
- info->freeram <<= bitcount;
- info->sharedram <<= bitcount;
- info->bufferram <<= bitcount;
- info->totalswap <<= bitcount;
- info->freeswap <<= bitcount;
- info->totalhigh <<= bitcount;
- info->freehigh <<= bitcount;
-
-out:
- return 0;
-}
-
-SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
-{
- struct sysinfo val;
-
- do_sysinfo(&val);
-
- if (copy_to_user(info, &val, sizeof(struct sysinfo)))
- return -EFAULT;
-
- return 0;
-}
-
-static int __cpuinit init_timers_cpu(int cpu)
+static int init_timers_cpu(int cpu)
{
int j;
struct tvec_base *base;
- static char __cpuinitdata tvec_base_done[NR_CPUS];
+ static char tvec_base_done[NR_CPUS];
if (!tvec_base_done[cpu]) {
static char boot_done;
@@ -1655,15 +1545,13 @@ static int __cpuinit init_timers_cpu(int cpu)
/*
* The APs use this path later in boot
*/
- base = kmalloc_node(sizeof(*base),
- GFP_KERNEL | __GFP_ZERO,
- cpu_to_node(cpu));
+ base = kzalloc_node(sizeof(*base), GFP_KERNEL,
+ cpu_to_node(cpu));
if (!base)
return -ENOMEM;
- /* Make sure that tvec_base is 2 byte aligned */
- if (tbase_get_deferrable(base)) {
- WARN_ON(1);
+ /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
+ if (WARN_ON(base != tbase_get_base(base))) {
kfree(base);
return -ENOMEM;
}
@@ -1678,12 +1566,12 @@ static int __cpuinit init_timers_cpu(int cpu)
boot_done = 1;
base = &boot_tvec_bases;
}
+ spin_lock_init(&base->lock);
tvec_base_done[cpu] = 1;
} else {
base = per_cpu(tvec_bases, cpu);
}
- spin_lock_init(&base->lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1696,6 +1584,8 @@ static int __cpuinit init_timers_cpu(int cpu)
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
+ base->active_timers = 0;
+ base->all_timers = 0;
return 0;
}
@@ -1706,16 +1596,14 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
while (!list_empty(head)) {
timer = list_first_entry(head, struct timer_list, entry);
- detach_timer(timer, 0);
+ /* We ignore the accounting on the dying cpu */
+ detach_timer(timer, false);
timer_set_base(timer, new_base);
- if (time_before(timer->expires, new_base->next_timer) &&
- !tbase_get_deferrable(timer->base))
- new_base->next_timer = timer->expires;
internal_add_timer(new_base, timer);
}
}
-static void __cpuinit migrate_timers(int cpu)
+static void migrate_timers(int cpu)
{
struct tvec_base *old_base;
struct tvec_base *new_base;
@@ -1748,7 +1636,7 @@ static void __cpuinit migrate_timers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int __cpuinit timer_cpu_notify(struct notifier_block *self,
+static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1773,19 +1661,23 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata timers_nb = {
+static struct notifier_block timers_nb = {
.notifier_call = timer_cpu_notify,
};
void __init init_timers(void)
{
- int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
+ int err;
- init_timer_stats();
+ /* ensure there are enough low bits for flags in timer->base pointer */
+ BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
+ err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
BUG_ON(err != NOTIFY_OK);
+
+ init_timer_stats();
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 00000000000..40bb511cca4
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,733 @@
+/*
+ * Common functions for in-kernel torture tests.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+static char *torture_type;
+static bool verbose;
+
+/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
+#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
+#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
+#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
+static int fullstop = FULLSTOP_RMMOD;
+static DEFINE_MUTEX(fullstop_mutex);
+static int *torture_runnable;
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Variables for online-offline handling. Only present if CPU hotplug
+ * is enabled, otherwise does nothing.
+ */
+
+static struct task_struct *onoff_task;
+static long onoff_holdoff;
+static long onoff_interval;
+static long n_offline_attempts;
+static long n_offline_successes;
+static unsigned long sum_offline;
+static int min_offline = -1;
+static int max_offline;
+static long n_online_attempts;
+static long n_online_successes;
+static unsigned long sum_online;
+static int min_online = -1;
+static int max_online;
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int
+torture_onoff(void *arg)
+{
+ int cpu;
+ unsigned long delta;
+ int maxcpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+ int ret;
+ unsigned long starttime;
+
+ VERBOSE_TOROUT_STRING("torture_onoff task started");
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (onoff_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
+ schedule_timeout_interruptible(onoff_holdoff);
+ VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
+ }
+ while (!torture_must_stop()) {
+ cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_offline_attempts++;
+ ret = cpu_down(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offline %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlined %d\n",
+ torture_type, cpu);
+ n_offline_successes++;
+ delta = jiffies - starttime;
+ sum_offline += delta;
+ if (min_offline < 0) {
+ min_offline = delta;
+ max_offline = delta;
+ }
+ if (min_offline > delta)
+ min_offline = delta;
+ if (max_offline < delta)
+ max_offline = delta;
+ }
+ } else if (cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_online_attempts++;
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlined %d\n",
+ torture_type, cpu);
+ n_online_successes++;
+ delta = jiffies - starttime;
+ sum_online += delta;
+ if (min_online < 0) {
+ min_online = delta;
+ max_online = delta;
+ }
+ if (min_online > delta)
+ min_online = delta;
+ if (max_online < delta)
+ max_online = delta;
+ }
+ }
+ schedule_timeout_interruptible(onoff_interval);
+ }
+ torture_kthread_stopping("torture_onoff");
+ return 0;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Initiate online-offline handling.
+ */
+int torture_onoff_init(long ooholdoff, long oointerval)
+{
+ int ret = 0;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ onoff_holdoff = ooholdoff;
+ onoff_interval = oointerval;
+ if (onoff_interval <= 0)
+ return 0;
+ ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_init);
+
+/*
+ * Clean up after online/offline testing.
+ */
+static void torture_onoff_cleanup(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task == NULL)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
+ kthread_stop(onoff_task);
+ onoff_task = NULL;
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
+
+/*
+ * Print online/offline testing statistics.
+ */
+char *torture_onoff_stats(char *page)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ page += sprintf(page,
+ "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return page;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_stats);
+
+/*
+ * Were all the online/offline operations successful?
+ */
+bool torture_onoff_failures(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ return n_online_successes != n_online_attempts ||
+ n_offline_successes != n_offline_attempts;
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+ return false;
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_failures);
+
+#define TORTURE_RANDOM_MULT 39916801 /* prime */
+#define TORTURE_RANDOM_ADD 479001701 /* prime */
+#define TORTURE_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from cpu_clock().
+ */
+unsigned long
+torture_random(struct torture_random_state *trsp)
+{
+ if (--trsp->trs_count < 0) {
+ trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_count = TORTURE_RANDOM_REFRESH;
+ }
+ trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
+ TORTURE_RANDOM_ADD;
+ return swahw32(trsp->trs_state);
+}
+EXPORT_SYMBOL_GPL(torture_random);
+
+/*
+ * Variables for shuffling. The idea is to ensure that each CPU stays
+ * idle for an extended period to test interactions with dyntick idle,
+ * as well as interactions with any per-CPU varibles.
+ */
+struct shuffle_task {
+ struct list_head st_l;
+ struct task_struct *st_t;
+};
+
+static long shuffle_interval; /* In jiffies. */
+static struct task_struct *shuffler_task;
+static cpumask_var_t shuffle_tmp_mask;
+static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
+static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
+static DEFINE_MUTEX(shuffle_task_mutex);
+
+/*
+ * Register a task to be shuffled. If there is no memory, just splat
+ * and don't bother registering.
+ */
+void torture_shuffle_task_register(struct task_struct *tp)
+{
+ struct shuffle_task *stp;
+
+ if (WARN_ON_ONCE(tp == NULL))
+ return;
+ stp = kmalloc(sizeof(*stp), GFP_KERNEL);
+ if (WARN_ON_ONCE(stp == NULL))
+ return;
+ stp->st_t = tp;
+ mutex_lock(&shuffle_task_mutex);
+ list_add(&stp->st_l, &shuffle_task_list);
+ mutex_unlock(&shuffle_task_mutex);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
+
+/*
+ * Unregister all tasks, for example, at the end of the torture run.
+ */
+static void torture_shuffle_task_unregister_all(void)
+{
+ struct shuffle_task *stp;
+ struct shuffle_task *p;
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
+ list_del(&stp->st_l);
+ kfree(stp);
+ }
+ mutex_unlock(&shuffle_task_mutex);
+}
+
+/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
+ * A special case is when shuffle_idle_cpu = -1, in which case we allow
+ * the tasks to run on all CPUs.
+ */
+static void torture_shuffle_tasks(void)
+{
+ struct shuffle_task *stp;
+
+ cpumask_setall(shuffle_tmp_mask);
+ get_online_cpus();
+
+ /* No point in shuffling if there is only one online CPU (ex: UP) */
+ if (num_online_cpus() == 1) {
+ put_online_cpus();
+ return;
+ }
+
+ /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
+ shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
+ if (shuffle_idle_cpu >= nr_cpu_ids)
+ shuffle_idle_cpu = -1;
+ else
+ cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry(stp, &shuffle_task_list, st_l)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ mutex_unlock(&shuffle_task_mutex);
+
+ put_online_cpus();
+}
+
+/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
+ * system to become idle at a time and cut off its timer ticks. This is meant
+ * to test the support for such tickless idle CPU in RCU.
+ */
+static int torture_shuffle(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_shuffle task started");
+ do {
+ schedule_timeout_interruptible(shuffle_interval);
+ torture_shuffle_tasks();
+ torture_shutdown_absorb("torture_shuffle");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_shuffle");
+ return 0;
+}
+
+/*
+ * Start the shuffler, with shuffint in jiffies.
+ */
+int torture_shuffle_init(long shuffint)
+{
+ shuffle_interval = shuffint;
+
+ shuffle_idle_cpu = -1;
+
+ if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+ VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ return -ENOMEM;
+ }
+
+ /* Create the shuffler thread */
+ return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_init);
+
+/*
+ * Stop the shuffling.
+ */
+static void torture_shuffle_cleanup(void)
+{
+ torture_shuffle_task_unregister_all();
+ if (shuffler_task) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
+ kthread_stop(shuffler_task);
+ free_cpumask_var(shuffle_tmp_mask);
+ }
+ shuffler_task = NULL;
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
+
+/*
+ * Variables for auto-shutdown. This allows "lights out" torture runs
+ * to be fully scripted.
+ */
+static int shutdown_secs; /* desired test duration in seconds. */
+static struct task_struct *shutdown_task;
+static unsigned long shutdown_time; /* jiffies to system shutdown. */
+static void (*torture_shutdown_hook)(void);
+
+/*
+ * Absorb kthreads into a kernel function that won't return, so that
+ * they won't ever access module text or data again.
+ */
+void torture_shutdown_absorb(const char *title)
+{
+ while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_notice("torture thread %s parking due to system shutdown\n",
+ title);
+ schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
+
+/*
+ * Cause the torture test to shutdown the system after the test has
+ * run for the time specified by the shutdown_secs parameter.
+ */
+static int torture_shutdown(void *arg)
+{
+ long delta;
+ unsigned long jiffies_snap;
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task started");
+ jiffies_snap = jiffies;
+ while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ !torture_must_stop()) {
+ delta = shutdown_time - jiffies_snap;
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_shutdown task: %lu jiffies remaining\n",
+ torture_type, delta);
+ schedule_timeout_interruptible(delta);
+ jiffies_snap = jiffies;
+ }
+ if (torture_must_stop()) {
+ torture_kthread_stopping("torture_shutdown");
+ return 0;
+ }
+
+ /* OK, shut down the system. */
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
+ shutdown_task = NULL; /* Avoid self-kill deadlock. */
+ if (torture_shutdown_hook)
+ torture_shutdown_hook();
+ else
+ VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+ kernel_power_off(); /* Shut down the system. */
+ return 0;
+}
+
+/*
+ * Start up the shutdown task.
+ */
+int torture_shutdown_init(int ssecs, void (*cleanup)(void))
+{
+ int ret = 0;
+
+ shutdown_secs = ssecs;
+ torture_shutdown_hook = cleanup;
+ if (shutdown_secs > 0) {
+ shutdown_time = jiffies + shutdown_secs * HZ;
+ ret = torture_create_kthread(torture_shutdown, NULL,
+ shutdown_task);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_init);
+
+/*
+ * Detect and respond to a system shutdown.
+ */
+static int torture_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
+ ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ } else {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ }
+ mutex_unlock(&fullstop_mutex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block torture_shutdown_nb = {
+ .notifier_call = torture_shutdown_notify,
+};
+
+/*
+ * Shut down the shutdown task. Say what??? Heh! This can happen if
+ * the torture module gets an rmmod before the shutdown time arrives. ;-)
+ */
+static void torture_shutdown_cleanup(void)
+{
+ unregister_reboot_notifier(&torture_shutdown_nb);
+ if (shutdown_task != NULL) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
+ kthread_stop(shutdown_task);
+ }
+ shutdown_task = NULL;
+}
+
+/*
+ * Variables for stuttering, which means to periodically pause and
+ * restart testing in order to catch bugs that appear when load is
+ * suddenly applied to or removed from the system.
+ */
+static struct task_struct *stutter_task;
+static int stutter_pause_test;
+static int stutter;
+
+/*
+ * Block until the stutter interval ends. This must be called periodically
+ * by all running kthreads that need to be subject to stuttering.
+ */
+void stutter_wait(const char *title)
+{
+ while (ACCESS_ONCE(stutter_pause_test) ||
+ (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ if (stutter_pause_test)
+ if (ACCESS_ONCE(stutter_pause_test) == 1)
+ schedule_timeout_interruptible(1);
+ else
+ while (ACCESS_ONCE(stutter_pause_test))
+ cond_resched();
+ else
+ schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ torture_shutdown_absorb(title);
+ }
+}
+EXPORT_SYMBOL_GPL(stutter_wait);
+
+/*
+ * Cause the torture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int torture_stutter(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_stutter task started");
+ do {
+ if (!torture_must_stop()) {
+ if (stutter > 1) {
+ schedule_timeout_interruptible(stutter - 1);
+ ACCESS_ONCE(stutter_pause_test) = 2;
+ }
+ schedule_timeout_interruptible(1);
+ ACCESS_ONCE(stutter_pause_test) = 1;
+ }
+ if (!torture_must_stop())
+ schedule_timeout_interruptible(stutter);
+ ACCESS_ONCE(stutter_pause_test) = 0;
+ torture_shutdown_absorb("torture_stutter");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_stutter");
+ return 0;
+}
+
+/*
+ * Initialize and kick off the torture_stutter kthread.
+ */
+int torture_stutter_init(int s)
+{
+ int ret;
+
+ stutter = s;
+ ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_stutter_init);
+
+/*
+ * Cleanup after the torture_stutter kthread.
+ */
+static void torture_stutter_cleanup(void)
+{
+ if (!stutter_task)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
+ kthread_stop(stutter_task);
+ stutter_task = NULL;
+}
+
+/*
+ * Initialize torture module. Please note that this is -not- invoked via
+ * the usual module_init() mechanism, but rather by an explicit call from
+ * the client torture module. This call must be paired with a later
+ * torture_init_end().
+ *
+ * The runnable parameter points to a flag that controls whether or not
+ * the test is currently runnable. If there is no such flag, pass in NULL.
+ */
+bool torture_init_begin(char *ttype, bool v, int *runnable)
+{
+ mutex_lock(&fullstop_mutex);
+ if (torture_type != NULL) {
+ pr_alert("torture_init_begin: refusing %s init: %s running",
+ ttype, torture_type);
+ mutex_unlock(&fullstop_mutex);
+ return false;
+ }
+ torture_type = ttype;
+ verbose = v;
+ torture_runnable = runnable;
+ fullstop = FULLSTOP_DONTSTOP;
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_init_begin);
+
+/*
+ * Tell the torture module that initialization is complete.
+ */
+void torture_init_end(void)
+{
+ mutex_unlock(&fullstop_mutex);
+ register_reboot_notifier(&torture_shutdown_nb);
+}
+EXPORT_SYMBOL_GPL(torture_init_end);
+
+/*
+ * Clean up torture module. Please note that this is -not- invoked via
+ * the usual module_exit() mechanism, but rather by an explicit call from
+ * the client torture module. Returns true if a race with system shutdown
+ * is detected, otherwise, all kthreads started by functions in this file
+ * will be shut down.
+ *
+ * This must be called before the caller starts shutting down its own
+ * kthreads.
+ */
+bool torture_cleanup(void)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ mutex_unlock(&fullstop_mutex);
+ schedule_timeout_uninterruptible(10);
+ return true;
+ }
+ ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ mutex_unlock(&fullstop_mutex);
+ torture_shutdown_cleanup();
+ torture_shuffle_cleanup();
+ torture_stutter_cleanup();
+ torture_onoff_cleanup();
+ mutex_lock(&fullstop_mutex);
+ torture_type = NULL;
+ mutex_unlock(&fullstop_mutex);
+ return false;
+}
+EXPORT_SYMBOL_GPL(torture_cleanup);
+
+/*
+ * Is it time for the current torture test to stop?
+ */
+bool torture_must_stop(void)
+{
+ return torture_must_stop_irq() || kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(torture_must_stop);
+
+/*
+ * Is it time for the current torture test to stop? This is the irq-safe
+ * version, hence no check for kthread_should_stop().
+ */
+bool torture_must_stop_irq(void)
+{
+ return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+}
+EXPORT_SYMBOL_GPL(torture_must_stop_irq);
+
+/*
+ * Each kthread must wait for kthread_should_stop() before returning from
+ * its top-level function, otherwise segfaults ensue. This function
+ * prints a "stopping" message and waits for kthread_should_stop(), and
+ * should be called from all torture kthreads immediately prior to
+ * returning.
+ */
+void torture_kthread_stopping(char *title)
+{
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "Stopping %s", title);
+ VERBOSE_TOROUT_STRING(buf);
+ while (!kthread_should_stop()) {
+ torture_shutdown_absorb(title);
+ schedule_timeout_uninterruptible(1);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_kthread_stopping);
+
+/*
+ * Create a generic torture kthread that is immediately runnable. If you
+ * need the kthread to be stopped so that you can do something to it before
+ * it starts, you will need to open-code your own.
+ */
+int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
+ char *f, struct task_struct **tp)
+{
+ int ret = 0;
+
+ VERBOSE_TOROUT_STRING(m);
+ *tp = kthread_run(fn, arg, s);
+ if (IS_ERR(*tp)) {
+ ret = PTR_ERR(*tp);
+ VERBOSE_TOROUT_ERRSTRING(f);
+ *tp = NULL;
+ }
+ torture_shuffle_task_register(*tp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_torture_create_kthread);
+
+/*
+ * Stop a generic kthread, emitting a message.
+ */
+void _torture_stop_kthread(char *m, struct task_struct **tp)
+{
+ if (*tp == NULL)
+ return;
+ VERBOSE_TOROUT_STRING(m);
+ kthread_stop(*tp);
+ *tp = NULL;
+}
+EXPORT_SYMBOL_GPL(_torture_stop_kthread);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cd3134510f3..d4409356f40 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
help
See Documentation/trace/ftrace-design.txt
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -49,6 +52,11 @@ config HAVE_SYSCALL_TRACEPOINTS
help
See Documentation/trace/ftrace-design.txt
+config HAVE_FENTRY
+ bool
+ help
+ Arch supports the gcc options -pg with -mfentry
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -57,8 +65,13 @@ config HAVE_C_RECORDMCOUNT
config TRACER_MAX_TRACE
bool
+config TRACE_CLOCK
+ bool
+
config RING_BUFFER
bool
+ select TRACE_CLOCK
+ select IRQ_WORK
config FTRACE_NMI_ENTER
bool
@@ -69,21 +82,6 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
-config EVENT_POWER_TRACING_DEPRECATED
- depends on EVENT_TRACING
- bool "Deprecated power event trace API, to be removed"
- default y
- help
- Provides old power event types:
- C-state/idle accounting events:
- power:power_start
- power:power_end
- and old cpufreq accounting event:
- power:power_frequency
- This is for userspace compatibility
- and will vanish after 5 kernel iterations,
- namely 3.1.
-
config CONTEXT_SWITCH_TRACER
bool
@@ -109,6 +107,7 @@ config TRACING
select NOP_TRACER
select BINARY_PRINTF
select EVENT_TRACING
+ select TRACE_CLOCK
config GENERIC_TRACER
bool
@@ -141,7 +140,6 @@ if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
@@ -178,6 +176,8 @@ config IRQSOFF_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
+ select TRACER_SNAPSHOT
+ select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in irqs-off critical
sections, with microsecond accuracy.
@@ -200,6 +200,8 @@ config PREEMPT_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
+ select TRACER_SNAPSHOT
+ select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in preemption-off critical
sections, with microsecond accuracy.
@@ -219,6 +221,7 @@ config SCHED_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
+ select TRACER_SNAPSHOT
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
@@ -240,6 +243,37 @@ config FTRACE_SYSCALLS
help
Basic tracer to catch the syscall entry and exit events.
+config TRACER_SNAPSHOT
+ bool "Create a snapshot trace buffer"
+ select TRACER_MAX_TRACE
+ help
+ Allow tracing users to take snapshot of the current buffer using the
+ ftrace interface, e.g.:
+
+ echo 1 > /sys/kernel/debug/tracing/snapshot
+ cat snapshot
+
+config TRACER_SNAPSHOT_PER_CPU_SWAP
+ bool "Allow snapshot to swap per CPU"
+ depends on TRACER_SNAPSHOT
+ select RING_BUFFER_ALLOW_SWAP
+ help
+ Allow doing a snapshot of a single CPU buffer instead of a
+ full swap (all buffers). If this is set, then the following is
+ allowed:
+
+ echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+
+ After which, only the tracing buffer for CPU 2 was swapped with
+ the main tracing buffer, and the other CPU buffers remain the same.
+
+ When this is enabled, this adds a little more overhead to the
+ trace recording, as it needs to add some checks to synchronize
+ recording with swaps. But this does not affect the performance
+ of the overall system. This is enabled by default when the preempt
+ or irq latency tracers are enabled, as those need to swap as well
+ and already adds the overhead (plus a lot more).
+
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -272,7 +306,7 @@ config PROFILE_ANNOTATED_BRANCHES
bool "Trace likely/unlikely profiler"
select TRACE_BRANCH_PROFILING
help
- This tracer profiles all the the likely and unlikely macros
+ This tracer profiles all likely and unlikely macros
in the kernel. It will display the results in:
/sys/kernel/debug/tracing/trace_stat/branch_annotated
@@ -373,6 +407,7 @@ config KPROBE_EVENT
depends on HAVE_REGS_AND_STACK_ACCESS_API
bool "Enable kprobes-based dynamic events"
select TRACING
+ select PROBE_EVENTS
default y
help
This allows the user to add tracing events (similar to tracepoints)
@@ -385,24 +420,53 @@ config KPROBE_EVENT
This option is also required by perf-probe subcommand of perf tools.
If you want to use perf tools, this option is strongly recommended.
+config UPROBE_EVENT
+ bool "Enable uprobes-based dynamic events"
+ depends on ARCH_SUPPORTS_UPROBES
+ depends on MMU
+ depends on PERF_EVENTS
+ select UPROBES
+ select PROBE_EVENTS
+ select TRACING
+ default n
+ help
+ This allows the user to add tracing events on top of userspace
+ dynamic events (similar to tracepoints) on the fly via the trace
+ events interface. Those events can be inserted wherever uprobes
+ can probe, and record various registers.
+ This option is required if you plan to use perf-probe subcommand
+ of perf tools on user space applications.
+
+config PROBE_EVENTS
+ def_bool n
+
config DYNAMIC_FTRACE
- bool "enable/disable ftrace tracepoints dynamically"
+ bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
help
- This option will modify all the calls to ftrace dynamically
- (will patch them out of the binary image and replace them
- with a No-Op instruction) as they are called. A table is
- created to dynamically enable them again.
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
otherwise has native performance as long as no tracing is active.
- The changes to the code are done by a kernel thread that
- wakes up once a second and checks to see if any ftrace calls
- were made. If so, it runs stop_machine (stops all CPUS)
- and modifies the code to jump over the call to ftrace.
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
config FUNCTION_PROFILER
bool "Kernel function profiler"
@@ -471,6 +535,36 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
+config TRACEPOINT_BENCHMARK
+ bool "Add tracepoint that benchmarks tracepoints"
+ help
+ This option creates the tracepoint "benchmark:benchmark_event".
+ When the tracepoint is enabled, it kicks off a kernel thread that
+ goes into an infinite loop (calling cond_sched() to let other tasks
+ run), and calls the tracepoint. Each iteration will record the time
+ it took to write to the tracepoint and the next iteration that
+ data will be passed to the tracepoint itself. That is, the tracepoint
+ will report the time it took to do the previous tracepoint.
+ The string written to the tracepoint is a static string of 128 bytes
+ to keep the time the same. The initial string is simply a write of
+ "START". The second string records the cold cache time of the first
+ write which is not added to the rest of the calculations.
+
+ As it is a tight loop, it benchmarks as hot cache. That's fine because
+ we care most about hot paths that are probably in cache already.
+
+ An example of the output:
+
+ START
+ first=3672 [COLD CACHED]
+ last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
+ last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
+ last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
+ last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
+ last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
+ last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
+
+
config RING_BUFFER_BENCHMARK
tristate "Ring buffer benchmark stress tester"
depends on RING_BUFFER
@@ -487,6 +581,29 @@ config RING_BUFFER_BENCHMARK
If unsure, say N.
+config RING_BUFFER_STARTUP_TEST
+ bool "Ring buffer startup self test"
+ depends on RING_BUFFER
+ help
+ Run a simple self test on the ring buffer on boot up. Late in the
+ kernel boot sequence, the test will start that kicks off
+ a thread per cpu. Each thread will write various size events
+ into the ring buffer. Another thread is created to send IPIs
+ to each of the threads, where the IPI handler will also write
+ to the ring buffer, to test/stress the nesting ability.
+ If any anomalies are discovered, a warning will be displayed
+ and all ring buffers will be disabled.
+
+ The test runs for 10 seconds. This will slow your boot time
+ by at least 10 more seconds.
+
+ At the end of the test, statics and more checks are done.
+ It will output the stats of each per cpu buffer. What
+ was written, the sizes, what was read, what was lost, and
+ other similar details.
+
+ If unsure, say N
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5e..2611613f14f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -5,23 +5,22 @@ ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
CFLAGS_trace_selftest_dynamic.o = -pg
obj-y += trace_selftest_dynamic.o
endif
+endif
# If unlikely tracing is enabled, do not trace these files
ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
-#
-# Make the trace clocks available generally: it's infrastructure
-# relied on by ptrace for example:
-#
-obj-y += trace_clock.o
+obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
@@ -41,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
obj-$(CONFIG_EVENT_TRACING) += blktrace.o
@@ -53,6 +51,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM_RUNTIME),y)
@@ -61,5 +60,9 @@ endif
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
+obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+
+obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cdea7b56b0c..c1bd4ada2a0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/time.h>
#include <linux/uaccess.h>
+#include <linux/list.h>
#include <trace/events/block.h>
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;
static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
+static LIST_HEAD(running_trace_list);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
+
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -72,7 +76,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
bool blk_tracer = blk_tracer_enabled;
if (blk_tracer) {
- buffer = blk_tr->buffer;
+ buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len,
@@ -107,10 +111,18 @@ record_it:
* Send out a notify for this process, if we haven't done so since a trace
* started
*/
-static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+static void trace_note_tsk(struct task_struct *tsk)
{
+ unsigned long flags;
+ struct blk_trace *bt;
+
tsk->btrace_seq = blktrace_seq;
- trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
+ spin_lock_irqsave(&running_trace_lock, flags);
+ list_for_each_entry(bt, &running_trace_list, running_list) {
+ trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
+ sizeof(tsk->comm));
+ }
+ spin_unlock_irqrestore(&running_trace_lock, flags);
}
static void trace_note_time(struct blk_trace *bt)
@@ -147,7 +159,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
return;
local_irq_save(flags);
- buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+ buf = this_cpu_ptr(bt->msg_data);
va_start(args, fmt);
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
@@ -218,7 +230,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (blk_tracer) {
tracing_record_cmdline(current);
- buffer = blk_tr->buffer;
+ buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len,
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
goto record_it;
}
+ if (unlikely(tsk->btrace_seq != blktrace_seq))
+ trace_note_tsk(tsk);
+
/*
* A word about the locking here - we disable interrupts to reserve
* some space in the relay per-cpu buffer, to prevent an irq
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
-
- if (unlikely(tsk->btrace_seq != blktrace_seq))
- trace_note_tsk(bt, tsk);
-
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -311,13 +322,6 @@ int blk_trace_remove(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_trace_remove);
-static int blk_dropped_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
-
- return 0;
-}
-
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
@@ -331,18 +335,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
- .open = blk_dropped_open,
+ .open = simple_open,
.read = blk_dropped_read,
.llseek = default_llseek,
};
-static int blk_msg_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
-
- return 0;
-}
-
static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
size_t count, loff_t *ppos)
{
@@ -371,7 +368,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
static const struct file_operations blk_msg_fops = {
.owner = THIS_MODULE,
- .open = blk_msg_open,
+ .open = simple_open,
.write = blk_msg_write,
.llseek = noop_llseek,
};
@@ -491,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->dir = dir;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
+ INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -581,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.end_lba = cbuts.end_lba,
.pid = cbuts.pid,
};
- memcpy(&buts.name, &cbuts.name, 32);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
if (ret)
return ret;
- if (copy_to_user(arg, &buts.name, 32)) {
+ if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
blk_trace_remove(q);
return -EFAULT;
}
@@ -615,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
blktrace_seq++;
smp_mb();
bt->trace_state = Blktrace_running;
+ spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ spin_unlock_irq(&running_trace_lock);
trace_note_time(bt);
ret = 0;
@@ -622,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
} else {
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
ret = 0;
}
@@ -699,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q)
* blk_add_trace_rq - Add a trace for a request oriented action
* @q: queue the io is for
* @rq: the source request
+ * @nr_bytes: number of completed bytes
* @what: the action
*
* Description:
@@ -706,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q)
*
**/
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
+ unsigned int nr_bytes, u32 what)
{
struct blk_trace *bt = q->blk_trace;
@@ -715,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
+ __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -727,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
static void blk_add_trace_rq_abort(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
}
static void blk_add_trace_rq_complete(void *ignore,
struct request_queue *q,
- struct request *rq)
+ struct request *rq,
+ unsigned int nr_bytes)
{
- blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+ blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
}
/**
@@ -778,8 +783,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
if (!error && !bio_flagged(bio, BIO_UPTODATE))
error = EIO;
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
- error, 0, NULL);
+ __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
+ bio->bi_rw, what, error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -797,6 +802,7 @@ static void blk_add_trace_bio_complete(void *ignore,
static void blk_add_trace_bio_backmerge(void *ignore,
struct request_queue *q,
+ struct request *rq,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -804,6 +810,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
static void blk_add_trace_bio_frontmerge(void *ignore,
struct request_queue *q,
+ struct request *rq,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
@@ -880,8 +887,9 @@ static void blk_add_trace_split(void *ignore,
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
- BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+ __blk_add_trace(bt, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
+ !bio_flagged(bio, BIO_UPTODATE),
sizeof(rpdu), &rpdu);
}
}
@@ -913,9 +921,9 @@ static void blk_add_trace_bio_remap(void *ignore,
r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
r.sector_from = cpu_to_be64(from);
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
- BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
- sizeof(r), &r);
+ __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
+ bio->bi_rw, BLK_TA_REMAP,
+ !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
}
/**
@@ -1421,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
return print_one_line(iter, true);
}
-static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+static int
+blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
@@ -1484,6 +1493,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
+ spin_lock_irq(&running_trace_lock);
+ list_del(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
@@ -1820,6 +1832,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i] = '\0';
}
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
#endif /* CONFIG_EVENT_TRACING */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 683d559a0ee..ac9d1dad630 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
* Based on code in the latency_tracer, that is:
*
* Copyright (C) 2004-2006 Ingo Molnar
- * Copyright (C) 2004 William Lee Irwin III
+ * Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/stop_machine.h>
@@ -62,12 +62,31 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
+#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define INIT_REGEX_LOCK(opsname) \
+ .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
+#else
+#define INIT_REGEX_LOCK(opsname)
+#endif
+
+static struct ftrace_ops ftrace_list_end __read_mostly = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+};
+
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
/* Quick disabling of function tracer. */
-int function_trace_stop;
+int function_trace_stop __read_mostly;
+
+/* Current function tracing op */
+struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
+/* What to set function_trace_op to */
+static struct ftrace_ops *set_function_trace_op;
/* List for set_ftrace_pid's pids. */
LIST_HEAD(ftrace_pids);
@@ -84,53 +103,80 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops ftrace_list_end __read_mostly = {
- .func = ftrace_stub,
-};
-
-static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
-ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
+static struct ftrace_ops control_ops;
-static void
-ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+#else
+/* See comment below, where ftrace_ops_list_func is defined */
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
+#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
+#endif
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
-static void ftrace_global_list_func(unsigned long ip,
- unsigned long parent_ip)
+#define do_for_each_ftrace_op(op, list) \
+ op = rcu_dereference_raw_notrace(list); \
+ do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op) \
+ while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ unlikely((op) != &ftrace_list_end))
+
+static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
- struct ftrace_ops *op;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
+ mutex_init(&ops->regex_lock);
+ ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
+}
- if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
- return;
+/**
+ * ftrace_nr_registered_ops - return number of ops registered
+ *
+ * Returns the number of ftrace_ops registered and tracing functions
+ */
+int ftrace_nr_registered_ops(void)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ mutex_lock(&ftrace_lock);
- trace_recursion_set(TRACE_GLOBAL_BIT);
- op = rcu_dereference_raw(ftrace_global_list); /*see above*/
- while (op != &ftrace_list_end) {
- op->func(ip, parent_ip);
- op = rcu_dereference_raw(op->next); /*see above*/
- };
- trace_recursion_clear(TRACE_GLOBAL_BIT);
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next)
+ cnt++;
+
+ mutex_unlock(&ftrace_lock);
+
+ return cnt;
}
-static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
{
if (!test_tsk_trace_trace(current))
return;
- ftrace_pid_function(ip, parent_ip);
+ ftrace_pid_function(ip, parent_ip, op, regs);
}
static void set_ftrace_pid_function(ftrace_func_t func)
@@ -149,78 +195,127 @@ static void set_ftrace_pid_function(ftrace_func_t func)
void clear_ftrace_function(void)
{
ftrace_trace_function = ftrace_stub;
- __ftrace_trace_function = ftrace_stub;
- __ftrace_trace_function_delay = ftrace_stub;
ftrace_pid_function = ftrace_stub;
}
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-/*
- * For those archs that do not test ftrace_trace_stop in their
- * mcount call site, we need to do it from C.
- */
-static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
+static void control_ops_disable_all(struct ftrace_ops *ops)
{
- if (function_trace_stop)
- return;
+ int cpu;
- __ftrace_trace_function(ip, parent_ip);
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(ops->disabled, cpu) = 1;
}
-#endif
-static void update_global_ops(void)
+static int control_ops_alloc(struct ftrace_ops *ops)
{
- ftrace_func_t func;
+ int __percpu *disabled;
+ disabled = alloc_percpu(int);
+ if (!disabled)
+ return -ENOMEM;
+
+ ops->disabled = disabled;
+ control_ops_disable_all(ops);
+ return 0;
+}
+
+static void ftrace_sync(struct work_struct *work)
+{
/*
- * If there's only one function registered, then call that
- * function directly. Otherwise, we need to iterate over the
- * registered callers.
+ * This function is just a stub to implement a hard force
+ * of synchronize_sched(). This requires synchronizing
+ * tasks even in userspace and idle.
+ *
+ * Yes, function tracing is rude.
*/
- if (ftrace_global_list == &ftrace_list_end ||
- ftrace_global_list->next == &ftrace_list_end)
- func = ftrace_global_list->func;
- else
- func = ftrace_global_list_func;
-
- /* If we filter on pids, update to use the pid function */
- if (!list_empty(&ftrace_pids)) {
- set_ftrace_pid_function(func);
- func = ftrace_pid_func;
- }
+}
- global_ops.func = func;
+static void ftrace_sync_ipi(void *data)
+{
+ /* Probably not needed, but do it anyway */
+ smp_rmb();
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void update_function_graph_func(void);
+#else
+static inline void update_function_graph_func(void) { }
+#endif
+
static void update_ftrace_function(void)
{
ftrace_func_t func;
- update_global_ops();
-
/*
* If we are at the end of the list and this ops is
- * not dynamic, then have the mcount trampoline call
- * the function directly
+ * recursion safe and not dynamic and the arch supports passing ops,
+ * then have the mcount trampoline call the function directly.
*/
if (ftrace_ops_list == &ftrace_list_end ||
(ftrace_ops_list->next == &ftrace_list_end &&
- !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
+ !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
+ (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
+ !FTRACE_FORCE_LIST_FUNC)) {
+ /* Set the ftrace_ops that the arch callback uses */
+ set_function_trace_op = ftrace_ops_list;
func = ftrace_ops_list->func;
- else
+ } else {
+ /* Just use the default ftrace_ops */
+ set_function_trace_op = &ftrace_list_end;
func = ftrace_ops_list_func;
+ }
+
+ update_function_graph_func();
+
+ /* If there's no change, then do nothing more here */
+ if (ftrace_trace_function == func)
+ return;
+
+ /*
+ * If we are using the list function, it doesn't care
+ * about the function_trace_ops.
+ */
+ if (func == ftrace_ops_list_func) {
+ ftrace_trace_function = func;
+ /*
+ * Don't even bother setting function_trace_ops,
+ * it would be racy to do so anyway.
+ */
+ return;
+ }
+
+#ifndef CONFIG_DYNAMIC_FTRACE
+ /*
+ * For static tracing, we need to be a bit more careful.
+ * The function change takes affect immediately. Thus,
+ * we need to coorditate the setting of the function_trace_ops
+ * with the setting of the ftrace_trace_function.
+ *
+ * Set the function to the list ops, which will call the
+ * function we want, albeit indirectly, but it handles the
+ * ftrace_ops and doesn't depend on function_trace_op.
+ */
+ ftrace_trace_function = ftrace_ops_list_func;
+ /*
+ * Make sure all CPUs see this. Yes this is slow, but static
+ * tracing is slow and nasty to have enabled.
+ */
+ schedule_on_each_cpu(ftrace_sync);
+ /* Now all cpus are using the list ops. */
+ function_trace_op = set_function_trace_op;
+ /* Make sure the function_trace_op is visible on all CPUs */
+ smp_wmb();
+ /* Nasty way to force a rmb on all cpus */
+ smp_call_function(ftrace_sync_ipi, NULL, 1);
+ /* OK, we are all set to update the ftrace_trace_function now! */
+#endif /* !CONFIG_DYNAMIC_FTRACE */
-#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
ftrace_trace_function = func;
-#else
-#ifdef CONFIG_DYNAMIC_FTRACE
- /* do not update till all functions have been modified */
- __ftrace_trace_function_delay = func;
-#else
- __ftrace_trace_function = func;
-#endif
- ftrace_trace_function = ftrace_test_stop_func;
-#endif
+}
+
+int using_ftrace_ops_list_func(void)
+{
+ return ftrace_trace_function == ftrace_ops_list_func;
}
static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
@@ -259,26 +354,55 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
return 0;
}
-static int __register_ftrace_function(struct ftrace_ops *ops)
+static void add_ftrace_list_ops(struct ftrace_ops **list,
+ struct ftrace_ops *main_ops,
+ struct ftrace_ops *ops)
{
- if (ftrace_disabled)
- return -ENODEV;
+ int first = *list == &ftrace_list_end;
+ add_ftrace_ops(list, ops);
+ if (first)
+ add_ftrace_ops(&ftrace_ops_list, main_ops);
+}
+
+static int remove_ftrace_list_ops(struct ftrace_ops **list,
+ struct ftrace_ops *main_ops,
+ struct ftrace_ops *ops)
+{
+ int ret = remove_ftrace_ops(list, ops);
+ if (!ret && *list == &ftrace_list_end)
+ ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
+ return ret;
+}
- if (FTRACE_WARN_ON(ops == &global_ops))
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+ if (ops->flags & FTRACE_OPS_FL_DELETED)
return -EINVAL;
if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
return -EBUSY;
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ /*
+ * If the ftrace_ops specifies SAVE_REGS, then it only can be used
+ * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
+ * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
+ */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
+ !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
+ return -EINVAL;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
+ ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
+#endif
+
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- int first = ftrace_global_list == &ftrace_list_end;
- add_ftrace_ops(&ftrace_global_list, ops);
- ops->flags |= FTRACE_OPS_FL_ENABLED;
- if (first)
- add_ftrace_ops(&ftrace_ops_list, &global_ops);
+ if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+ if (control_ops_alloc(ops))
+ return -ENOMEM;
+ add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
} else
add_ftrace_ops(&ftrace_ops_list, ops);
@@ -292,21 +416,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
int ret;
- if (ftrace_disabled)
- return -ENODEV;
-
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (FTRACE_WARN_ON(ops == &global_ops))
- return -EINVAL;
-
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- ret = remove_ftrace_ops(&ftrace_global_list, ops);
- if (!ret && ftrace_global_list == &ftrace_list_end)
- ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
- if (!ret)
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+ ret = remove_ftrace_list_ops(&ftrace_control_list,
+ &control_ops, ops);
} else
ret = remove_ftrace_ops(&ftrace_ops_list, ops);
@@ -316,13 +431,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_enabled)
update_ftrace_function();
- /*
- * Dynamic ops may be freed, we must make sure that all
- * callers are done before leaving this function.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- synchronize_sched();
-
return 0;
}
@@ -366,7 +474,6 @@ struct ftrace_profile_stat {
#define PROFILES_PER_PAGE \
(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
-static int ftrace_profile_bits __read_mostly;
static int ftrace_profile_enabled __read_mostly;
/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -374,7 +481,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
-#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+#define FTRACE_PROFILE_HASH_BITS 10
+#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
static void *
function_stat_next(void *v, int idx)
@@ -485,12 +593,18 @@ static int function_stat_show(struct seq_file *m, void *v)
if (rec->counter <= 1)
stddev = 0;
else {
- stddev = rec->time_squared - rec->counter * avg * avg;
+ /*
+ * Apply Welford's method:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ */
+ stddev = rec->counter * rec->time_squared -
+ rec->time * rec->time;
+
/*
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, (rec->counter - 1) * 1000);
+ do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
@@ -556,7 +670,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
- for (i = 0; i < pages; i++) {
+ for (i = 1; i < pages; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
if (!pg->next)
goto out_free;
@@ -574,7 +688,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
free_page(tmp);
}
- free_page((unsigned long)stat->pages);
stat->pages = NULL;
stat->start = NULL;
@@ -605,13 +718,6 @@ static int ftrace_profile_init_cpu(int cpu)
if (!stat->hash)
return -ENOMEM;
- if (!ftrace_profile_bits) {
- size--;
-
- for (; size; size >>= 1)
- ftrace_profile_bits++;
- }
-
/* Preallocate the function profiling pages */
if (ftrace_profile_pages_init(stat) < 0) {
kfree(stat->hash);
@@ -627,7 +733,7 @@ static int ftrace_profile_init(void)
int cpu;
int ret = 0;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
ret = ftrace_profile_init_cpu(cpu);
if (ret)
break;
@@ -642,16 +748,15 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
{
struct ftrace_profile *rec;
struct hlist_head *hhd;
- struct hlist_node *n;
unsigned long key;
- key = hash_long(ip, ftrace_profile_bits);
+ key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
hhd = &stat->hash[key];
if (hlist_empty(hhd))
return NULL;
- hlist_for_each_entry_rcu(rec, n, hhd, node) {
+ hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
if (rec->ip == ip)
return rec;
}
@@ -664,7 +769,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
{
unsigned long key;
- key = hash_long(rec->ip, ftrace_profile_bits);
+ key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
hlist_add_head_rcu(&rec->node, &stat->hash[key]);
}
@@ -705,7 +810,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
}
static void
-function_profile_call(unsigned long ip, unsigned long parent_ip)
+function_profile_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
@@ -716,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
local_irq_save(flags);
- stat = &__get_cpu_var(ftrace_profile_stats);
+ stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
@@ -735,7 +841,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int profile_graph_entry(struct ftrace_graph_ent *trace)
{
- function_profile_call(trace->func, 0);
+ function_profile_call(trace->func, 0, NULL, NULL);
return 1;
}
@@ -747,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
local_irq_save(flags);
- stat = &__get_cpu_var(ftrace_profile_stats);
+ stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
@@ -795,6 +901,8 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
@@ -946,7 +1054,7 @@ struct ftrace_func_probe {
unsigned long flags;
unsigned long ip;
void *data;
- struct rcu_head rcu;
+ struct list_head free_list;
};
struct ftrace_func_entry {
@@ -977,10 +1085,10 @@ static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.notrace_hash = EMPTY_HASH,
.filter_hash = EMPTY_HASH,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(global_ops)
};
-static DEFINE_MUTEX(ftrace_regex_lock);
-
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -988,8 +1096,6 @@ struct ftrace_page {
int size;
};
-static struct ftrace_page *ftrace_new_pgs;
-
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
@@ -999,7 +1105,7 @@ static struct ftrace_page *ftrace_new_pgs;
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static bool ftrace_hash_empty(struct ftrace_hash *hash)
+static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
{
return !hash || !hash->count;
}
@@ -1010,7 +1116,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
unsigned long key;
struct ftrace_func_entry *entry;
struct hlist_head *hhd;
- struct hlist_node *n;
if (ftrace_hash_empty(hash))
return NULL;
@@ -1022,7 +1127,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
hhd = &hash->buckets[key];
- hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
+ hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
if (entry->ip == ip)
return entry;
}
@@ -1079,7 +1184,7 @@ remove_hash_entry(struct ftrace_hash *hash,
static void ftrace_hash_clear(struct ftrace_hash *hash)
{
struct hlist_head *hhd;
- struct hlist_node *tp, *tn;
+ struct hlist_node *tn;
struct ftrace_func_entry *entry;
int size = 1 << hash->size_bits;
int i;
@@ -1089,7 +1194,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
for (i = 0; i < size; i++) {
hhd = &hash->buckets[i];
- hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist)
free_hash_entry(hash, entry);
}
FTRACE_WARN_ON(hash->count);
@@ -1119,6 +1224,13 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
}
+void ftrace_free_filter(struct ftrace_ops *ops)
+{
+ ftrace_ops_init(ops);
+ free_ftrace_hash(ops->filter_hash);
+ free_ftrace_hash(ops->notrace_hash);
+}
+
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
struct ftrace_hash *hash;
@@ -1129,7 +1241,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return NULL;
size = 1 << size_bits;
- hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
+ hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
if (!hash->buckets) {
kfree(hash);
@@ -1146,7 +1258,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
- struct hlist_node *tp;
int size;
int ret;
int i;
@@ -1161,7 +1272,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
ret = add_hash_entry(new_hash, entry->ip);
if (ret < 0)
goto free_hash;
@@ -1187,11 +1298,10 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
- struct hlist_node *tp, *tn;
+ struct hlist_node *tn;
struct hlist_head *hhd;
struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
- unsigned long key;
int size = src->count;
int bits = 0;
int ret;
@@ -1233,11 +1343,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
- hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
- if (bits > 0)
- key = hash_long(entry->ip, bits);
- else
- key = 0;
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
remove_hash_entry(src, entry);
__add_hash_entry(new_hash, entry);
}
@@ -1272,14 +1378,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
* the hashes are freed with call_rcu_sched().
*/
static int
-ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
struct ftrace_hash *filter_hash;
struct ftrace_hash *notrace_hash;
int ret;
- filter_hash = rcu_dereference_raw(ops->filter_hash);
- notrace_hash = rcu_dereference_raw(ops->notrace_hash);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ /*
+ * There's a small race when adding ops that the ftrace handler
+ * that wants regs, may be called without them. We can not
+ * allow that handler to be called if regs is NULL.
+ */
+ if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS))
+ return 0;
+#endif
+
+ filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
+ notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
if ((ftrace_hash_empty(filter_hash) ||
ftrace_lookup_ip(filter_hash, ip)) &&
@@ -1309,44 +1425,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
static int ftrace_cmp_recs(const void *a, const void *b)
{
- const struct dyn_ftrace *reca = a;
- const struct dyn_ftrace *recb = b;
+ const struct dyn_ftrace *key = a;
+ const struct dyn_ftrace *rec = b;
- if (reca->ip > recb->ip)
- return 1;
- if (reca->ip < recb->ip)
+ if (key->flags < rec->ip)
return -1;
+ if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
+ return 1;
return 0;
}
-/**
- * ftrace_location - return true if the ip giving is a traced location
- * @ip: the instruction pointer to check
- *
- * Returns 1 if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
- */
-int ftrace_location(unsigned long ip)
+static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
- key.ip = ip;
+ key.ip = start;
+ key.flags = end; /* overload flags, as it is unsigned long */
for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ if (end < pg->records[0].ip ||
+ start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+ continue;
rec = bsearch(&key, pg->records, pg->index,
sizeof(struct dyn_ftrace),
ftrace_cmp_recs);
if (rec)
- return 1;
+ return rec->ip;
}
return 0;
}
+/**
+ * ftrace_location - return true if the ip giving is a traced location
+ * @ip: the instruction pointer to check
+ *
+ * Returns rec->ip if @ip given is a pointer to a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+unsigned long ftrace_location(unsigned long ip)
+{
+ return ftrace_location_range(ip, ip);
+}
+
+/**
+ * ftrace_text_reserved - return true if range contains an ftrace location
+ * @start: start of range to search
+ * @end: end of range to search (inclusive). @end points to the last byte to check.
+ *
+ * Returns 1 if @start and @end contains a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+int ftrace_text_reserved(const void *start, const void *end)
+{
+ unsigned long ret;
+
+ ret = ftrace_location_range((unsigned long)start,
+ (unsigned long)end);
+
+ return (int)!!ret;
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1407,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
/*
+ * If filter_hash is set, we want to match all functions
+ * that are in the hash but not in the other hash.
*
+ * If filter_hash is not set, then we are decrementing.
+ * That means we match anything that is in the hash
+ * and also in the other_hash. That is, we need to turn
+ * off functions in the other hash because they are disabled
+ * by this hash.
*/
if (filter_hash && in_hash && !in_other_hash)
match = 1;
@@ -1422,6 +1574,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
rec->flags++;
if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
return;
+ /*
+ * If any ops wants regs saved for this function
+ * then all ops will get saved regs.
+ */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
} else {
if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
return;
@@ -1446,35 +1604,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
-static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
-{
- if (ftrace_pages->index == ftrace_pages->size) {
- /* We should have allocated enough */
- if (WARN_ON(!ftrace_pages->next))
- return NULL;
- ftrace_pages = ftrace_pages->next;
- }
-
- return &ftrace_pages->records[ftrace_pages->index++];
-}
-
-static struct dyn_ftrace *
-ftrace_record_ip(unsigned long ip)
-{
- struct dyn_ftrace *rec;
-
- if (ftrace_disabled)
- return NULL;
-
- rec = ftrace_alloc_dyn_node(ip);
- if (!rec)
- return NULL;
-
- rec->ip = ip;
-
- return rec;
-}
-
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1524,21 +1653,6 @@ void ftrace_bug(int failed, unsigned long ip)
}
}
-
-/* Return 1 if the address range is reserved for ftrace */
-int ftrace_text_reserved(void *start, void *end)
-{
- struct dyn_ftrace *rec;
- struct ftrace_page *pg;
-
- do_for_each_ftrace_rec(pg, rec) {
- if (rec->ip <= (unsigned long)end &&
- rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
- return 1;
- } while_for_each_ftrace_rec();
- return 0;
-}
-
static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
@@ -1557,18 +1671,55 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
if (enable && (rec->flags & ~FTRACE_FL_MASK))
flag = FTRACE_FL_ENABLED;
+ /*
+ * If enabling and the REGS flag does not match the REGS_EN, then
+ * do not ignore this record. Set flags to fail the compare against
+ * ENABLED.
+ */
+ if (flag &&
+ (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
+ flag |= FTRACE_FL_REGS;
+
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
return FTRACE_UPDATE_IGNORE;
if (flag) {
- if (update)
+ /* Save off if rec is being enabled (for return value) */
+ flag ^= rec->flags & FTRACE_FL_ENABLED;
+
+ if (update) {
rec->flags |= FTRACE_FL_ENABLED;
- return FTRACE_UPDATE_MAKE_CALL;
+ if (flag & FTRACE_FL_REGS) {
+ if (rec->flags & FTRACE_FL_REGS)
+ rec->flags |= FTRACE_FL_REGS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_REGS_EN;
+ }
+ }
+
+ /*
+ * If this record is being updated from a nop, then
+ * return UPDATE_MAKE_CALL.
+ * Otherwise,
+ * return UPDATE_MODIFY_CALL to tell the caller to convert
+ * from the save regs, to a non-save regs function or
+ * vice versa.
+ */
+ if (flag & FTRACE_FL_ENABLED)
+ return FTRACE_UPDATE_MAKE_CALL;
+
+ return FTRACE_UPDATE_MODIFY_CALL;
}
- if (update)
- rec->flags &= ~FTRACE_FL_ENABLED;
+ if (update) {
+ /* If there's no more users, clear all flags */
+ if (!(rec->flags & ~FTRACE_FL_MASK))
+ rec->flags = 0;
+ else
+ /* Just disable the record (keep REGS state) */
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ }
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -1600,13 +1751,53 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
return ftrace_check_record(rec, enable, 0);
}
+/**
+ * ftrace_get_addr_new - Get the call address to set to
+ * @rec: The ftrace record descriptor
+ *
+ * If the record has the FTRACE_FL_REGS set, that means that it
+ * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
+ * is not not set, then it wants to convert to the normal callback.
+ *
+ * Returns the address of the trampoline to set to
+ */
+unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_REGS)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
+/**
+ * ftrace_get_addr_curr - Get the call address that is already there
+ * @rec: The ftrace record descriptor
+ *
+ * The FTRACE_FL_REGS_EN is set when the record already points to
+ * a function that saves all the regs. Basically the '_EN' version
+ * represents the current state of the function.
+ *
+ * Returns the address of the trampoline that is currently being called
+ */
+unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_REGS_EN)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
+ unsigned long ftrace_old_addr;
unsigned long ftrace_addr;
int ret;
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = ftrace_get_addr_new(rec);
+
+ /* This needs to be done before we call ftrace_update_record */
+ ftrace_old_addr = ftrace_get_addr_curr(rec);
ret = ftrace_update_record(rec, enable);
@@ -1619,12 +1810,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
case FTRACE_UPDATE_MAKE_NOP:
return ftrace_make_nop(NULL, rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
return -1; /* unknow ftrace bug */
}
-static void ftrace_replace_code(int update)
+void __weak ftrace_replace_code(int enable)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -1634,7 +1828,7 @@ static void ftrace_replace_code(int update)
return;
do_for_each_ftrace_rec(pg, rec) {
- failed = __ftrace_replace_code(rec, update);
+ failed = __ftrace_replace_code(rec, enable);
if (failed) {
ftrace_bug(failed, rec->ip);
/* Stop processing */
@@ -1752,22 +1946,55 @@ int __weak ftrace_arch_code_modify_post_process(void)
return 0;
}
-static int __ftrace_modify_code(void *data)
+void ftrace_modify_all_code(int command)
{
- int *command = data;
+ int update = command & FTRACE_UPDATE_TRACE_FUNC;
+ int err = 0;
- if (*command & FTRACE_UPDATE_CALLS)
+ /*
+ * If the ftrace_caller calls a ftrace_ops func directly,
+ * we need to make sure that it only traces functions it
+ * expects to trace. When doing the switch of functions,
+ * we need to update to the ftrace_ops_list_func first
+ * before the transition between old and new calls are set,
+ * as the ftrace_ops_list_func will check the ops hashes
+ * to make sure the ops are having the right functions
+ * traced.
+ */
+ if (update) {
+ err = ftrace_update_ftrace_func(ftrace_ops_list_func);
+ if (FTRACE_WARN_ON(err))
+ return;
+ }
+
+ if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
- else if (*command & FTRACE_DISABLE_CALLS)
+ else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
- if (*command & FTRACE_UPDATE_TRACE_FUNC)
- ftrace_update_ftrace_func(ftrace_trace_function);
+ if (update && ftrace_trace_function != ftrace_ops_list_func) {
+ function_trace_op = set_function_trace_op;
+ smp_wmb();
+ /* If irqs are disabled, we are in stop machine */
+ if (!irqs_disabled())
+ smp_call_function(ftrace_sync_ipi, NULL, 1);
+ err = ftrace_update_ftrace_func(ftrace_trace_function);
+ if (FTRACE_WARN_ON(err))
+ return;
+ }
- if (*command & FTRACE_START_FUNC_RET)
- ftrace_enable_ftrace_graph_caller();
- else if (*command & FTRACE_STOP_FUNC_RET)
- ftrace_disable_ftrace_graph_caller();
+ if (command & FTRACE_START_FUNC_RET)
+ err = ftrace_enable_ftrace_graph_caller();
+ else if (command & FTRACE_STOP_FUNC_RET)
+ err = ftrace_disable_ftrace_graph_caller();
+ FTRACE_WARN_ON(err);
+}
+
+static int __ftrace_modify_code(void *data)
+{
+ int *command = data;
+
+ ftrace_modify_all_code(*command);
return 0;
}
@@ -1818,16 +2045,6 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- /*
- * For archs that call ftrace_test_stop_func(), we must
- * wait till after we update all the function callers
- * before we update the callback. This keeps different
- * ops that record different functions from corrupting
- * each other.
- */
- __ftrace_trace_function = __ftrace_trace_function_delay;
-#endif
function_trace_stop--;
ret = ftrace_arch_code_modify_post_process();
@@ -1838,6 +2055,11 @@ static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
static int global_start_up;
+static void control_ops_free(struct ftrace_ops *ops)
+{
+ free_percpu(ops->disabled);
+}
+
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -1853,38 +2075,37 @@ static void ftrace_startup_enable(int command)
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
- bool hash_enable = true;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
+ ret = __register_ftrace_function(ops);
+ if (ret)
+ return ret;
+
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
- /* ops marked global share the filter hashes */
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- ops = &global_ops;
- /* Don't update hash if global is already set */
- if (global_start_up)
- hash_enable = false;
- global_start_up++;
- }
-
ops->flags |= FTRACE_OPS_FL_ENABLED;
- if (hash_enable)
- ftrace_hash_rec_enable(ops, 1);
+
+ ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
return 0;
}
-static void ftrace_shutdown(struct ftrace_ops *ops, int command)
+static int ftrace_shutdown(struct ftrace_ops *ops, int command)
{
- bool hash_disable = true;
+ int ret;
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
+
+ ret = __unregister_ftrace_function(ops);
+ if (ret)
+ return ret;
ftrace_start_up--;
/*
@@ -1894,21 +2115,9 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- ops = &global_ops;
- global_start_up--;
- WARN_ON_ONCE(global_start_up < 0);
- /* Don't update hash if global still has users */
- if (global_start_up) {
- WARN_ON_ONCE(!ftrace_start_up);
- hash_disable = false;
- }
- }
-
- if (hash_disable)
- ftrace_hash_rec_disable(ops, 1);
+ ftrace_hash_rec_disable(ops, 1);
- if (ops != &global_ops || !global_start_up)
+ if (!global_start_up)
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
command |= FTRACE_UPDATE_CALLS;
@@ -1918,10 +2127,42 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
command |= FTRACE_UPDATE_TRACE_FUNC;
}
- if (!command || !ftrace_enabled)
- return;
+ if (!command || !ftrace_enabled) {
+ /*
+ * If these are control ops, they still need their
+ * per_cpu field freed. Since, function tracing is
+ * not currently active, we can just free them
+ * without synchronizing all CPUs.
+ */
+ if (ops->flags & FTRACE_OPS_FL_CONTROL)
+ control_ops_free(ops);
+ return 0;
+ }
ftrace_run_update_code(command);
+
+ /*
+ * Dynamic ops may be freed, we must make sure that all
+ * callers are done before leaving this function.
+ * The same goes for freeing the per_cpu data of the control
+ * ops.
+ *
+ * Again, normal synchronize_sched() is not good enough.
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ schedule_on_each_cpu(ftrace_sync);
+
+ if (ops->flags & FTRACE_OPS_FL_CONTROL)
+ control_ops_free(ops);
+ }
+
+ return 0;
}
static void ftrace_startup_sysctl(void)
@@ -1947,23 +2188,69 @@ static void ftrace_shutdown_sysctl(void)
}
static cycle_t ftrace_update_time;
-static unsigned long ftrace_update_cnt;
unsigned long ftrace_update_tot_cnt;
-static int ops_traces_mod(struct ftrace_ops *ops)
+static inline int ops_traces_mod(struct ftrace_ops *ops)
{
- struct ftrace_hash *hash;
+ /*
+ * Filter_hash being empty will default to trace module.
+ * But notrace hash requires a test of individual module functions.
+ */
+ return ftrace_hash_empty(ops->filter_hash) &&
+ ftrace_hash_empty(ops->notrace_hash);
+}
+
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static inline bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ /* If ops traces all mods, we already accounted for it */
+ if (ops_traces_mod(ops))
+ return 0;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->filter_hash) &&
+ !ftrace_lookup_ip(ops->filter_hash, rec->ip))
+ return 0;
- hash = ops->filter_hash;
- return ftrace_hash_empty(hash);
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
+ return 0;
+
+ return 1;
}
-static int ftrace_update_code(struct module *mod)
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
+static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
+ unsigned long update_cnt = 0;
unsigned long ref = 0;
+ bool test = false;
int i;
/*
@@ -1977,24 +2264,30 @@ static int ftrace_update_code(struct module *mod)
for (ops = ftrace_ops_list;
ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED &&
- ops_traces_mod(ops))
- ref++;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ if (ops_traces_mod(ops))
+ ref++;
+ else
+ test = true;
+ }
}
}
start = ftrace_now(raw_smp_processor_id());
- ftrace_update_cnt = 0;
- for (pg = ftrace_new_pgs; pg; pg = pg->next) {
+ for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
+ int cnt = ref;
+
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- p->flags = ref;
+ if (test)
+ cnt += referenced_filters(p);
+ p->flags = cnt;
/*
* Do the initial record conversion from mcount jump
@@ -2003,7 +2296,7 @@ static int ftrace_update_code(struct module *mod)
if (!ftrace_code_disable(mod, p))
break;
- ftrace_update_cnt++;
+ update_cnt++;
/*
* If the tracing is enabled, go ahead and enable the record.
@@ -2014,7 +2307,7 @@ static int ftrace_update_code(struct module *mod)
* conversion puts the module to the correct state, thus
* passing the ftrace_make_call check.
*/
- if (ftrace_start_up && ref) {
+ if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
ftrace_bug(failed, p->ip);
@@ -2022,11 +2315,9 @@ static int ftrace_update_code(struct module *mod)
}
}
- ftrace_new_pgs = NULL;
-
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
- ftrace_update_tot_cnt += ftrace_update_cnt;
+ ftrace_update_tot_cnt += update_cnt;
return 0;
}
@@ -2118,22 +2409,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
return NULL;
}
-static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
-{
- int cnt;
-
- if (!num_to_init) {
- pr_info("ftrace: No functions to be traced?\n");
- return -1;
- }
-
- cnt = num_to_init / ENTRIES_PER_PAGE;
- pr_info("ftrace: allocating %ld entries in %d pages\n",
- num_to_init, cnt + 1);
-
- return 0;
-}
-
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
struct ftrace_iterator {
@@ -2275,7 +2550,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
!ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
- !(rec->flags & ~FTRACE_FL_MASK))) {
+ !(rec->flags & FTRACE_FL_ENABLED))) {
rec = NULL;
goto retry;
@@ -2294,7 +2569,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
{
iter->pos = 0;
iter->func_pos = 0;
- iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
}
static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2377,8 +2652,9 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED)
- seq_printf(m, " (%ld)",
- rec->flags & ~FTRACE_FL_MASK);
+ seq_printf(m, " (%ld)%s",
+ rec->flags & ~FTRACE_FL_MASK,
+ rec->flags & FTRACE_FL_REGS ? " R" : "");
seq_printf(m, "\n");
return 0;
@@ -2395,57 +2671,35 @@ static int
ftrace_avail_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
-
- iter->pg = ftrace_pages_start;
- iter->ops = &global_ops;
-
- ret = seq_open(file, &show_ftrace_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = iter;
- } else {
- kfree(iter);
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (iter) {
+ iter->pg = ftrace_pages_start;
+ iter->ops = &global_ops;
}
- return ret;
+ return iter ? 0 : -ENOMEM;
}
static int
ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
-
- iter->pg = ftrace_pages_start;
- iter->flags = FTRACE_ITER_ENABLED;
- iter->ops = &global_ops;
-
- ret = seq_open(file, &show_ftrace_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = iter;
- } else {
- kfree(iter);
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (iter) {
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ENABLED;
+ iter->ops = &global_ops;
}
- return ret;
+ return iter ? 0 : -ENOMEM;
}
static void ftrace_filter_reset(struct ftrace_hash *hash)
@@ -2468,7 +2722,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
* routine, you can use ftrace_filter_write() for the write
* routine if @flag has FTRACE_ITER_FILTER set, or
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * tracing_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
*/
int
@@ -2479,6 +2733,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct ftrace_hash *hash;
int ret = 0;
+ ftrace_ops_init(ops);
+
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -2491,28 +2747,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
return -ENOMEM;
}
+ iter->ops = ops;
+ iter->flags = flag;
+
+ mutex_lock(&ops->regex_lock);
+
if (flag & FTRACE_ITER_NOTRACE)
hash = ops->notrace_hash;
else
hash = ops->filter_hash;
- iter->ops = ops;
- iter->flags = flag;
-
if (file->f_mode & FMODE_WRITE) {
- mutex_lock(&ftrace_lock);
iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
- mutex_unlock(&ftrace_lock);
-
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_unlock;
}
}
- mutex_lock(&ftrace_regex_lock);
-
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
ftrace_filter_reset(iter->hash);
@@ -2532,7 +2786,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
} else
file->private_data = iter;
- mutex_unlock(&ftrace_regex_lock);
+
+ out_unlock:
+ mutex_unlock(&ops->regex_lock);
return ret;
}
@@ -2540,7 +2796,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
static int
ftrace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&global_ops,
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops,
FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
inode, file);
}
@@ -2548,21 +2806,10 @@ ftrace_filter_open(struct inode *inode, struct file *file)
static int
ftrace_notrace_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
- inode, file);
-}
-
-loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
-{
- loff_t ret;
-
- if (file->f_mode & FMODE_READ)
- ret = seq_lseek(file, offset, origin);
- else
- file->f_pos = ret = 1;
+ struct ftrace_ops *ops = inode->i_private;
- return ret;
+ return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
+ inode, file);
}
static int ftrace_match(char *str, char *regex, int len, int type)
@@ -2746,14 +2993,13 @@ static int __init ftrace_mod_cmd_init(void)
{
return register_ftrace_command(&ftrace_mod_cmd);
}
-device_initcall(ftrace_mod_cmd_init);
+core_initcall(ftrace_mod_cmd_init);
-static void
-function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
+static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct ftrace_func_probe *entry;
struct hlist_head *hhd;
- struct hlist_node *n;
unsigned long key;
key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2769,7 +3015,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
* on the hash. rcu_read_lock is too dangerous here.
*/
preempt_disable_notrace();
- hlist_for_each_entry_rcu(entry, n, hhd, node) {
+ hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
@@ -2779,6 +3025,8 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(trace_probe_ops)
};
static int ftrace_probe_registered;
@@ -2788,8 +3036,12 @@ static void __enable_ftrace_function_probe(void)
int ret;
int i;
- if (ftrace_probe_registered)
+ if (ftrace_probe_registered) {
+ /* still need to update the function call sites */
+ if (ftrace_enabled)
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
return;
+ }
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
@@ -2800,16 +3052,13 @@ static void __enable_ftrace_function_probe(void)
if (i == FTRACE_FUNC_HASHSIZE)
return;
- ret = __register_ftrace_function(&trace_probe_ops);
- if (!ret)
- ret = ftrace_startup(&trace_probe_ops, 0);
+ ret = ftrace_startup(&trace_probe_ops, 0);
ftrace_probe_registered = 1;
}
static void __disable_ftrace_function_probe(void)
{
- int ret;
int i;
if (!ftrace_probe_registered)
@@ -2822,36 +3071,33 @@ static void __disable_ftrace_function_probe(void)
}
/* no more funcs left */
- ret = __unregister_ftrace_function(&trace_probe_ops);
- if (!ret)
- ftrace_shutdown(&trace_probe_ops, 0);
+ ftrace_shutdown(&trace_probe_ops, 0);
ftrace_probe_registered = 0;
}
-static void ftrace_free_entry_rcu(struct rcu_head *rhp)
+static void ftrace_free_entry(struct ftrace_func_probe *entry)
{
- struct ftrace_func_probe *entry =
- container_of(rhp, struct ftrace_func_probe, rcu);
-
if (entry->ops->free)
- entry->ops->free(&entry->data);
+ entry->ops->free(entry->ops, entry->ip, &entry->data);
kfree(entry);
}
-
int
register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
struct ftrace_func_probe *entry;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
int type, len, not;
unsigned long key;
int count = 0;
char *search;
+ int ret;
type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
@@ -2860,10 +3106,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
if (WARN_ON(not))
return -EINVAL;
- mutex_lock(&ftrace_lock);
+ mutex_lock(&trace_probe_ops.regex_lock);
- if (unlikely(ftrace_disabled))
- goto out_unlock;
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (!hash) {
+ count = -ENOMEM;
+ goto out;
+ }
+
+ if (unlikely(ftrace_disabled)) {
+ count = -ENODEV;
+ goto out;
+ }
+
+ mutex_lock(&ftrace_lock);
do_for_each_ftrace_rec(pg, rec) {
@@ -2887,14 +3143,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
* for each function we find. We call the callback
* to give the caller an opportunity to do so.
*/
- if (ops->callback) {
- if (ops->callback(rec->ip, &entry->data) < 0) {
+ if (ops->init) {
+ if (ops->init(ops, rec->ip, &entry->data) < 0) {
/* caller does not like this func */
kfree(entry);
continue;
}
}
+ ret = enter_record(hash, rec, 0);
+ if (ret < 0) {
+ kfree(entry);
+ count = ret;
+ goto out_unlock;
+ }
+
entry->ops = ops;
entry->ip = rec->ip;
@@ -2902,10 +3165,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
} while_for_each_ftrace_rec();
+
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ if (ret < 0)
+ count = ret;
+
__enable_ftrace_function_probe();
out_unlock:
mutex_unlock(&ftrace_lock);
+ out:
+ mutex_unlock(&trace_probe_ops.regex_lock);
+ free_ftrace_hash(hash);
return count;
}
@@ -2919,8 +3190,13 @@ static void
__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data, int flags)
{
+ struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
- struct hlist_node *n, *tmp;
+ struct ftrace_func_probe *p;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct list_head free_list;
+ struct ftrace_hash *hash;
+ struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int type = MATCH_FULL;
int i, len = 0;
@@ -2939,11 +3215,19 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
return;
}
- mutex_lock(&ftrace_lock);
+ mutex_lock(&trace_probe_ops.regex_lock);
+
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (!hash)
+ /* Hmm, should report this somehow */
+ goto out_unlock;
+
+ INIT_LIST_HEAD(&free_list);
+
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
- hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
+ hlist_for_each_entry_safe(entry, tmp, hhd, node) {
/* break up if statements for readability */
if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -2960,12 +3244,32 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
}
- hlist_del(&entry->node);
- call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+ rec_entry = ftrace_lookup_ip(hash, entry->ip);
+ /* It is possible more than one entry had this ip */
+ if (rec_entry)
+ free_hash_entry(hash, rec_entry);
+
+ hlist_del_rcu(&entry->node);
+ list_add(&entry->free_list, &free_list);
}
}
+ mutex_lock(&ftrace_lock);
__disable_ftrace_function_probe();
+ /*
+ * Remove after the disable is called. Otherwise, if the last
+ * probe is removed, a null hash means *all enabled*.
+ */
+ ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ synchronize_sched();
+ list_for_each_entry_safe(entry, p, &free_list, free_list) {
+ list_del(&entry->free_list);
+ ftrace_free_entry(entry);
+ }
mutex_unlock(&ftrace_lock);
+
+ out_unlock:
+ mutex_unlock(&trace_probe_ops.regex_lock);
+ free_ftrace_hash(hash);
}
void
@@ -2990,7 +3294,11 @@ void unregister_ftrace_function_probe_all(char *glob)
static LIST_HEAD(ftrace_commands);
static DEFINE_MUTEX(ftrace_cmd_mutex);
-int register_ftrace_command(struct ftrace_func_command *cmd)
+/*
+ * Currently we only register ftrace commands from __init, so mark this
+ * __init too.
+ */
+__init int register_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p;
int ret = 0;
@@ -3009,7 +3317,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd)
return ret;
}
-int unregister_ftrace_command(struct ftrace_func_command *cmd)
+/*
+ * Currently we only unregister ftrace commands from __init, so mark
+ * this __init too.
+ */
+__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p, *n;
int ret = -ENODEV;
@@ -3074,18 +3386,17 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
if (!cnt)
return 0;
- mutex_lock(&ftrace_regex_lock);
-
- ret = -ENODEV;
- if (unlikely(ftrace_disabled))
- goto out_unlock;
-
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
iter = m->private;
} else
iter = file->private_data;
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ /* iter->hash is a local copy, so we don't need regex_lock */
+
parser = &iter->parser;
read = trace_get_user(parser, ubuf, cnt, ppos);
@@ -3094,14 +3405,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
ret = ftrace_process_regex(iter->hash, parser->buffer,
parser->idx, enable);
trace_parser_clear(parser);
- if (ret)
- goto out_unlock;
+ if (ret < 0)
+ goto out;
}
ret = read;
-out_unlock:
- mutex_unlock(&ftrace_regex_lock);
-
+ out:
return ret;
}
@@ -3120,49 +3429,112 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
- int reset, int enable)
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+{
+ struct ftrace_func_entry *entry;
+
+ if (!ftrace_location(ip))
+ return -EINVAL;
+
+ if (remove) {
+ entry = ftrace_lookup_ip(hash, ip);
+ if (!entry)
+ return -ENOENT;
+ free_hash_entry(hash, entry);
+ return 0;
+ }
+
+ return add_hash_entry(hash, ip);
+}
+
+static void ftrace_ops_update_code(struct ftrace_ops *ops)
+{
+ if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+}
+
+static int
+ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
+ unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
int ret;
- /* All global ops uses the global ops filters */
- if (ops->flags & FTRACE_OPS_FL_GLOBAL)
- ops = &global_ops;
-
if (unlikely(ftrace_disabled))
return -ENODEV;
+ mutex_lock(&ops->regex_lock);
+
if (enable)
orig_hash = &ops->filter_hash;
else
orig_hash = &ops->notrace_hash;
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
- if (!hash)
- return -ENOMEM;
+ if (!hash) {
+ ret = -ENOMEM;
+ goto out_regex_unlock;
+ }
- mutex_lock(&ftrace_regex_lock);
if (reset)
ftrace_filter_reset(hash);
- if (buf)
- ftrace_match_records(hash, buf, len);
+ if (buf && !ftrace_match_records(hash, buf, len)) {
+ ret = -EINVAL;
+ goto out_regex_unlock;
+ }
+ if (ip) {
+ ret = ftrace_match_addr(hash, ip, remove);
+ if (ret < 0)
+ goto out_regex_unlock;
+ }
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
- && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ if (!ret)
+ ftrace_ops_update_code(ops);
mutex_unlock(&ftrace_lock);
- mutex_unlock(&ftrace_regex_lock);
+ out_regex_unlock:
+ mutex_unlock(&ops->regex_lock);
free_ftrace_hash(hash);
return ret;
}
+static int
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
+ int reset, int enable)
+{
+ return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
+}
+
+/**
+ * ftrace_set_filter_ip - set a function to filter on in ftrace by address
+ * @ops - the ops to set the filter with
+ * @ip - the address to add to or remove from the filter.
+ * @remove - non zero to remove the ip from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ip is NULL, it failes to update filter.
+ */
+int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
+ int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ip, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable)
+{
+ return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+}
+
/**
* ftrace_set_filter - set a function to filter on in ftrace
* @ops - the ops to set the filter with
@@ -3173,10 +3545,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
*/
-void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
+int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
{
- ftrace_set_regex(ops, buf, len, reset, 1);
+ ftrace_ops_init(ops);
+ return ftrace_set_regex(ops, buf, len, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter);
@@ -3191,15 +3564,15 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
* for tracing.
*/
-void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
+int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
{
- ftrace_set_regex(ops, buf, len, reset, 0);
+ ftrace_ops_init(ops);
+ return ftrace_set_regex(ops, buf, len, reset, 0);
}
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
/**
- * ftrace_set_filter - set a function to filter on in ftrace
- * @ops - the ops to set the filter with
+ * ftrace_set_global_filter - set a function to filter on with global tracers
* @buf - the string that holds the function filter text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -3214,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
/**
- * ftrace_set_notrace - set a function to not trace in ftrace
- * @ops - the ops to set the notrace filter with
+ * ftrace_set_global_notrace - set a function to not trace with global tracers
* @buf - the string that holds the function notrace text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -3237,23 +3609,28 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+/* Used by function selftest to not test if filter is set */
+bool ftrace_filter_param __initdata;
+
static int __init set_ftrace_notrace(char *str)
{
- strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ ftrace_filter_param = true;
+ strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
- strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ ftrace_filter_param = true;
+ strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
static int __init set_graph_function(char *str)
{
@@ -3271,7 +3648,7 @@ static void __init set_ftrace_early_graph(char *buf)
func = strsep(&buf, ",");
/* we allow only one expression at a time */
ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- func);
+ FTRACE_GRAPH_MAX_FUNCS, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -3284,6 +3661,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
{
char *func;
+ ftrace_ops_init(ops);
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -3311,10 +3690,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
int filter_hash;
int ret;
- mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
iter = m->private;
-
seq_release(inode, file);
} else
iter = file->private_data;
@@ -3327,6 +3704,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
trace_parser_put(parser);
+ mutex_lock(&iter->ops->regex_lock);
+
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
@@ -3338,16 +3717,16 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
- if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
- && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ if (!ret)
+ ftrace_ops_update_code(iter->ops);
mutex_unlock(&ftrace_lock);
}
+
+ mutex_unlock(&iter->ops->regex_lock);
free_ftrace_hash(iter->hash);
kfree(iter);
- mutex_unlock(&ftrace_regex_lock);
return 0;
}
@@ -3369,7 +3748,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = tracing_lseek,
.release = ftrace_regex_release,
};
@@ -3377,7 +3756,7 @@ static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
.read = seq_read,
.write = ftrace_notrace_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = tracing_lseek,
.release = ftrace_regex_release,
};
@@ -3386,15 +3765,25 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
int ftrace_graph_count;
-int ftrace_graph_filter_enabled;
+int ftrace_graph_notrace_count;
unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+
+struct ftrace_graph_data {
+ unsigned long *table;
+ size_t size;
+ int *count;
+ const struct seq_operations *seq_ops;
+};
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
- if (*pos >= ftrace_graph_count)
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (*pos >= *fgd->count)
return NULL;
- return &ftrace_graph_funcs[*pos];
+ return &fgd->table[*pos];
}
static void *
@@ -3406,10 +3795,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos)
static void *g_start(struct seq_file *m, loff_t *pos)
{
+ struct ftrace_graph_data *fgd = m->private;
+
mutex_lock(&graph_lock);
/* Nothing, tell g_show to print all functions are enabled */
- if (!ftrace_graph_filter_enabled && !*pos)
+ if (!*fgd->count && !*pos)
return (void *)1;
return __g_next(m, pos);
@@ -3445,38 +3836,88 @@ static const struct seq_operations ftrace_graph_seq_ops = {
};
static int
-ftrace_graph_open(struct inode *inode, struct file *file)
+__ftrace_graph_open(struct inode *inode, struct file *file,
+ struct ftrace_graph_data *fgd)
{
int ret = 0;
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
mutex_lock(&graph_lock);
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
- ftrace_graph_filter_enabled = 0;
- ftrace_graph_count = 0;
- memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
+ *fgd->count = 0;
+ memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
}
mutex_unlock(&graph_lock);
- if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &ftrace_graph_seq_ops);
+ if (file->f_mode & FMODE_READ) {
+ ret = seq_open(file, fgd->seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = fgd;
+ }
+ } else
+ file->private_data = fgd;
return ret;
}
static int
+ftrace_graph_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_graph_data *fgd;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
+ if (fgd == NULL)
+ return -ENOMEM;
+
+ fgd->table = ftrace_graph_funcs;
+ fgd->size = FTRACE_GRAPH_MAX_FUNCS;
+ fgd->count = &ftrace_graph_count;
+ fgd->seq_ops = &ftrace_graph_seq_ops;
+
+ return __ftrace_graph_open(inode, file, fgd);
+}
+
+static int
+ftrace_graph_notrace_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_graph_data *fgd;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
+ if (fgd == NULL)
+ return -ENOMEM;
+
+ fgd->table = ftrace_graph_notrace_funcs;
+ fgd->size = FTRACE_GRAPH_MAX_FUNCS;
+ fgd->count = &ftrace_graph_notrace_count;
+ fgd->seq_ops = &ftrace_graph_seq_ops;
+
+ return __ftrace_graph_open(inode, file, fgd);
+}
+
+static int
ftrace_graph_release(struct inode *inode, struct file *file)
{
- if (file->f_mode & FMODE_READ)
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+
+ kfree(m->private);
seq_release(inode, file);
+ } else {
+ kfree(file->private_data);
+ }
+
return 0;
}
static int
-ftrace_set_func(unsigned long *array, int *idx, char *buffer)
+ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -3489,7 +3930,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
/* decode regex */
type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
- if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
+ if (!not && *idx >= size)
return -EBUSY;
search_len = strlen(search);
@@ -3517,7 +3958,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
fail = 0;
if (!exists) {
array[(*idx)++] = rec->ip;
- if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+ if (*idx >= size)
goto out;
}
} else {
@@ -3535,7 +3976,6 @@ out:
if (fail)
return -EINVAL;
- ftrace_graph_filter_enabled = 1;
return 0;
}
@@ -3544,36 +3984,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
- ssize_t read, ret;
+ ssize_t read, ret = 0;
+ struct ftrace_graph_data *fgd = file->private_data;
if (!cnt)
return 0;
- mutex_lock(&graph_lock);
-
- if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
- ret = -ENOMEM;
- goto out_unlock;
- }
+ if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
+ return -ENOMEM;
read = trace_get_user(&parser, ubuf, cnt, ppos);
if (read >= 0 && trace_parser_loaded((&parser))) {
parser.buffer[parser.idx] = 0;
+ mutex_lock(&graph_lock);
+
/* we allow only one expression at a time */
- ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- parser.buffer);
- if (ret)
- goto out_free;
+ ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
+ parser.buffer);
+
+ mutex_unlock(&graph_lock);
}
- ret = read;
+ if (!ret)
+ ret = read;
-out_free:
trace_parser_put(&parser);
-out_unlock:
- mutex_unlock(&graph_lock);
return ret;
}
@@ -3582,11 +4019,49 @@ static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
+ .llseek = tracing_lseek,
+ .release = ftrace_graph_release,
+};
+
+static const struct file_operations ftrace_graph_notrace_fops = {
+ .open = ftrace_graph_notrace_open,
+ .read = seq_read,
+ .write = ftrace_graph_write,
+ .llseek = tracing_lseek,
.release = ftrace_graph_release,
- .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+void ftrace_create_filter_files(struct ftrace_ops *ops,
+ struct dentry *parent)
+{
+
+ trace_create_file("set_ftrace_filter", 0644, parent,
+ ops, &ftrace_filter_fops);
+
+ trace_create_file("set_ftrace_notrace", 0644, parent,
+ ops, &ftrace_notrace_fops);
+}
+
+/*
+ * The name "destroy_filter_files" is really a misnomer. Although
+ * in the future, it may actualy delete the files, but this is
+ * really intended to make sure the ops passed in are disabled
+ * and that when this function returns, the caller is free to
+ * free the ops.
+ *
+ * The "destroy" name is only to match the "create" name that this
+ * should be paired with.
+ */
+void ftrace_destroy_filter_files(struct ftrace_ops *ops)
+{
+ mutex_lock(&ftrace_lock);
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ ftrace_shutdown(ops, 0);
+ ops->flags |= FTRACE_OPS_FL_DELETED;
+ mutex_unlock(&ftrace_lock);
+}
+
static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
{
@@ -3596,37 +4071,50 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
trace_create_file("enabled_functions", 0444,
d_tracer, NULL, &ftrace_enabled_fops);
- trace_create_file("set_ftrace_filter", 0644, d_tracer,
- NULL, &ftrace_filter_fops);
-
- trace_create_file("set_ftrace_notrace", 0644, d_tracer,
- NULL, &ftrace_notrace_fops);
+ ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
trace_create_file("set_graph_function", 0444, d_tracer,
NULL,
&ftrace_graph_fops);
+ trace_create_file("set_graph_notrace", 0444, d_tracer,
+ NULL,
+ &ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
return 0;
}
-static void ftrace_swap_recs(void *a, void *b, int size)
+static int ftrace_cmp_ips(const void *a, const void *b)
{
- struct dyn_ftrace *reca = a;
- struct dyn_ftrace *recb = b;
- struct dyn_ftrace t;
+ const unsigned long *ipa = a;
+ const unsigned long *ipb = b;
- t = *reca;
- *reca = *recb;
- *recb = t;
+ if (*ipa > *ipb)
+ return 1;
+ if (*ipa < *ipb)
+ return -1;
+ return 0;
+}
+
+static void ftrace_swap_ips(void *a, void *b, int size)
+{
+ unsigned long *ipa = a;
+ unsigned long *ipb = b;
+ unsigned long t;
+
+ t = *ipa;
+ *ipa = *ipb;
+ *ipb = t;
}
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *start_pg;
struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
unsigned long count;
unsigned long *p;
unsigned long addr;
@@ -3638,8 +4126,11 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;
- pg = ftrace_allocate_pages(count);
- if (!pg)
+ sort(start, count, sizeof(*start),
+ ftrace_cmp_ips, ftrace_swap_ips);
+
+ start_pg = ftrace_allocate_pages(count);
+ if (!start_pg)
return -ENOMEM;
mutex_lock(&ftrace_lock);
@@ -3652,7 +4143,7 @@ static int ftrace_process_locs(struct module *mod,
if (!mod) {
WARN_ON(ftrace_pages || ftrace_pages_start);
/* First initialization */
- ftrace_pages = ftrace_pages_start = pg;
+ ftrace_pages = ftrace_pages_start = start_pg;
} else {
if (!ftrace_pages)
goto out;
@@ -3663,11 +4154,11 @@ static int ftrace_process_locs(struct module *mod,
ftrace_pages = ftrace_pages->next;
}
- ftrace_pages->next = pg;
- ftrace_pages = pg;
+ ftrace_pages->next = start_pg;
}
p = start;
+ pg = start_pg;
while (p < end) {
addr = ftrace_call_adjust(*p++);
/*
@@ -3678,17 +4169,23 @@ static int ftrace_process_locs(struct module *mod,
*/
if (!addr)
continue;
- if (!ftrace_record_ip(addr))
- break;
+
+ if (pg->index == pg->size) {
+ /* We should have allocated enough */
+ if (WARN_ON(!pg->next))
+ break;
+ pg = pg->next;
+ }
+
+ rec = &pg->records[pg->index++];
+ rec->ip = addr;
}
- /* These new locations need to be initialized */
- ftrace_new_pgs = pg;
+ /* We should have used all pages */
+ WARN_ON(pg->next);
- /* Make each individual set of pages sorted by ips */
- for (; pg; pg = pg->next)
- sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
- ftrace_cmp_recs, ftrace_swap_recs);
+ /* Assign the last page to ftrace_pages */
+ ftrace_pages = pg;
/*
* We only need to disable interrupts on start up
@@ -3700,7 +4197,7 @@ static int ftrace_process_locs(struct module *mod,
*/
if (!mod)
local_irq_save(flags);
- ftrace_update_code(mod);
+ ftrace_update_code(mod, start_pg);
if (!mod)
local_irq_restore(flags);
ret = 0;
@@ -3764,61 +4261,57 @@ static void ftrace_init_module(struct module *mod,
ftrace_process_locs(mod, start, end);
}
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+void ftrace_module_init(struct module *mod)
+{
+ ftrace_init_module(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
+}
+
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
- switch (val) {
- case MODULE_STATE_COMING:
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
- break;
- case MODULE_STATE_GOING:
+ if (val == MODULE_STATE_GOING)
ftrace_release_mod(mod);
- break;
- }
return 0;
}
#else
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_nb = {
- .notifier_call = ftrace_module_notify,
- .priority = 0,
+struct notifier_block ftrace_module_exit_nb = {
+ .notifier_call = ftrace_module_notify_exit,
+ .priority = INT_MIN, /* Run after anything that can remove kprobes */
};
-extern unsigned long __start_mcount_loc[];
-extern unsigned long __stop_mcount_loc[];
-
void __init ftrace_init(void)
{
- unsigned long count, addr, flags;
+ extern unsigned long __start_mcount_loc[];
+ extern unsigned long __stop_mcount_loc[];
+ unsigned long count, flags;
int ret;
- /* Keep the ftrace pointer to the stub */
- addr = (unsigned long)ftrace_stub;
-
local_irq_save(flags);
- ftrace_dyn_arch_init(&addr);
+ ret = ftrace_dyn_arch_init();
local_irq_restore(flags);
-
- /* ftrace_dyn_arch_init places the return code in addr */
- if (addr)
+ if (ret)
goto failed;
count = __stop_mcount_loc - __start_mcount_loc;
-
- ret = ftrace_dyn_table_alloc(count);
- if (ret)
+ if (!count) {
+ pr_info("ftrace: No functions to be traced?\n");
goto failed;
+ }
+
+ pr_info("ftrace: allocating %ld entries in %ld pages\n",
+ count, count / ENTRIES_PER_PAGE + 1);
last_ftrace_enabled = ftrace_enabled = 1;
@@ -3826,9 +4319,9 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_nb);
+ ret = register_module_notifier(&ftrace_module_exit_nb);
if (ret)
- pr_warning("Failed to register trace ftrace module notifier\n");
+ pr_warning("Failed to register trace ftrace module exit notifier\n");
set_ftrace_early_filters();
@@ -3841,6 +4334,8 @@ void __init ftrace_init(void)
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(global_ops)
};
static int __init ftrace_nodyn_init(void)
@@ -3848,51 +4343,162 @@ static int __init ftrace_nodyn_init(void)
ftrace_enabled = 1;
return 0;
}
-device_initcall(ftrace_nodyn_init);
+core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) \
- ({ \
- (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
- 0; \
+# define ftrace_startup(ops, command) \
+ ({ \
+ int ___ret = __register_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
+ })
+# define ftrace_shutdown(ops, command) \
+ ({ \
+ int ___ret = __unregister_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
})
-# define ftrace_shutdown(ops, command) do { } while (0)
+
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
static inline int
-ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
return 1;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
+__init void ftrace_init_global_array_ops(struct trace_array *tr)
+{
+ tr->ops = &global_ops;
+ tr->ops->private = tr;
+}
+
+void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
+{
+ /* If we filter on pids, update to use the pid function */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+ if (WARN_ON(tr->ops->func != ftrace_stub))
+ printk("ftrace ops had %pS for function\n",
+ tr->ops->func);
+ /* Only the top level instance does pid tracing */
+ if (!list_empty(&ftrace_pids)) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ }
+ }
+ tr->ops->func = func;
+ tr->ops->private = tr;
+}
+
+void ftrace_reset_array_ops(struct trace_array *tr)
+{
+ tr->ops->func = ftrace_stub;
+}
+
static void
-ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
+ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
+ return;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they must be freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+ trace_recursion_set(TRACE_CONTROL_BIT);
+
+ /*
+ * Control funcs (perf) uses RCU. Only trace if
+ * RCU is currently active.
+ */
+ if (!rcu_is_watching())
+ goto out;
+
+ do_for_each_ftrace_op(op, ftrace_control_list) {
+ if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+ !ftrace_function_local_disabled(op) &&
+ ftrace_ops_test(op, ip, regs))
+ op->func(ip, parent_ip, op, regs);
+ } while_for_each_ftrace_op(op);
+ out:
+ trace_recursion_clear(TRACE_CONTROL_BIT);
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops control_ops = {
+ .func = ftrace_ops_control_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_REGEX_LOCK(control_ops)
+};
+
+static inline void
+__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
+ int bit;
+
+ if (function_trace_stop)
+ return;
- if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_INTERNAL_BIT);
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
- op = rcu_dereference_raw(ftrace_ops_list);
- while (op != &ftrace_list_end) {
- if (ftrace_ops_test(op, ip))
- op->func(ip, parent_ip);
- op = rcu_dereference_raw(op->next);
- };
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (ftrace_ops_test(op, ip, regs)) {
+ if (WARN_ON(!op->func)) {
+ function_trace_stop = 1;
+ printk("op=%p %pS\n", op, op);
+ goto out;
+ }
+ op->func(ip, parent_ip, op, regs);
+ }
+ } while_for_each_ftrace_op(op);
+out:
preempt_enable_notrace();
- trace_recursion_clear(TRACE_INTERNAL_BIT);
+ trace_clear_recursion(bit);
+}
+
+/*
+ * Some archs only support passing ip and parent_ip. Even though
+ * the list function ignores the op parameter, we do not want any
+ * C side effects, where a function is called without the caller
+ * sending a third parameter.
+ * Archs are to support both the regs and ftrace_ops at the same time.
+ * If they support ftrace_ops, it is assumed they support regs.
+ * If call backs want to use regs, they must either check for regs
+ * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
+ * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
+ * An architecture can pass partial regs with ftrace_ops and still
+ * set the ARCH_SUPPORT_FTARCE_OPS.
+ */
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+}
+#else
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+{
+ __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
+#endif
static void clear_ftrace_swapper(void)
{
@@ -4114,7 +4720,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
if (strlen(tmp) == 0)
return 1;
- ret = strict_strtol(tmp, 10, &val);
+ ret = kstrtol(tmp, 10, &val);
if (ret < 0)
return ret;
@@ -4136,7 +4742,7 @@ static const struct file_operations ftrace_pid_fops = {
.open = ftrace_pid_open,
.write = ftrace_pid_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = tracing_lseek,
.release = ftrace_pid_release,
};
@@ -4196,18 +4802,14 @@ int register_ftrace_function(struct ftrace_ops *ops)
{
int ret = -1;
- mutex_lock(&ftrace_lock);
-
- if (unlikely(ftrace_disabled))
- goto out_unlock;
+ ftrace_ops_init(ops);
- ret = __register_ftrace_function(ops);
- if (!ret)
- ret = ftrace_startup(ops, 0);
+ mutex_lock(&ftrace_lock);
+ ret = ftrace_startup(ops, 0);
- out_unlock:
mutex_unlock(&ftrace_lock);
+
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_function);
@@ -4223,9 +4825,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
int ret;
mutex_lock(&ftrace_lock);
- ret = __unregister_ftrace_function(ops);
- if (!ret)
- ftrace_shutdown(ops, 0);
+ ret = ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
@@ -4256,12 +4856,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end) {
- if (ftrace_ops_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_ops_list->func;
- else
- ftrace_trace_function = ftrace_ops_list_func;
- }
+ if (ftrace_ops_list != &ftrace_list_end)
+ update_ftrace_function();
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
@@ -4278,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int ftrace_graph_active;
-static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -4289,6 +4884,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
trace_func_graph_ret_t ftrace_graph_return =
(trace_func_graph_ret_t)ftrace_stub;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -4423,6 +5019,34 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
return NOTIFY_DONE;
}
+static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
+{
+ if (!ftrace_ops_test(&global_ops, trace->func, NULL))
+ return 0;
+ return __ftrace_graph_entry(trace);
+}
+
+/*
+ * The function graph tracer should only trace the functions defined
+ * by set_ftrace_filter and set_ftrace_notrace. If another function
+ * tracer ops is registered, the graph tracer requires testing the
+ * function against the global ops, and not just trace any function
+ * that any ftrace_ops registered.
+ */
+static void update_function_graph_func(void)
+{
+ if (ftrace_ops_list == &ftrace_list_end ||
+ (ftrace_ops_list == &global_ops &&
+ global_ops.next == &ftrace_list_end))
+ ftrace_graph_entry = __ftrace_graph_entry;
+ else
+ ftrace_graph_entry = ftrace_graph_entry_test;
+}
+
+static struct notifier_block ftrace_suspend_notifier = {
+ .notifier_call = ftrace_suspend_notifier_call,
+};
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -4436,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
goto out;
}
- ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
register_pm_notifier(&ftrace_suspend_notifier);
ftrace_graph_active++;
@@ -4447,7 +5070,19 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
}
ftrace_graph_return = retfunc;
- ftrace_graph_entry = entryfunc;
+
+ /*
+ * Update the indirect function to the entryfunc, and the
+ * function that gets called to the entry_test first. Then
+ * call the update fgraph entry function to determine if
+ * the entryfunc should be called directly or not.
+ */
+ __ftrace_graph_entry = entryfunc;
+ ftrace_graph_entry = ftrace_graph_entry_test;
+ update_function_graph_func();
+
+ /* Function graph doesn't use the .func field of global_ops */
+ global_ops.flags |= FTRACE_OPS_FL_STUB;
ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
@@ -4466,7 +5101,9 @@ void unregister_ftrace_graph(void)
ftrace_graph_active--;
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
+ __ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
+ global_ops.flags &= ~FTRACE_OPS_FL_STUB;
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b22..1c71382b283 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
-#ifdef EVENT_POWER_TRACING_DEPRECATED
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
-#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5b7b5c1195..ff7027199a9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,16 +3,21 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ftrace_event.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
+#include <linux/trace_seq.h>
#include <linux/spinlock.h>
+#include <linux/irq_work.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
+#include <linux/kthread.h> /* for self test */
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
+#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
@@ -21,7 +26,8 @@
#include <linux/fs.h>
#include <asm/local.h>
-#include "trace.h"
+
+static void update_pages_handler(struct work_struct *work);
/*
* The ring buffer header is special. We must manually up keep it.
@@ -30,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
{
int ret;
- ret = trace_seq_printf(s, "# compressed entry header\n");
- ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
- ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
- ret = trace_seq_printf(s, "\tarray : 32 bits\n");
- ret = trace_seq_printf(s, "\n");
+ ret = trace_seq_puts(s, "# compressed entry header\n");
+ ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
+ ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
+ ret = trace_seq_puts(s, "\tarray : 32 bits\n");
+ ret = trace_seq_putc(s, '\n');
ret = trace_seq_printf(s, "\tpadding : type == %d\n",
RINGBUF_TYPE_PADDING);
ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
@@ -154,33 +160,10 @@ enum {
static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+/* Used for individual buffers (after the counter) */
+#define RB_BUFFER_OFF (1 << 20)
-/**
- * tracing_on - enable all tracing buffers
- *
- * This function enables all tracing buffers that may have been
- * disabled with tracing_off.
- */
-void tracing_on(void)
-{
- set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
-}
-EXPORT_SYMBOL_GPL(tracing_on);
-
-/**
- * tracing_off - turn off all tracing buffers
- *
- * This function stops all tracing buffers from recording data.
- * It does not disable any overhead the tracers themselves may
- * be causing. This function simply causes all recording to
- * the ring buffers to fail.
- */
-void tracing_off(void)
-{
- clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
-}
-EXPORT_SYMBOL_GPL(tracing_off);
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
/**
* tracing_off_permanent - permanently disable ring buffers
@@ -193,21 +176,12 @@ void tracing_off_permanent(void)
set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
}
-/**
- * tracing_is_on - show state of ring buffers enabled
- */
-int tracing_is_on(void)
-{
- return ring_buffer_flags == RB_BUFFERS_ON;
-}
-EXPORT_SYMBOL_GPL(tracing_is_on);
-
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
# define RB_FORCE_8BYTE_ALIGNMENT 0
# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
#else
@@ -215,6 +189,8 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
# define RB_ARCH_ALIGNMENT 8U
#endif
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
+
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -363,7 +339,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
- unsigned char data[]; /* data of buffer page */
+ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
/*
@@ -471,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
return ret;
}
+struct rb_irq_work {
+ struct irq_work work;
+ wait_queue_head_t waiters;
+ bool waiters_pending;
+};
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -481,6 +463,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
+ unsigned int nr_pages;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
@@ -489,22 +472,30 @@ struct ring_buffer_per_cpu {
unsigned long lost_events;
unsigned long last_overrun;
local_t entries_bytes;
- local_t commit_overrun;
- local_t overrun;
local_t entries;
+ local_t overrun;
+ local_t commit_overrun;
+ local_t dropped_events;
local_t committing;
local_t commits;
unsigned long read;
unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
+ /* ring buffer pages to update, > 0 to add, < 0 to remove */
+ int nr_pages_to_update;
+ struct list_head new_pages; /* new pages to add */
+ struct work_struct update_pages_work;
+ struct completion update_done;
+
+ struct rb_irq_work irq_work;
};
struct ring_buffer {
- unsigned pages;
unsigned flags;
int cpus;
atomic_t record_disabled;
+ atomic_t resize_disabled;
cpumask_var_t cpumask;
struct lock_class_key *reader_lock_key;
@@ -517,6 +508,8 @@ struct ring_buffer {
struct notifier_block cpu_notify;
#endif
u64 (*clock)(void);
+
+ struct rb_irq_work irq_work;
};
struct ring_buffer_iter {
@@ -528,6 +521,120 @@ struct ring_buffer_iter {
u64 read_stamp;
};
+/*
+ * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
+ *
+ * Schedules a delayed work to wake up any task that is blocked on the
+ * ring buffer waiters queue.
+ */
+static void rb_wake_up_waiters(struct irq_work *work)
+{
+ struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+
+ wake_up_all(&rbwork->waiters);
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ DEFINE_WAIT(wait);
+ struct rb_irq_work *work;
+
+ /*
+ * Depending on what the caller is waiting for, either any
+ * data in any cpu buffer, or a specific buffer, put the
+ * caller on the appropriate wait queue.
+ */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ work = &buffer->irq_work;
+ else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
+ cpu_buffer = buffer->buffers[cpu];
+ work = &cpu_buffer->irq_work;
+ }
+
+
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ work->waiters_pending = true;
+
+ if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
+ (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
+ schedule();
+
+ finish_wait(&work->waiters, &wait);
+ return 0;
+}
+
+/**
+ * ring_buffer_poll_wait - poll on buffer input
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @filp: the file descriptor
+ * @poll_table: The poll descriptor
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ *
+ * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * zero otherwise.
+ */
+int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+ struct file *filp, poll_table *poll_table)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *work;
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ work = &buffer->irq_work;
+ else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ work = &cpu_buffer->irq_work;
+ }
+
+ work->waiters_pending = true;
+ poll_wait(filp, &work->waiters, poll_table);
+
+ if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+ (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
#define RB_WARN_ON(b, cond) \
({ \
@@ -958,7 +1065,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * check_pages - integrity check of buffer pages
+ * rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
*
* As a safety measure we check to make sure the data pages have not
@@ -969,6 +1076,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ /* Reset the head page if it exists */
+ if (cpu_buffer->head_page)
+ rb_set_head_page(cpu_buffer);
+
rb_head_page_deactivate(cpu_buffer);
if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -995,14 +1106,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
{
+ int i;
struct buffer_page *bpage, *tmp;
- LIST_HEAD(pages);
- unsigned i;
-
- WARN_ON(!nr_pages);
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1013,15 +1120,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu_buffer->cpu));
+ cpu_to_node(cpu));
if (!bpage)
goto free_pages;
- rb_check_bpage(cpu_buffer, bpage);
+ list_add(&bpage->list, pages);
- list_add(&bpage->list, &pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
goto free_pages;
@@ -1029,6 +1134,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}
+ return 0;
+
+free_pages:
+ list_for_each_entry_safe(bpage, tmp, pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+
+ return -ENOMEM;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ LIST_HEAD(pages);
+
+ WARN_ON(!nr_pages);
+
+ if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ return -ENOMEM;
+
/*
* The ring buffer page list is a circular list that does not
* start and end with a list head. All page list items point to
@@ -1037,20 +1163,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->pages = pages.next;
list_del(&pages);
+ cpu_buffer->nr_pages = nr_pages;
+
rb_check_pages(cpu_buffer);
return 0;
-
- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- return -ENOMEM;
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1067,6 +1188,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
raw_spin_lock_init(&cpu_buffer->reader_lock);
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
+ init_completion(&cpu_buffer->update_done);
+ init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
+ init_waitqueue_head(&cpu_buffer->irq_work.waiters);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1083,8 +1208,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
rb_init_page(bpage->page);
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
- ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ ret = rb_allocate_pages(cpu_buffer, nr_pages);
if (ret < 0)
goto fail_free_reader;
@@ -1131,7 +1257,7 @@ static int rb_cpu_notify(struct notifier_block *self,
#endif
/**
- * ring_buffer_alloc - allocate a new ring_buffer
+ * __ring_buffer_alloc - allocate a new ring_buffer
* @size: the size in bytes per cpu that is needed.
* @flags: attributes to set for the ring buffer.
*
@@ -1145,7 +1271,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
{
struct ring_buffer *buffer;
int bsize;
- int cpu;
+ int cpu, nr_pages;
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1156,14 +1282,17 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
+ init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
+ init_waitqueue_head(&buffer->irq_work.waiters);
+
/* need at least two pages */
- if (buffer->pages < 2)
- buffer->pages = 2;
+ if (nr_pages < 2)
+ nr_pages = 2;
/*
* In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1171,7 +1300,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
* In that off case, we need to allocate for all possible cpus.
*/
#ifdef CONFIG_HOTPLUG_CPU
- get_online_cpus();
+ cpu_notifier_register_begin();
cpumask_copy(buffer->cpumask, cpu_online_mask);
#else
cpumask_copy(buffer->cpumask, cpu_possible_mask);
@@ -1186,7 +1315,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
for_each_buffer_cpu(buffer, cpu) {
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
goto fail_free_buffers;
}
@@ -1194,10 +1323,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
#ifdef CONFIG_HOTPLUG_CPU
buffer->cpu_notify.notifier_call = rb_cpu_notify;
buffer->cpu_notify.priority = 0;
- register_cpu_notifier(&buffer->cpu_notify);
+ __register_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_done();
#endif
- put_online_cpus();
mutex_init(&buffer->mutex);
return buffer;
@@ -1211,7 +1340,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- put_online_cpus();
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
fail_free_buffer:
kfree(buffer);
@@ -1228,16 +1359,17 @@ ring_buffer_free(struct ring_buffer *buffer)
{
int cpu;
- get_online_cpus();
-
#ifdef CONFIG_HOTPLUG_CPU
- unregister_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&buffer->cpu_notify);
#endif
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
- put_online_cpus();
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
@@ -1254,77 +1386,241 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-static void
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
- struct buffer_page *bpage;
- struct list_head *p;
- unsigned i;
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
+{
+ return local_read(&bpage->write) & RB_WRITE_MASK;
+}
+
+static int
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+{
+ struct list_head *tail_page, *to_remove, *next_page;
+ struct buffer_page *to_remove_page, *tmp_iter_page;
+ struct buffer_page *last_page, *first_page;
+ unsigned int nr_removed;
+ unsigned long head_bit;
+ int page_entries;
+
+ head_bit = 0;
raw_spin_lock_irq(&cpu_buffer->reader_lock);
- rb_head_page_deactivate(cpu_buffer);
+ atomic_inc(&cpu_buffer->record_disabled);
+ /*
+ * We don't race with the readers since we have acquired the reader
+ * lock. We also don't race with writers after disabling recording.
+ * This makes it easy to figure out the first and the last page to be
+ * removed from the list. We unlink all the pages in between including
+ * the first and last pages. This is done in a busy loop so that we
+ * lose the least number of traces.
+ * The pages are freed after we restart recording and unlock readers.
+ */
+ tail_page = &cpu_buffer->tail_page->list;
- for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
- goto out;
- p = cpu_buffer->pages->next;
- bpage = list_entry(p, struct buffer_page, list);
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
+ /*
+ * tail page might be on reader page, we remove the next page
+ * from the ring buffer
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+ tail_page = rb_list_head(tail_page->next);
+ to_remove = tail_page;
+
+ /* start of pages to remove */
+ first_page = list_entry(rb_list_head(to_remove->next),
+ struct buffer_page, list);
+
+ for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
+ to_remove = rb_list_head(to_remove)->next;
+ head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
- if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
- goto out;
- rb_reset_cpu(cpu_buffer);
- rb_check_pages(cpu_buffer);
+ next_page = rb_list_head(to_remove)->next;
-out:
+ /*
+ * Now we remove all pages between tail_page and next_page.
+ * Make sure that we have head_bit value preserved for the
+ * next page
+ */
+ tail_page->next = (struct list_head *)((unsigned long)next_page |
+ head_bit);
+ next_page = rb_list_head(next_page);
+ next_page->prev = tail_page;
+
+ /* make sure pages points to a valid page in the ring buffer */
+ cpu_buffer->pages = next_page;
+
+ /* update head page */
+ if (head_bit)
+ cpu_buffer->head_page = list_entry(next_page,
+ struct buffer_page, list);
+
+ /*
+ * change read pointer to make sure any read iterators reset
+ * themselves
+ */
+ cpu_buffer->read = 0;
+
+ /* pages are removed, resume tracing and then free the pages */
+ atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
+
+ /* last buffer page to remove */
+ last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
+ list);
+ tmp_iter_page = first_page;
+
+ do {
+ to_remove_page = tmp_iter_page;
+ rb_inc_page(cpu_buffer, &tmp_iter_page);
+
+ /* update the counters */
+ page_entries = rb_page_entries(to_remove_page);
+ if (page_entries) {
+ /*
+ * If something was added to this page, it was full
+ * since it is not the tail page. So we deduct the
+ * bytes consumed in ring buffer from here.
+ * Increment overrun to account for the lost events.
+ */
+ local_add(page_entries, &cpu_buffer->overrun);
+ local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ }
+
+ /*
+ * We have already removed references to this list item, just
+ * free up the buffer_page and its page
+ */
+ free_buffer_page(to_remove_page);
+ nr_removed--;
+
+ } while (to_remove_page != last_page);
+
+ RB_WARN_ON(cpu_buffer, nr_removed);
+
+ return nr_removed == 0;
}
-static void
-rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *pages, unsigned nr_pages)
+static int
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *bpage;
- struct list_head *p;
- unsigned i;
+ struct list_head *pages = &cpu_buffer->new_pages;
+ int retries, success;
raw_spin_lock_irq(&cpu_buffer->reader_lock);
- rb_head_page_deactivate(cpu_buffer);
+ /*
+ * We are holding the reader lock, so the reader page won't be swapped
+ * in the ring buffer. Now we are racing with the writer trying to
+ * move head page and the tail page.
+ * We are going to adapt the reader page update process where:
+ * 1. We first splice the start and end of list of new pages between
+ * the head page and its previous page.
+ * 2. We cmpxchg the prev_page->next to point from head page to the
+ * start of new pages list.
+ * 3. Finally, we update the head->prev to the end of new list.
+ *
+ * We will try this process 10 times, to make sure that we don't keep
+ * spinning.
+ */
+ retries = 10;
+ success = 0;
+ while (retries--) {
+ struct list_head *head_page, *prev_page, *r;
+ struct list_head *last_page, *first_page;
+ struct list_head *head_page_with_bit;
- for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
- goto out;
- p = pages->next;
- bpage = list_entry(p, struct buffer_page, list);
- list_del_init(&bpage->list);
- list_add_tail(&bpage->list, cpu_buffer->pages);
+ head_page = &rb_set_head_page(cpu_buffer)->list;
+ if (!head_page)
+ break;
+ prev_page = head_page->prev;
+
+ first_page = pages->next;
+ last_page = pages->prev;
+
+ head_page_with_bit = (struct list_head *)
+ ((unsigned long)head_page | RB_PAGE_HEAD);
+
+ last_page->next = head_page_with_bit;
+ first_page->prev = prev_page;
+
+ r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
+
+ if (r == head_page_with_bit) {
+ /*
+ * yay, we replaced the page pointer to our new list,
+ * now, we just have to update to head page's prev
+ * pointer to point to end of list
+ */
+ head_page->prev = last_page;
+ success = 1;
+ break;
+ }
}
- rb_reset_cpu(cpu_buffer);
- rb_check_pages(cpu_buffer);
-out:
+ if (success)
+ INIT_LIST_HEAD(pages);
+ /*
+ * If we weren't successful in adding in new pages, warn and stop
+ * tracing
+ */
+ RB_WARN_ON(cpu_buffer, !success);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ /* free pages if they weren't inserted */
+ if (!success) {
+ struct buffer_page *bpage, *tmp;
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+ return success;
+}
+
+static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int success;
+
+ if (cpu_buffer->nr_pages_to_update > 0)
+ success = rb_insert_pages(cpu_buffer);
+ else
+ success = rb_remove_pages(cpu_buffer,
+ -cpu_buffer->nr_pages_to_update);
+
+ if (success)
+ cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
+}
+
+static void update_pages_handler(struct work_struct *work)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
+ struct ring_buffer_per_cpu, update_pages_work);
+ rb_update_pages(cpu_buffer);
+ complete(&cpu_buffer->update_done);
}
/**
* ring_buffer_resize - resize the ring buffer
* @buffer: the buffer to resize.
* @size: the new size.
+ * @cpu_id: the cpu buffer to resize
*
* Minimum size is 2 * BUF_PAGE_SIZE.
*
- * Returns -1 on failure.
+ * Returns 0 on success and < 0 on failure.
*/
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+ int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages, rm_pages, new_pages;
- struct buffer_page *bpage, *tmp;
- unsigned long buffer_size;
- LIST_HEAD(pages);
- int i, cpu;
+ unsigned nr_pages;
+ int cpu, err = 0;
/*
* Always succeed at resizing a non-existent buffer:
@@ -1332,115 +1628,186 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
if (!buffer)
return size;
+ /* Make sure the requested buffer exists */
+ if (cpu_id != RING_BUFFER_ALL_CPUS &&
+ !cpumask_test_cpu(cpu_id, buffer->cpumask))
+ return size;
+
size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
- buffer_size = buffer->pages * BUF_PAGE_SIZE;
/* we need a minimum of two pages */
if (size < BUF_PAGE_SIZE * 2)
size = BUF_PAGE_SIZE * 2;
- if (size == buffer_size)
- return size;
-
- atomic_inc(&buffer->record_disabled);
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
- /* Make sure all writers are done with this buffer. */
- synchronize_sched();
+ /*
+ * Don't succeed if resizing is disabled, as a reader might be
+ * manipulating the ring buffer and is expecting a sane state while
+ * this is true.
+ */
+ if (atomic_read(&buffer->resize_disabled))
+ return -EBUSY;
+ /* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
- get_online_cpus();
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
-
- if (size < buffer_size) {
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ /* calculate the pages to update */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
- /* easy case, just free pages */
- if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
- goto out_fail;
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+ /*
+ * nothing more to do for removing pages or no update
+ */
+ if (cpu_buffer->nr_pages_to_update <= 0)
+ continue;
+ /*
+ * to add pages, make sure all new pages can be
+ * allocated without receiving ENOMEM
+ */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto out_err;
+ }
+ }
- rm_pages = buffer->pages - nr_pages;
+ get_online_cpus();
+ /*
+ * Fire off all the required work handlers
+ * We can't schedule on offline CPUs, but it's not necessary
+ * since we can change their buffer sizes without any race.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ /* The update must run on the CPU that is being updated. */
+ preempt_disable();
+ if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+ rb_update_pages(cpu_buffer);
+ cpu_buffer->nr_pages_to_update = 0;
+ } else {
+ /*
+ * Can not disable preemption for schedule_work_on()
+ * on PREEMPT_RT.
+ */
+ preempt_enable();
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ preempt_disable();
+ }
+ preempt_enable();
+ }
+ /* wait for all the updates to complete */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- rb_remove_pages(cpu_buffer, rm_pages);
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ if (cpu_online(cpu))
+ wait_for_completion(&cpu_buffer->update_done);
+ cpu_buffer->nr_pages_to_update = 0;
}
- goto out;
- }
- /*
- * This is a bit more difficult. We only want to add pages
- * when we can allocate enough for all CPUs. We do this
- * by allocating all the pages and storing them on a local
- * link list. If we succeed in our allocation, then we
- * add these pages to the cpu_buffers. Otherwise we just free
- * them all and return -ENOMEM;
- */
- if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
- goto out_fail;
+ put_online_cpus();
+ } else {
+ /* Make sure this CPU has been intitialized */
+ if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
+ goto out;
- new_pages = nr_pages - buffer->pages;
+ cpu_buffer = buffer->buffers[cpu_id];
- for_each_buffer_cpu(buffer, cpu) {
- for (i = 0; i < new_pages; i++) {
- struct page *page;
+ if (nr_pages == cpu_buffer->nr_pages)
+ goto out;
+
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (cpu_buffer->nr_pages_to_update > 0 &&
+ __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu_id)) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ get_online_cpus();
+
+ preempt_disable();
+ /* The update must run on the CPU that is being updated. */
+ if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+ rb_update_pages(cpu_buffer);
+ else {
/*
- * __GFP_NORETRY flag makes sure that the allocation
- * fails gracefully without invoking oom-killer and
- * the system is not destabilized.
+ * Can not disable preemption for schedule_work_on()
+ * on PREEMPT_RT.
*/
- bpage = kzalloc_node(ALIGN(sizeof(*bpage),
- cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu));
- if (!bpage)
- goto free_pages;
- list_add(&bpage->list, &pages);
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ preempt_enable();
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ preempt_disable();
}
- }
+ preempt_enable();
- for_each_buffer_cpu(buffer, cpu) {
- cpu_buffer = buffer->buffers[cpu];
- rb_insert_pages(cpu_buffer, &pages, new_pages);
+ cpu_buffer->nr_pages_to_update = 0;
+ put_online_cpus();
}
- if (RB_WARN_ON(buffer, !list_empty(&pages)))
- goto out_fail;
-
out:
- buffer->pages = nr_pages;
- put_online_cpus();
+ /*
+ * The ring buffer resize can happen with the ring buffer
+ * enabled, so that the update disturbs the tracing as little
+ * as possible. But if the buffer is disabled, we do not need
+ * to worry about that, and we can take the time to verify
+ * that the buffer is not corrupt.
+ */
+ if (atomic_read(&buffer->record_disabled)) {
+ atomic_inc(&buffer->record_disabled);
+ /*
+ * Even though the buffer was disabled, we must make sure
+ * that it is truly disabled before calling rb_check_pages.
+ * There could have been a race between checking
+ * record_disable and incrementing it.
+ */
+ synchronize_sched();
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_check_pages(cpu_buffer);
+ }
+ atomic_dec(&buffer->record_disabled);
+ }
+
mutex_unlock(&buffer->mutex);
+ return size;
- atomic_dec(&buffer->record_disabled);
+ out_err:
+ for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_page *bpage, *tmp;
- return size;
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->nr_pages_to_update = 0;
- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- put_online_cpus();
- mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -ENOMEM;
+ if (list_empty(&cpu_buffer->new_pages))
+ continue;
- /*
- * Something went totally wrong, and we are too paranoid
- * to even clean up the mess.
- */
- out_fail:
- put_online_cpus();
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -1;
+ return err;
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -1479,21 +1846,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned long rb_page_write(struct buffer_page *bpage)
-{
- return local_read(&bpage->write) & RB_WRITE_MASK;
-}
-
static inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
-static inline unsigned long rb_page_entries(struct buffer_page *bpage)
-{
- return local_read(&bpage->entries) & RB_WRITE_MASK;
-}
-
/* Size is determined by what has been committed */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
@@ -1542,7 +1899,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
- max_count = cpu_buffer->buffer->pages * 100;
+ max_count = cpu_buffer->nr_pages * 100;
while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -1626,7 +1983,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
}
/**
- * ring_buffer_update_event - update event type and data
+ * rb_update_event - update event type and data
* @event: the even to update
* @type: the type of event
* @length: the size of the event field in the ring buffer
@@ -1961,8 +2318,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* If we are not in overwrite mode,
* this is easy, just stop here.
*/
- if (!(buffer->flags & RB_FL_OVERWRITE))
+ if (!(buffer->flags & RB_FL_OVERWRITE)) {
+ local_inc(&cpu_buffer->dropped_events);
goto out_reset;
+ }
ret = rb_handle_head_page(cpu_buffer,
tail_page,
@@ -2040,6 +2399,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
write &= RB_WRITE_MASK;
tail = write - length;
+ /*
+ * If this is the first commit on the page, then it has the same
+ * timestamp as the page itself.
+ */
+ if (!tail)
+ delta = 0;
+
/* See if we shot pass the end of this buffer page */
if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail,
@@ -2201,7 +2567,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
if (unlikely(test_time_stamp(delta))) {
int local_clock_stable = 1;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- local_clock_stable = sched_clock_stable;
+ local_clock_stable = sched_clock_stable();
#endif
WARN_ONCE(delta > (1ULL << 59),
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
@@ -2233,41 +2599,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#ifdef CONFIG_TRACING
-#define TRACE_RECURSIVE_DEPTH 16
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+static DEFINE_PER_CPU(unsigned int, current_context);
-/* Keep this code out of the fast path cache */
-static noinline void trace_recursive_fail(void)
+static __always_inline int trace_recursive_lock(void)
{
- /* Disable all tracing before we do anything else */
- tracing_off_permanent();
+ unsigned int val = this_cpu_read(current_context);
+ int bit;
- printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
- "HC[%lu]:SC[%lu]:NMI[%lu]\n",
- trace_recursion_buffer(),
- hardirq_count() >> HARDIRQ_SHIFT,
- softirq_count() >> SOFTIRQ_SHIFT,
- in_nmi());
-
- WARN_ON_ONCE(1);
-}
-
-static inline int trace_recursive_lock(void)
-{
- trace_recursion_inc();
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
- if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
- return 0;
+ if (unlikely(val & (1 << bit)))
+ return 1;
- trace_recursive_fail();
+ val |= (1 << bit);
+ this_cpu_write(current_context, val);
- return -1;
+ return 0;
}
-static inline void trace_recursive_unlock(void)
+static __always_inline void trace_recursive_unlock(void)
{
- WARN_ON_ONCE(!trace_recursion_buffer());
+ unsigned int val = this_cpu_read(current_context);
- trace_recursion_dec();
+ val--;
+ val &= this_cpu_read(current_context);
+ this_cpu_write(current_context, val);
}
#else
@@ -2375,6 +2776,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_end_commit(cpu_buffer);
}
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+ if (buffer->irq_work.waiters_pending) {
+ buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&buffer->irq_work.work);
+ }
+
+ if (cpu_buffer->irq_work.waiters_pending) {
+ cpu_buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
+}
+
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
@@ -2394,6 +2811,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
+ rb_wakeups(buffer, cpu_buffer);
+
trace_recursive_unlock();
preempt_enable_notrace();
@@ -2526,8 +2945,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
* and not the length of the event which would hold the header.
*/
int ring_buffer_write(struct ring_buffer *buffer,
- unsigned long length,
- void *data)
+ unsigned long length,
+ void *data)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
@@ -2566,6 +2985,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
+ rb_wakeups(buffer, cpu_buffer);
+
ret = 0;
out:
preempt_enable_notrace();
@@ -2619,6 +3040,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
/**
+ * ring_buffer_record_off - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * This is different than ring_buffer_record_disable() as
+ * it works like an on/off switch, where as the disable() version
+ * must be paired with a enable().
+ */
+void ring_buffer_record_off(struct ring_buffer *buffer)
+{
+ unsigned int rd;
+ unsigned int new_rd;
+
+ do {
+ rd = atomic_read(&buffer->record_disabled);
+ new_rd = rd | RB_BUFFER_OFF;
+ } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_off);
+
+/**
+ * ring_buffer_record_on - restart writes into the buffer
+ * @buffer: The ring buffer to start writes to.
+ *
+ * This enables all writes to the buffer that was disabled by
+ * ring_buffer_record_off().
+ *
+ * This is different than ring_buffer_record_enable() as
+ * it works like an on/off switch, where as the enable() version
+ * must be paired with a disable().
+ */
+void ring_buffer_record_on(struct ring_buffer *buffer)
+{
+ unsigned int rd;
+ unsigned int new_rd;
+
+ do {
+ rd = atomic_read(&buffer->record_disabled);
+ new_rd = rd & ~RB_BUFFER_OFF;
+ } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_on);
+
+/**
+ * ring_buffer_record_is_on - return true if the ring buffer can write
+ * @buffer: The ring buffer to see if write is enabled
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes.
+ */
+int ring_buffer_record_is_on(struct ring_buffer *buffer)
+{
+ return !atomic_read(&buffer->record_disabled);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
@@ -2678,12 +3156,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
-unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
{
unsigned long flags;
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
- unsigned long ret;
+ u64 ret = 0;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
@@ -2698,7 +3176,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
bpage = cpu_buffer->reader_page;
else
bpage = rb_set_head_page(cpu_buffer);
- ret = bpage->page->time_stamp;
+ if (bpage)
+ ret = bpage->page->time_stamp;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
return ret;
@@ -2744,7 +3223,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
/**
- * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
+ * buffer wrapping around (only if RB_FL_OVERWRITE is on).
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
@@ -2764,7 +3244,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
/**
- * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
+ * commits failing due to the buffer wrapping around while there are uncommitted
+ * events, such as during an interrupt storm.
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
@@ -2785,6 +3267,46 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
/**
+ * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
+ * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->dropped_events);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
+
+/**
+ * ring_buffer_read_events_cpu - get the number of events successfully read
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of events read
+ */
+unsigned long
+ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->read;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
+
+/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
@@ -2992,6 +3514,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (cpu_buffer->commit_page == cpu_buffer->reader_page)
goto out;
+ /* Don't bother swapping if the ring buffer is empty */
+ if (rb_num_of_entries(cpu_buffer) == 0)
+ goto out;
+
/*
* Reset the reader page to size zero.
*/
@@ -3005,6 +3531,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Splice the empty reader page into the list around the head.
*/
reader = rb_set_head_page(cpu_buffer);
+ if (!reader)
+ goto out;
cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
cpu_buffer->reader_page->list.prev = reader->list.prev;
@@ -3137,7 +3665,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
/* check for end of page padding */
if ((iter->head >= rb_page_size(iter->head_page)) &&
(iter->head_page != cpu_buffer->commit_page))
- rb_advance_iter(iter);
+ rb_inc_iter(iter);
}
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
@@ -3438,11 +3966,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* expected.
*
* After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_prepare_sync.
+ * expected to make at least one call to ring_buffer_read_prepare_sync.
* Afterwards, ring_buffer_read_start is invoked to get things going
* for real.
*
- * This overall must be paired with ring_buffer_finish.
+ * This overall must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
@@ -3461,6 +3989,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
iter->cpu_buffer = cpu_buffer;
+ atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
return iter;
@@ -3490,7 +4019,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
* an intervening ring_buffer_read_prepare_sync must have been
* performed.
*
- * Must be paired with ring_buffer_finish.
+ * Must be paired with ring_buffer_read_finish.
*/
void
ring_buffer_read_start(struct ring_buffer_iter *iter)
@@ -3512,7 +4041,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
/**
- * ring_buffer_finish - finish reading the iterator of the buffer
+ * ring_buffer_read_finish - finish reading the iterator of the buffer
* @iter: The iterator retrieved by ring_buffer_start
*
* This re-enables the recording to the buffer, and frees the
@@ -3522,8 +4051,20 @@ void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
+
+ /*
+ * Ring buffer is disabled from recording, here's a good place
+ * to check the integrity of the ring buffer.
+ * Must prevent readers from trying to read, as the check
+ * clears the HEAD page and readers require it.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_check_pages(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->buffer->resize_disabled);
kfree(iter);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3563,9 +4104,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
*/
-unsigned long ring_buffer_size(struct ring_buffer *buffer)
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
{
- return BUF_PAGE_SIZE * buffer->pages;
+ /*
+ * Earlier, this method returned
+ * BUF_PAGE_SIZE * buffer->nr_pages
+ * Since the nr_pages field is now removed, we have converted this to
+ * return the per cpu buffer value.
+ */
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
@@ -3586,14 +4136,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->commit_page = cpu_buffer->head_page;
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->read = 0;
- local_set(&cpu_buffer->commit_overrun, 0);
local_set(&cpu_buffer->entries_bytes, 0);
local_set(&cpu_buffer->overrun, 0);
+ local_set(&cpu_buffer->commit_overrun, 0);
+ local_set(&cpu_buffer->dropped_events, 0);
local_set(&cpu_buffer->entries, 0);
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
@@ -3622,8 +4174,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
+ atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
+ /* Make sure all commits have finished */
+ synchronize_sched();
+
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3639,6 +4195,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&buffer->resize_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -3740,8 +4297,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
!cpumask_test_cpu(cpu, buffer_b->cpumask))
goto out;
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
/* At least make sure the two buffers are somewhat the same */
- if (buffer_a->pages != buffer_b->pages)
+ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
ret = -EAGAIN;
@@ -3755,9 +4315,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
if (atomic_read(&buffer_b->record_disabled))
goto out;
- cpu_buffer_a = buffer_a->buffers[cpu];
- cpu_buffer_b = buffer_b->buffers[cpu];
-
if (atomic_read(&cpu_buffer_a->record_disabled))
goto out;
@@ -3799,6 +4356,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
/**
* ring_buffer_alloc_read_page - allocate a page to read from buffer
* @buffer: the buffer to allocate for.
+ * @cpu: the cpu buffer to allocate.
*
* This function is used in conjunction with ring_buffer_read_page.
* When reading a full page from the ring buffer, these functions
@@ -3856,7 +4414,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* to swap with a page in the ring buffer.
*
* for example:
- * rpage = ring_buffer_alloc_read_page(buffer);
+ * rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (!rpage)
* return error;
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
@@ -4039,68 +4597,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
-#ifdef CONFIG_TRACING
-static ssize_t
-rb_simple_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long *p = filp->private_data;
- char buf[64];
- int r;
-
- if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
- r = sprintf(buf, "permanently disabled\n");
- else
- r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-rb_simple_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long *p = filp->private_data;
- unsigned long val;
- int ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- if (val)
- set_bit(RB_BUFFERS_ON_BIT, p);
- else
- clear_bit(RB_BUFFERS_ON_BIT, p);
-
- (*ppos)++;
-
- return cnt;
-}
-
-static const struct file_operations rb_simple_fops = {
- .open = tracing_open_generic,
- .read = rb_simple_read,
- .write = rb_simple_write,
- .llseek = default_llseek,
-};
-
-
-static __init int rb_init_debugfs(void)
-{
- struct dentry *d_tracer;
-
- d_tracer = tracing_init_dentry();
-
- trace_create_file("tracing_on", 0644, d_tracer,
- &ring_buffer_flags, &rb_simple_fops);
-
- return 0;
-}
-
-fs_initcall(rb_init_debugfs);
-#endif
-
#ifdef CONFIG_HOTPLUG_CPU
static int rb_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
@@ -4108,6 +4604,8 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
+ int cpu_i, nr_pages_same;
+ unsigned int nr_pages;
switch (action) {
case CPU_UP_PREPARE:
@@ -4115,8 +4613,23 @@ static int rb_cpu_notify(struct notifier_block *self,
if (cpumask_test_cpu(cpu, buffer->cpumask))
return NOTIFY_OK;
+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
+ }
+ }
+ /* allocate minimum pages, user can later expand it */
+ if (!nr_pages_same)
+ nr_pages = 2;
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu]) {
WARN(1, "failed to allocate ring buffer on CPU %ld\n",
cpu);
@@ -4139,3 +4652,320 @@ static int rb_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
#endif
+
+#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
+/*
+ * This is a basic integrity check of the ring buffer.
+ * Late in the boot cycle this test will run when configured in.
+ * It will kick off a thread per CPU that will go into a loop
+ * writing to the per cpu ring buffer various sizes of data.
+ * Some of the data will be large items, some small.
+ *
+ * Another thread is created that goes into a spin, sending out
+ * IPIs to the other CPUs to also write into the ring buffer.
+ * this is to test the nesting ability of the buffer.
+ *
+ * Basic stats are recorded and reported. If something in the
+ * ring buffer should happen that's not expected, a big warning
+ * is displayed and all ring buffers are disabled.
+ */
+static struct task_struct *rb_threads[NR_CPUS] __initdata;
+
+struct rb_test_data {
+ struct ring_buffer *buffer;
+ unsigned long events;
+ unsigned long bytes_written;
+ unsigned long bytes_alloc;
+ unsigned long bytes_dropped;
+ unsigned long events_nested;
+ unsigned long bytes_written_nested;
+ unsigned long bytes_alloc_nested;
+ unsigned long bytes_dropped_nested;
+ int min_size_nested;
+ int max_size_nested;
+ int max_size;
+ int min_size;
+ int cpu;
+ int cnt;
+};
+
+static struct rb_test_data rb_data[NR_CPUS] __initdata;
+
+/* 1 meg per cpu */
+#define RB_TEST_BUFFER_SIZE 1048576
+
+static char rb_string[] __initdata =
+ "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
+ "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
+ "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
+
+static bool rb_test_started __initdata;
+
+struct rb_item {
+ int size;
+ char str[];
+};
+
+static __init int rb_write_something(struct rb_test_data *data, bool nested)
+{
+ struct ring_buffer_event *event;
+ struct rb_item *item;
+ bool started;
+ int event_len;
+ int size;
+ int len;
+ int cnt;
+
+ /* Have nested writes different that what is written */
+ cnt = data->cnt + (nested ? 27 : 0);
+
+ /* Multiply cnt by ~e, to make some unique increment */
+ size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
+
+ len = size + sizeof(struct rb_item);
+
+ started = rb_test_started;
+ /* read rb_test_started before checking buffer enabled */
+ smp_rmb();
+
+ event = ring_buffer_lock_reserve(data->buffer, len);
+ if (!event) {
+ /* Ignore dropped events before test starts. */
+ if (started) {
+ if (nested)
+ data->bytes_dropped += len;
+ else
+ data->bytes_dropped_nested += len;
+ }
+ return len;
+ }
+
+ event_len = ring_buffer_event_length(event);
+
+ if (RB_WARN_ON(data->buffer, event_len < len))
+ goto out;
+
+ item = ring_buffer_event_data(event);
+ item->size = size;
+ memcpy(item->str, rb_string, size);
+
+ if (nested) {
+ data->bytes_alloc_nested += event_len;
+ data->bytes_written_nested += len;
+ data->events_nested++;
+ if (!data->min_size_nested || len < data->min_size_nested)
+ data->min_size_nested = len;
+ if (len > data->max_size_nested)
+ data->max_size_nested = len;
+ } else {
+ data->bytes_alloc += event_len;
+ data->bytes_written += len;
+ data->events++;
+ if (!data->min_size || len < data->min_size)
+ data->max_size = len;
+ if (len > data->max_size)
+ data->max_size = len;
+ }
+
+ out:
+ ring_buffer_unlock_commit(data->buffer, event);
+
+ return 0;
+}
+
+static __init int rb_test(void *arg)
+{
+ struct rb_test_data *data = arg;
+
+ while (!kthread_should_stop()) {
+ rb_write_something(data, false);
+ data->cnt++;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Now sleep between a min of 100-300us and a max of 1ms */
+ usleep_range(((data->cnt % 3) + 1) * 100, 1000);
+ }
+
+ return 0;
+}
+
+static __init void rb_ipi(void *ignore)
+{
+ struct rb_test_data *data;
+ int cpu = smp_processor_id();
+
+ data = &rb_data[cpu];
+ rb_write_something(data, true);
+}
+
+static __init int rb_hammer_test(void *arg)
+{
+ while (!kthread_should_stop()) {
+
+ /* Send an IPI to all cpus to write data! */
+ smp_call_function(rb_ipi, NULL, 1);
+ /* No sleep, but for non preempt, let others run */
+ schedule();
+ }
+
+ return 0;
+}
+
+static __init int test_ringbuffer(void)
+{
+ struct task_struct *rb_hammer;
+ struct ring_buffer *buffer;
+ int cpu;
+ int ret = 0;
+
+ pr_info("Running ring buffer tests...\n");
+
+ buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
+ if (WARN_ON(!buffer))
+ return 0;
+
+ /* Disable buffer so that threads can't write to it yet */
+ ring_buffer_record_off(buffer);
+
+ for_each_online_cpu(cpu) {
+ rb_data[cpu].buffer = buffer;
+ rb_data[cpu].cpu = cpu;
+ rb_data[cpu].cnt = cpu;
+ rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
+ "rbtester/%d", cpu);
+ if (WARN_ON(!rb_threads[cpu])) {
+ pr_cont("FAILED\n");
+ ret = -1;
+ goto out_free;
+ }
+
+ kthread_bind(rb_threads[cpu], cpu);
+ wake_up_process(rb_threads[cpu]);
+ }
+
+ /* Now create the rb hammer! */
+ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
+ if (WARN_ON(!rb_hammer)) {
+ pr_cont("FAILED\n");
+ ret = -1;
+ goto out_free;
+ }
+
+ ring_buffer_record_on(buffer);
+ /*
+ * Show buffer is enabled before setting rb_test_started.
+ * Yes there's a small race window where events could be
+ * dropped and the thread wont catch it. But when a ring
+ * buffer gets enabled, there will always be some kind of
+ * delay before other CPUs see it. Thus, we don't care about
+ * those dropped events. We care about events dropped after
+ * the threads see that the buffer is active.
+ */
+ smp_wmb();
+ rb_test_started = true;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Just run for 10 seconds */;
+ schedule_timeout(10 * HZ);
+
+ kthread_stop(rb_hammer);
+
+ out_free:
+ for_each_online_cpu(cpu) {
+ if (!rb_threads[cpu])
+ break;
+ kthread_stop(rb_threads[cpu]);
+ }
+ if (ret) {
+ ring_buffer_free(buffer);
+ return ret;
+ }
+
+ /* Report! */
+ pr_info("finished\n");
+ for_each_online_cpu(cpu) {
+ struct ring_buffer_event *event;
+ struct rb_test_data *data = &rb_data[cpu];
+ struct rb_item *item;
+ unsigned long total_events;
+ unsigned long total_dropped;
+ unsigned long total_written;
+ unsigned long total_alloc;
+ unsigned long total_read = 0;
+ unsigned long total_size = 0;
+ unsigned long total_len = 0;
+ unsigned long total_lost = 0;
+ unsigned long lost;
+ int big_event_size;
+ int small_event_size;
+
+ ret = -1;
+
+ total_events = data->events + data->events_nested;
+ total_written = data->bytes_written + data->bytes_written_nested;
+ total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
+ total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
+
+ big_event_size = data->max_size + data->max_size_nested;
+ small_event_size = data->min_size + data->min_size_nested;
+
+ pr_info("CPU %d:\n", cpu);
+ pr_info(" events: %ld\n", total_events);
+ pr_info(" dropped bytes: %ld\n", total_dropped);
+ pr_info(" alloced bytes: %ld\n", total_alloc);
+ pr_info(" written bytes: %ld\n", total_written);
+ pr_info(" biggest event: %d\n", big_event_size);
+ pr_info(" smallest event: %d\n", small_event_size);
+
+ if (RB_WARN_ON(buffer, total_dropped))
+ break;
+
+ ret = 0;
+
+ while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
+ total_lost += lost;
+ item = ring_buffer_event_data(event);
+ total_len += ring_buffer_event_length(event);
+ total_size += item->size + sizeof(struct rb_item);
+ if (memcmp(&item->str[0], rb_string, item->size) != 0) {
+ pr_info("FAILED!\n");
+ pr_info("buffer had: %.*s\n", item->size, item->str);
+ pr_info("expected: %.*s\n", item->size, rb_string);
+ RB_WARN_ON(buffer, 1);
+ ret = -1;
+ break;
+ }
+ total_read++;
+ }
+ if (ret)
+ break;
+
+ ret = -1;
+
+ pr_info(" read events: %ld\n", total_read);
+ pr_info(" lost events: %ld\n", total_lost);
+ pr_info(" total events: %ld\n", total_lost + total_read);
+ pr_info(" recorded len bytes: %ld\n", total_len);
+ pr_info(" recorded size bytes: %ld\n", total_size);
+ if (total_lost)
+ pr_info(" With dropped events, record len and size may not match\n"
+ " alloced and written from above\n");
+ if (!total_lost) {
+ if (RB_WARN_ON(buffer, total_len != total_alloc ||
+ total_size != total_written))
+ break;
+ }
+ if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
+ break;
+
+ ret = 0;
+ }
+ if (!ret)
+ pr_info("Ring buffer PASSED!\n");
+
+ ring_buffer_free(buffer);
+ return 0;
+}
+
+late_initcall(test_ringbuffer);
+#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b9..0434ff1b808 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
module_param(write_iteration, uint, 0644);
MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
-static int producer_nice = 19;
-static int consumer_nice = 19;
+static int producer_nice = MAX_NICE;
+static int consumer_nice = MAX_NICE;
static int producer_fifo = -1;
static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
/* Let the user know that the test is running at low priority */
if (producer_fifo < 0 && consumer_fifo < 0 &&
- producer_nice == 19 && consumer_nice == 19)
+ producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f1bc5d2a0..291397e6666 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
/*
* ring buffer based function tracer
*
- * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
* Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
*
* Originally taken from the RT patch by:
@@ -9,7 +9,7 @@
*
* Based on code from the latency_tracer, that is:
* Copyright (C) 2004-2006 Ingo Molnar
- * Copyright (C) 2004 William Lee Irwin III
+ * Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/ring_buffer.h>
#include <generated/utsrelease.h>
@@ -36,7 +36,9 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
+#include <linux/nmi.h>
#include <linux/fs.h>
+#include <linux/sched/rt.h>
#include "trace.h"
#include "trace_output.h"
@@ -45,7 +47,7 @@
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
*/
-int ring_buffer_expanded;
+bool ring_buffer_expanded;
/*
* We need to change this state when a selftest is running.
@@ -71,12 +73,20 @@ static struct tracer_flags dummy_tracer_flags = {
.opts = dummy_tracer_opt
};
-static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+static int
+dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
return 0;
}
/*
+ * To prevent the comm cache from being overwritten when no
+ * tracing is active, only save the comm when a trace event
+ * occurred.
+ */
+static DEFINE_PER_CPU(bool, trace_cmdline_save);
+
+/*
* Kill all tracing for good (never come back).
* It is initialized to 1 but will turn to zero if the initialization
* of the tracer is successful. But that is the only place that sets
@@ -86,18 +96,6 @@ static int tracing_disabled = 1;
DEFINE_PER_CPU(int, ftrace_cpu_disabled);
-static inline void ftrace_disable_cpu(void)
-{
- preempt_disable();
- __this_cpu_inc(ftrace_cpu_disabled);
-}
-
-static inline void ftrace_enable_cpu(void)
-{
- __this_cpu_dec(ftrace_cpu_disabled);
- preempt_enable();
-}
-
cpumask_var_t __read_mostly tracing_buffer_mask;
/*
@@ -118,18 +116,23 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
enum ftrace_dump_mode ftrace_dump_on_oops;
-static int tracing_set_tracer(const char *buf);
+/* When set, tracing will stop when a WARN*() is hit */
+int __disable_trace_on_warning;
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
+static bool allocate_snapshot;
+
static int __init set_cmdline_ftrace(char *str)
{
- strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+ strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
- ring_buffer_expanded = 1;
+ ring_buffer_expanded = true;
return 1;
}
__setup("ftrace=", set_cmdline_ftrace);
@@ -150,6 +153,46 @@ static int __init set_ftrace_dump_on_oops(char *str)
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
+static int __init stop_trace_on_warning(char *str)
+{
+ __disable_trace_on_warning = 1;
+ return 1;
+}
+__setup("traceoff_on_warning=", stop_trace_on_warning);
+
+static int __init boot_alloc_snapshot(char *str)
+{
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ ring_buffer_expanded = true;
+ return 1;
+}
+__setup("alloc_snapshot", boot_alloc_snapshot);
+
+
+static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
+static char *trace_boot_options __initdata;
+
+static int __init set_trace_boot_options(char *str)
+{
+ strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ trace_boot_options = trace_boot_options_buf;
+ return 0;
+}
+__setup("trace_options=", set_trace_boot_options);
+
+static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
+static char *trace_boot_clock __initdata;
+
+static int __init set_trace_boot_clock(char *str)
+{
+ strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
+ trace_boot_clock = trace_boot_clock_buf;
+ return 0;
+}
+__setup("trace_clock=", set_trace_boot_clock);
+
+
unsigned long long ns2usecs(cycle_t nsec)
{
nsec += 500;
@@ -171,58 +214,104 @@ unsigned long long ns2usecs(cycle_t nsec)
*/
static struct trace_array global_trace;
-static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+LIST_HEAD(ftrace_trace_arrays);
+
+int trace_array_get(struct trace_array *this_tr)
+{
+ struct trace_array *tr;
+ int ret = -ENODEV;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == this_tr) {
+ tr->ref++;
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
-int filter_current_check_discard(struct ring_buffer *buffer,
- struct ftrace_event_call *call, void *rec,
- struct ring_buffer_event *event)
+static void __trace_array_put(struct trace_array *this_tr)
{
- return filter_check_discard(call, rec, buffer, event);
+ WARN_ON(!this_tr->ref);
+ this_tr->ref--;
}
-EXPORT_SYMBOL_GPL(filter_current_check_discard);
-cycle_t ftrace_now(int cpu)
+void trace_array_put(struct trace_array *this_tr)
+{
+ mutex_lock(&trace_types_lock);
+ __trace_array_put(this_tr);
+ mutex_unlock(&trace_types_lock);
+}
+
+int filter_check_discard(struct ftrace_event_file *file, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(file->filter, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(filter_check_discard);
+
+int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(call->filter, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(call_filter_check_discard);
+
+static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
/* Early boot up does not have a buffer yet */
- if (!global_trace.buffer)
+ if (!buf->buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
- ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
+ ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
return ts;
}
-/*
- * The max_tr is used to snapshot the global_trace when a maximum
- * latency is reached. Some tracers will use this to store a maximum
- * trace while it continues examining live traces.
- *
- * The buffers for the max_tr are set up the same as the global_trace.
- * When a snapshot is taken, the link list of the max_tr is swapped
- * with the link list of the global_trace and the buffers are reset for
- * the global_trace so the tracing can continue.
- */
-static struct trace_array max_tr;
-
-static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
-
-/* tracer_enabled is used to toggle activation of a tracer */
-static int tracer_enabled = 1;
+cycle_t ftrace_now(int cpu)
+{
+ return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
+}
/**
- * tracing_is_enabled - return tracer_enabled status
+ * tracing_is_enabled - Show if global_trace has been disabled
*
- * This function is used by other tracers to know the status
- * of the tracer_enabled flag. Tracers may use this function
- * to know if it should enable their features when starting
- * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+ * Shows if the global trace has been enabled or not. It uses the
+ * mirror flag "buffer_disabled" to be used in fast paths such as for
+ * the irqsoff tracer. But it may be inaccurate due to races. If you
+ * need to know the accurate state, use tracing_is_on() which is a little
+ * slower, but accurate.
*/
int tracing_is_enabled(void)
{
- return tracer_enabled;
+ /*
+ * For quick access (irqsoff uses this in fast path), just
+ * return the mirror variable of the state of the ring buffer.
+ * It's a little racy, but we don't really care.
+ */
+ smp_rmb();
+ return !global_trace.buffer_disabled;
}
/*
@@ -242,13 +331,10 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
/* trace_types holds a link list of available tracers. */
static struct tracer *trace_types __read_mostly;
-/* current_trace points to the tracer that is currently active */
-static struct tracer *current_trace __read_mostly;
-
/*
* trace_types_lock is used to protect the trace_types list.
*/
-static DEFINE_MUTEX(trace_types_lock);
+DEFINE_MUTEX(trace_types_lock);
/*
* serialize the access of the ring buffer
@@ -278,13 +364,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
static inline void trace_access_lock(int cpu)
{
- if (cpu == TRACE_PIPE_ALL_CPU) {
+ if (cpu == RING_BUFFER_ALL_CPUS) {
/* gain it for accessing the whole ring buffer. */
down_write(&all_cpu_access_lock);
} else {
/* gain it for accessing a cpu ring buffer. */
- /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
down_read(&all_cpu_access_lock);
/* Secondly block other access to this @cpu ring buffer. */
@@ -294,7 +380,7 @@ static inline void trace_access_lock(int cpu)
static inline void trace_access_unlock(int cpu)
{
- if (cpu == TRACE_PIPE_ALL_CPU) {
+ if (cpu == RING_BUFFER_ALL_CPUS) {
up_write(&all_cpu_access_lock);
} else {
mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -332,41 +418,337 @@ static inline void trace_access_lock_init(void)
#endif
-/* trace_wait is a waitqueue for tasks blocked on trace_poll */
-static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
-
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
- TRACE_ITER_IRQ_INFO;
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
-static int trace_stop_count;
-static DEFINE_RAW_SPINLOCK(tracing_start_lock);
+static void tracer_tracing_on(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ ring_buffer_record_on(tr->trace_buffer.buffer);
+ /*
+ * This flag is looked at when buffers haven't been allocated
+ * yet, or by some tracers (like irqsoff), that just want to
+ * know if the ring buffer has been disabled, but it can handle
+ * races of where it gets disabled but we still do a record.
+ * As the check is in the fast path of the tracers, it is more
+ * important to be fast than accurate.
+ */
+ tr->buffer_disabled = 0;
+ /* Make the flag seen by readers */
+ smp_wmb();
+}
-static void wakeup_work_handler(struct work_struct *work)
+/**
+ * tracing_on - enable tracing buffers
+ *
+ * This function enables tracing buffers that may have been
+ * disabled with tracing_off.
+ */
+void tracing_on(void)
{
- wake_up(&trace_wait);
+ tracer_tracing_on(&global_trace);
}
+EXPORT_SYMBOL_GPL(tracing_on);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ * @size: The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ int alloc;
+ int pc;
-static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+ pc = preempt_count();
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ alloc = sizeof(*entry) + size + 2; /* possible \n added */
+
+ local_save_flags(irq_flags);
+ buffer = global_trace.trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ irq_flags, pc);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, str, size);
+
+ /* Add a newline if necessary */
+ if (entry->buf[size - 1] != '\n') {
+ entry->buf[size] = '\n';
+ entry->buf[size + 1] = '\0';
+ } else
+ entry->buf[size] = '\0';
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(__trace_puts);
+
+/**
+ * __trace_bputs - write the pointer to a constant string into trace buffer
+ * @ip: The address of the caller
+ * @str: The constant string to write to the buffer to
+ */
+int __trace_bputs(unsigned long ip, const char *str)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct bputs_entry *entry;
+ unsigned long irq_flags;
+ int size = sizeof(struct bputs_entry);
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ local_save_flags(irq_flags);
+ buffer = global_trace.trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ irq_flags, pc);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->str = str;
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__trace_bputs);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
/**
- * trace_wake_up - wake up tasks waiting for trace input
+ * trace_snapshot - take a snapshot of the current buffer.
*
- * Schedules a delayed work to wake up any task that is blocked on the
- * trace_wait queue. These is used with trace_poll for tasks polling the
- * trace.
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ *
+ * Note, make sure to allocate the snapshot with either
+ * a tracing_snapshot_alloc(), or by doing it manually
+ * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ *
+ * If the snapshot buffer is not allocated, it will stop tracing.
+ * Basically making a permanent snapshot.
*/
-void trace_wake_up(void)
+void tracing_snapshot(void)
{
- const unsigned long delay = msecs_to_jiffies(2);
+ struct trace_array *tr = &global_trace;
+ struct tracer *tracer = tr->current_trace;
+ unsigned long flags;
- if (trace_flags & TRACE_ITER_BLOCK)
+ if (in_nmi()) {
+ internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ internal_trace_puts("*** snapshot is being ignored ***\n");
return;
- schedule_delayed_work(&wakeup_work, delay);
+ }
+
+ if (!tr->allocated_snapshot) {
+ internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
+ internal_trace_puts("*** stopping trace here! ***\n");
+ tracing_off();
+ return;
+ }
+
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer->use_max_tr) {
+ internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
+ internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ update_max_tr(tr, current, smp_processor_id());
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot);
+
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+ struct trace_buffer *size_buf, int cpu_id);
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
+
+static int alloc_snapshot(struct trace_array *tr)
+{
+ int ret;
+
+ if (!tr->allocated_snapshot) {
+
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ return ret;
+
+ tr->allocated_snapshot = true;
+ }
+
+ return 0;
}
+static void free_snapshot(struct trace_array *tr)
+{
+ /*
+ * We don't free the ring buffer. instead, resize it because
+ * The max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want preserve it.
+ */
+ ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&tr->max_buffer, 1);
+ tracing_reset_online_cpus(&tr->max_buffer);
+ tr->allocated_snapshot = false;
+}
+
+/**
+ * tracing_alloc_snapshot - allocate snapshot buffer.
+ *
+ * This only allocates the snapshot buffer if it isn't already
+ * allocated - it doesn't also take a snapshot.
+ *
+ * This is meant to be used in cases where the snapshot buffer needs
+ * to be set up for events that can't sleep but need to be able to
+ * trigger a snapshot.
+ */
+int tracing_alloc_snapshot(void)
+{
+ struct trace_array *tr = &global_trace;
+ int ret;
+
+ ret = alloc_snapshot(tr);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
+
+/**
+ * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to trace_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ */
+void tracing_snapshot_alloc(void)
+{
+ int ret;
+
+ ret = tracing_alloc_snapshot();
+ if (ret < 0)
+ return;
+
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+#else
+void tracing_snapshot(void)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot);
+int tracing_alloc_snapshot(void)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
+void tracing_snapshot_alloc(void)
+{
+ /* Give warning */
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+static void tracer_tracing_off(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ ring_buffer_record_off(tr->trace_buffer.buffer);
+ /*
+ * This flag is looked at when buffers haven't been allocated
+ * yet, or by some tracers (like irqsoff), that just want to
+ * know if the ring buffer has been disabled, but it can handle
+ * races of where it gets disabled but we still do a record.
+ * As the check is in the fast path of the tracers, it is more
+ * important to be fast than accurate.
+ */
+ tr->buffer_disabled = 1;
+ /* Make the flag seen by readers */
+ smp_wmb();
+}
+
+/**
+ * tracing_off - turn off tracing buffers
+ *
+ * This function stops the tracing buffers from recording data.
+ * It does not disable any overhead the tracers themselves may
+ * be causing. This function simply causes all recording to
+ * the ring buffers to fail.
+ */
+void tracing_off(void)
+{
+ tracer_tracing_off(&global_trace);
+}
+EXPORT_SYMBOL_GPL(tracing_off);
+
+void disable_trace_on_warning(void)
+{
+ if (__disable_trace_on_warning)
+ tracing_off();
+}
+
+/**
+ * tracer_tracing_is_on - show real state of ring buffer enabled
+ * @tr : the trace array to know if ring buffer is enabled
+ *
+ * Shows real state of the ring buffer if it is enabled or not.
+ */
+static int tracer_tracing_is_on(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ return ring_buffer_record_is_on(tr->trace_buffer.buffer);
+ return !tr->buffer_disabled;
+}
+
+/**
+ * tracing_is_on - show state of ring buffers enabled
+ */
+int tracing_is_on(void)
+{
+ return tracer_tracing_is_on(&global_trace);
+}
+EXPORT_SYMBOL_GPL(tracing_is_on);
+
static int __init set_buf_size(char *str)
{
unsigned long buf_size;
@@ -384,15 +766,15 @@ __setup("trace_buf_size=", set_buf_size);
static int __init set_tracing_thresh(char *str)
{
- unsigned long threshhold;
+ unsigned long threshold;
int ret;
if (!str)
return 0;
- ret = strict_strtoul(str, 0, &threshhold);
+ ret = kstrtoul(str, 0, &threshold);
if (ret < 0)
return 0;
- tracing_thresh = threshhold * 1000;
+ tracing_thresh = threshold * 1000;
return 1;
}
__setup("tracing_thresh=", set_tracing_thresh);
@@ -428,20 +810,24 @@ static const char *trace_options[] = {
"overwrite",
"disable_on_free",
"irq-info",
+ "markers",
+ "function-trace",
NULL
};
static struct {
u64 (*func)(void);
const char *name;
+ int in_ns; /* is this clock in nanoseconds? */
} trace_clocks[] = {
- { trace_clock_local, "local" },
- { trace_clock_global, "global" },
- { trace_clock_counter, "counter" },
+ { trace_clock_local, "local", 1 },
+ { trace_clock_global, "global", 1 },
+ { trace_clock_counter, "counter", 0 },
+ { trace_clock_jiffies, "uptime", 0 },
+ { trace_clock, "perf", 1 },
+ ARCH_TRACE_CLOCKS
};
-int trace_clock_id;
-
/*
* trace_parser_get_init - gets the buffer for trace parser
*/
@@ -536,9 +922,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
if (isspace(ch)) {
parser->buffer[parser->idx] = 0;
parser->cont = false;
- } else {
+ } else if (parser->idx < parser->size - 1) {
parser->cont = true;
parser->buffer[parser->idx++] = ch;
+ } else {
+ ret = -EINVAL;
+ goto out;
}
*ppos += read;
@@ -575,7 +964,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- void *ret;
if (s->len <= s->readpos)
return -EBUSY;
@@ -583,35 +971,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
len = s->len - s->readpos;
if (cnt > len)
cnt = len;
- ret = memcpy(buf, s->buffer + s->readpos, cnt);
- if (!ret)
- return -EFAULT;
+ memcpy(buf, s->buffer + s->readpos, cnt);
s->readpos += cnt;
return cnt;
}
-/*
- * ftrace_max_lock is used to protect the swapping of buffers
- * when taking a max snapshot. The buffers themselves are
- * protected by per_cpu spinlocks. But the action of the swap
- * needs its own lock.
- *
- * This is defined as a arch_spinlock_t in order to help
- * with performance when lockdep debugging is enabled.
- *
- * It is also used in other places outside the update_max_tr
- * so it needs to be defined outside of the
- * CONFIG_TRACER_MAX_TRACE.
- */
-static arch_spinlock_t ftrace_max_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
unsigned long __read_mostly tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
-unsigned long __read_mostly tracing_max_latency;
-
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
@@ -620,20 +988,29 @@ unsigned long __read_mostly tracing_max_latency;
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_array_cpu *data = tr->data[cpu];
- struct trace_array_cpu *max_data;
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct trace_buffer *max_buf = &tr->max_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+ struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
- max_tr.cpu = cpu;
- max_tr.time_start = data->preempt_timestamp;
+ max_buf->cpu = cpu;
+ max_buf->time_start = data->preempt_timestamp;
- max_data = max_tr.data[cpu];
- max_data->saved_latency = tracing_max_latency;
+ max_data->saved_latency = tr->max_latency;
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
- max_data->uid = task_uid(tsk);
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
max_data->policy = tsk->policy;
max_data->rt_priority = tsk->rt_priority;
@@ -654,23 +1031,27 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct ring_buffer *buf = tr->buffer;
+ struct ring_buffer *buf;
- if (trace_stop_count)
+ if (tr->stop_count)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
return;
}
- arch_spin_lock(&ftrace_max_lock);
- tr->buffer = max_tr.buffer;
- max_tr.buffer = buf;
+ arch_spin_lock(&tr->max_lock);
+
+ buf = tr->trace_buffer.buffer;
+ tr->trace_buffer.buffer = tr->max_buffer.buffer;
+ tr->max_buffer.buffer = buf;
__update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&tr->max_lock);
}
/**
@@ -686,20 +1067,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
int ret;
- if (trace_stop_count)
+ if (tr->stop_count)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
return;
}
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&tr->max_lock);
- ftrace_disable_cpu();
-
- ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+ ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
if (ret == -EBUSY) {
/*
@@ -708,19 +1088,92 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* the max trace buffer (no one writes directly to it)
* and flag that it failed.
*/
- trace_array_printk(&max_tr, _THIS_IP_,
+ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
"Failed to swap buffers due to commit in progress\n");
}
- ftrace_enable_cpu();
-
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
__update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&tr->max_lock);
}
#endif /* CONFIG_TRACER_MAX_TRACE */
+static int wait_on_pipe(struct trace_iterator *iter)
+{
+ /* Iterators are static, they should be filled or empty */
+ if (trace_buffer_iter(iter, iter->cpu_file))
+ return 0;
+
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int run_tracer_selftest(struct tracer *type)
+{
+ struct trace_array *tr = &global_trace;
+ struct tracer *saved_tracer = tr->current_trace;
+ int ret;
+
+ if (!type->selftest || tracing_selftest_disabled)
+ return 0;
+
+ /*
+ * Run a selftest on this tracer.
+ * Here we reset the trace buffer, and set the current
+ * tracer to be this tracer. The tracer can then run some
+ * internal tracing to verify that everything is in order.
+ * If we fail, we do not register this tracer.
+ */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+ tr->current_trace = type;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (type->use_max_tr) {
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ tr->allocated_snapshot = true;
+ }
+#endif
+
+ /* the test is responsible for initializing and enabling */
+ pr_info("Testing tracer %s: ", type->name);
+ ret = type->selftest(type, tr);
+ /* the test is responsible for resetting too */
+ tr->current_trace = saved_tracer;
+ if (ret) {
+ printk(KERN_CONT "FAILED!\n");
+ /* Add the warning after printing 'FAILED' */
+ WARN_ON(1);
+ return -1;
+ }
+ /* Only reset on passing, to avoid touching corrupted buffers */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (type->use_max_tr) {
+ tr->allocated_snapshot = false;
+
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(tr->max_buffer.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ }
+#endif
+
+ printk(KERN_CONT "PASSED\n");
+ return 0;
+}
+#else
+static inline int run_tracer_selftest(struct tracer *type)
+{
+ return 0;
+}
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
/**
* register_tracer - register a tracer with the ftrace system.
* @type - the plugin for the tracer
@@ -728,8 +1181,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* Register a new plugin tracer.
*/
int register_tracer(struct tracer *type)
-__releases(kernel_lock)
-__acquires(kernel_lock)
{
struct tracer *t;
int ret = 0;
@@ -765,49 +1216,10 @@ __acquires(kernel_lock)
else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
- if (!type->wait_pipe)
- type->wait_pipe = default_wait_pipe;
-
-
-#ifdef CONFIG_FTRACE_STARTUP_TEST
- if (type->selftest && !tracing_selftest_disabled) {
- struct tracer *saved_tracer = current_trace;
- struct trace_array *tr = &global_trace;
- /*
- * Run a selftest on this tracer.
- * Here we reset the trace buffer, and set the current
- * tracer to be this tracer. The tracer can then run some
- * internal tracing to verify that everything is in order.
- * If we fail, we do not register this tracer.
- */
- tracing_reset_online_cpus(tr);
-
- current_trace = type;
-
- /* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size);
-
- /* the test is responsible for initializing and enabling */
- pr_info("Testing tracer %s: ", type->name);
- ret = type->selftest(type, tr);
- /* the test is responsible for resetting too */
- current_trace = saved_tracer;
- if (ret) {
- printk(KERN_CONT "FAILED!\n");
- goto out;
- }
- /* Only reset on passing, to avoid touching corrupted buffers */
- tracing_reset_online_cpus(tr);
-
- /* Shrink the max buffer again */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1);
-
- printk(KERN_CONT "PASSED\n");
- }
-#endif
+ ret = run_tracer_selftest(type);
+ if (ret < 0)
+ goto out;
type->next = trace_types;
trace_types = type;
@@ -824,10 +1236,10 @@ __acquires(kernel_lock)
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
- tracing_set_tracer(type->name);
+ tracing_set_tracer(&global_trace, type->name);
default_bootup_tracer = NULL;
/* disable other selftests, since this will break it. */
- tracing_selftest_disabled = 1;
+ tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
type->name);
@@ -837,116 +1249,126 @@ __acquires(kernel_lock)
return ret;
}
-void unregister_tracer(struct tracer *type)
+void tracing_reset(struct trace_buffer *buf, int cpu)
{
- struct tracer **t;
+ struct ring_buffer *buffer = buf->buffer;
- mutex_lock(&trace_types_lock);
- for (t = &trace_types; *t; t = &(*t)->next) {
- if (*t == type)
- goto found;
- }
- pr_info("Tracer %s not registered\n", type->name);
- goto out;
-
- found:
- *t = (*t)->next;
-
- if (type == current_trace && tracer_enabled) {
- tracer_enabled = 0;
- tracing_stop();
- if (current_trace->stop)
- current_trace->stop(&global_trace);
- current_trace = &nop_trace;
- }
-out:
- mutex_unlock(&trace_types_lock);
-}
-
-static void __tracing_reset(struct ring_buffer *buffer, int cpu)
-{
- ftrace_disable_cpu();
- ring_buffer_reset_cpu(buffer, cpu);
- ftrace_enable_cpu();
-}
-
-void tracing_reset(struct trace_array *tr, int cpu)
-{
- struct ring_buffer *buffer = tr->buffer;
+ if (!buffer)
+ return;
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
synchronize_sched();
- __tracing_reset(buffer, cpu);
+ ring_buffer_reset_cpu(buffer, cpu);
ring_buffer_record_enable(buffer);
}
-void tracing_reset_online_cpus(struct trace_array *tr)
+void tracing_reset_online_cpus(struct trace_buffer *buf)
{
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = buf->buffer;
int cpu;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
synchronize_sched();
- tr->time_start = ftrace_now(tr->cpu);
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
for_each_online_cpu(cpu)
- __tracing_reset(buffer, cpu);
+ ring_buffer_reset_cpu(buffer, cpu);
ring_buffer_record_enable(buffer);
}
-void tracing_reset_current(int cpu)
+/* Must have trace_types_lock held */
+void tracing_reset_all_online_cpus(void)
{
- tracing_reset(&global_trace, cpu);
-}
+ struct trace_array *tr;
-void tracing_reset_current_online_cpus(void)
-{
- tracing_reset_online_cpus(&global_trace);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ tracing_reset_online_cpus(&tr->trace_buffer);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ tracing_reset_online_cpus(&tr->max_buffer);
+#endif
+ }
}
-#define SAVED_CMDLINES 128
+#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
-static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
-static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
-static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
-static int cmdline_idx;
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct saved_cmdlines_buffer {
+ unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+ unsigned *map_cmdline_to_pid;
+ unsigned cmdline_num;
+ int cmdline_idx;
+ char *saved_cmdlines;
+};
+static struct saved_cmdlines_buffer *savedcmd;
/* temporary disable recording */
static atomic_t trace_record_cmdline_disabled __read_mostly;
-static void trace_init_cmdlines(void)
+static inline char *get_saved_cmdlines(int idx)
{
- memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
- memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
- cmdline_idx = 0;
+ return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
}
-int is_tracing_stopped(void)
+static inline void set_cmdline(int idx, const char *cmdline)
{
- return trace_stop_count;
+ memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
-/**
- * ftrace_off_permanent - disable all ftrace code permanently
- *
- * This should only be called when a serious anomally has
- * been detected. This will turn off the function tracing,
- * ring buffers, and other tracing utilites. It takes no
- * locks and can be called from any context.
- */
-void ftrace_off_permanent(void)
+static int allocate_cmdlines_buffer(unsigned int val,
+ struct saved_cmdlines_buffer *s)
+{
+ s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_pid)
+ return -ENOMEM;
+
+ s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
+ if (!s->saved_cmdlines) {
+ kfree(s->map_cmdline_to_pid);
+ return -ENOMEM;
+ }
+
+ s->cmdline_idx = 0;
+ s->cmdline_num = val;
+ memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
+ sizeof(s->map_pid_to_cmdline));
+ memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_pid));
+
+ return 0;
+}
+
+static int trace_create_savedcmd(void)
+{
+ int ret;
+
+ savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
+ if (!savedcmd)
+ return -ENOMEM;
+
+ ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
+ if (ret < 0) {
+ kfree(savedcmd);
+ savedcmd = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int is_tracing_stopped(void)
{
- tracing_disabled = 1;
- ftrace_stop();
- tracing_off_permanent();
+ return global_trace.stop_count;
}
/**
@@ -963,32 +1385,64 @@ void tracing_start(void)
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&tracing_start_lock, flags);
- if (--trace_stop_count) {
- if (trace_stop_count < 0) {
+ raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+ if (--global_trace.stop_count) {
+ if (global_trace.stop_count < 0) {
/* Someone screwed up their debugging */
WARN_ON_ONCE(1);
- trace_stop_count = 0;
+ global_trace.stop_count = 0;
}
goto out;
}
/* Prevent the buffers from switching */
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&global_trace.max_lock);
- buffer = global_trace.buffer;
+ buffer = global_trace.trace_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
- buffer = max_tr.buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ buffer = global_trace.max_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
+#endif
+
+ arch_spin_unlock(&global_trace.max_lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_start_tr(struct trace_array *tr)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ if (tracing_disabled)
+ return;
- arch_spin_unlock(&ftrace_max_lock);
+ /* If global, we need to also start the max tracer */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return tracing_start();
+
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+
+ if (--tr->stop_count) {
+ if (tr->stop_count < 0) {
+ /* Someone screwed up their debugging */
+ WARN_ON_ONCE(1);
+ tr->stop_count = 0;
+ }
+ goto out;
+ }
+
+ buffer = tr->trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
- ftrace_start();
out:
- raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
@@ -1002,36 +1456,58 @@ void tracing_stop(void)
struct ring_buffer *buffer;
unsigned long flags;
- ftrace_stop();
- raw_spin_lock_irqsave(&tracing_start_lock, flags);
- if (trace_stop_count++)
+ raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+ if (global_trace.stop_count++)
goto out;
/* Prevent the buffers from switching */
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&global_trace.max_lock);
- buffer = global_trace.buffer;
+ buffer = global_trace.trace_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
- buffer = max_tr.buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ buffer = global_trace.max_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
+#endif
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&global_trace.max_lock);
out:
- raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
+ raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ /* If global, we need to also stop the max tracer */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return tracing_stop();
+
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (tr->stop_count++)
+ goto out;
+
+ buffer = tr->trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+
+ out:
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
void trace_stop_cmdline_recording(void);
-static void trace_save_cmdline(struct task_struct *tsk)
+static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
- return;
+ return 0;
/*
* It's not the end of the world if we don't get
@@ -1040,11 +1516,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
* so if we miss here, then better luck next time.
*/
if (!arch_spin_trylock(&trace_cmdline_lock))
- return;
+ return 0;
- idx = map_pid_to_cmdline[tsk->pid];
+ idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
- idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+ idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
/*
* Check whether the cmdline buffer at idx has a pid
@@ -1052,22 +1528,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
* need to clear the map_pid_to_cmdline. Otherwise we
* would read the new comm for the old pid.
*/
- pid = map_cmdline_to_pid[idx];
+ pid = savedcmd->map_cmdline_to_pid[idx];
if (pid != NO_CMDLINE_MAP)
- map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
+ savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
- map_cmdline_to_pid[idx] = tsk->pid;
- map_pid_to_cmdline[tsk->pid] = idx;
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
+ savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
- cmdline_idx = idx;
+ savedcmd->cmdline_idx = idx;
}
- memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+ set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
}
-void trace_find_cmdline(int pid, char comm[])
+static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
@@ -1086,13 +1564,19 @@ void trace_find_cmdline(int pid, char comm[])
return;
}
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- map = map_pid_to_cmdline[pid];
+ map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
- strcpy(comm, saved_cmdlines[map]);
+ strcpy(comm, get_saved_cmdlines(map));
else
strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
@@ -1100,11 +1584,14 @@ void trace_find_cmdline(int pid, char comm[])
void tracing_record_cmdline(struct task_struct *tsk)
{
- if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
- !tracing_is_on())
+ if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
return;
- trace_save_cmdline(tsk);
+ if (!__this_cpu_read(trace_cmdline_save))
+ return;
+
+ if (trace_save_cmdline(tsk))
+ __this_cpu_write(trace_cmdline_save, false);
}
void
@@ -1115,7 +1602,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->padding = 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1124,7 +1610,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
- (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+ (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
@@ -1147,34 +1634,66 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
return event;
}
+void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_cmdline_save, true);
+ ring_buffer_unlock_commit(buffer, event);
+}
+
static inline void
__trace_buffer_unlock_commit(struct ring_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
- int wake)
+ unsigned long flags, int pc)
{
- ring_buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
ftrace_trace_userstack(buffer, flags, pc);
-
- if (wake)
- trace_wake_up();
}
void trace_buffer_unlock_commit(struct ring_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc)
{
- __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
+ __trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
+
+static struct ring_buffer *temp_buffer;
+
+struct ring_buffer_event *
+trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
+ struct ftrace_event_file *ftrace_file,
+ int type, unsigned long len,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *entry;
+
+ *current_rb = ftrace_file->tr->trace_buffer.buffer;
+ entry = trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+ /*
+ * If tracing is off, but we have triggers enabled
+ * we still need to look at the event data. Use the temp_buffer
+ * to store the trace event for the tigger to use. It's recusive
+ * safe and will not be recorded anywhere.
+ */
+ if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+ *current_rb = temp_buffer;
+ entry = trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+ }
+ return entry;
}
+EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
struct ring_buffer_event *
trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
int type, unsigned long len,
unsigned long flags, int pc)
{
- *current_rb = global_trace.buffer;
+ *current_rb = global_trace.trace_buffer.buffer;
return trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
}
@@ -1184,29 +1703,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc)
{
- __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
+ __trace_buffer_unlock_commit(buffer, event, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
-{
- __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
-}
-EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
-
-void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc,
- struct pt_regs *regs)
+void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc,
+ struct pt_regs *regs)
{
- ring_buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
-EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
+EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
struct ring_buffer_event *event)
@@ -1221,7 +1732,7 @@ trace_function(struct trace_array *tr,
int pc)
{
struct ftrace_event_call *call = &event_function;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -1237,17 +1748,8 @@ trace_function(struct trace_array *tr,
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!filter_check_discard(call, entry, buffer, event))
- ring_buffer_unlock_commit(buffer, event);
-}
-
-void
-ftrace(struct trace_array *tr, struct trace_array_cpu *data,
- unsigned long ip, unsigned long parent_ip, unsigned long flags,
- int pc)
-{
- if (likely(!atomic_read(&data->disabled)))
- trace_function(tr, ip, parent_ip, flags, pc);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
}
#ifdef CONFIG_STACKTRACE
@@ -1282,7 +1784,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
preempt_disable_notrace();
- use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+ use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
/*
* We don't need any atomic variables, just a barrier.
* If an interrupt comes in, we don't care, because it would
@@ -1292,7 +1794,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
barrier();
if (use_stack == 1) {
- trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
+ trace.entries = this_cpu_ptr(ftrace_stack.calls);
trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
if (regs)
@@ -1330,13 +1832,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
entry->size = trace.nr_entries;
- if (!filter_check_discard(call, entry, buffer, event))
- ring_buffer_unlock_commit(buffer, event);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
out:
/* Again, don't let gcc optimize things here */
barrier();
- __get_cpu_var(ftrace_stack_reserve)--;
+ __this_cpu_dec(ftrace_stack_reserve);
preempt_enable_notrace();
}
@@ -1362,13 +1864,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
}
/**
* trace_dump_stack - record a stack back trace in the trace buffer
+ * @skip: Number of functions to skip (helper handlers)
*/
-void trace_dump_stack(void)
+void trace_dump_stack(int skip)
{
unsigned long flags;
@@ -1377,8 +1880,13 @@ void trace_dump_stack(void)
local_save_flags(flags);
- /* skipping 3 traces, seems to get us at the caller of this function */
- __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
+ /*
+ * Skip 3 more, seems to get us at the caller of
+ * this function.
+ */
+ skip += 3;
+ __ftrace_trace_stack(global_trace.trace_buffer.buffer,
+ flags, skip, preempt_count(), NULL);
}
static DEFINE_PER_CPU(int, user_stack_count);
@@ -1426,8 +1934,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
trace.entries = entry->caller;
save_stack_trace_user(&trace);
- if (!filter_check_discard(call, entry, buffer, event))
- ring_buffer_unlock_commit(buffer, event);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
out_drop_count:
__this_cpu_dec(user_stack_count);
@@ -1444,25 +1952,161 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
#endif /* CONFIG_STACKTRACE */
+/* created for use with alloc_percpu */
+struct trace_buffer_struct {
+ char buffer[TRACE_BUF_SIZE];
+};
+
+static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct *trace_percpu_sirq_buffer;
+static struct trace_buffer_struct *trace_percpu_irq_buffer;
+static struct trace_buffer_struct *trace_percpu_nmi_buffer;
+
+/*
+ * The buffer used is dependent on the context. There is a per cpu
+ * buffer for normal context, softirq contex, hard irq context and
+ * for NMI context. Thise allows for lockless recording.
+ *
+ * Note, if the buffers failed to be allocated, then this returns NULL
+ */
+static char *get_trace_buf(void)
+{
+ struct trace_buffer_struct *percpu_buffer;
+
+ /*
+ * If we have allocated per cpu buffers, then we do not
+ * need to do any locking.
+ */
+ if (in_nmi())
+ percpu_buffer = trace_percpu_nmi_buffer;
+ else if (in_irq())
+ percpu_buffer = trace_percpu_irq_buffer;
+ else if (in_softirq())
+ percpu_buffer = trace_percpu_sirq_buffer;
+ else
+ percpu_buffer = trace_percpu_buffer;
+
+ if (!percpu_buffer)
+ return NULL;
+
+ return this_cpu_ptr(&percpu_buffer->buffer[0]);
+}
+
+static int alloc_percpu_trace_buffer(void)
+{
+ struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct *sirq_buffers;
+ struct trace_buffer_struct *irq_buffers;
+ struct trace_buffer_struct *nmi_buffers;
+
+ buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!buffers)
+ goto err_warn;
+
+ sirq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!sirq_buffers)
+ goto err_sirq;
+
+ irq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!irq_buffers)
+ goto err_irq;
+
+ nmi_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!nmi_buffers)
+ goto err_nmi;
+
+ trace_percpu_buffer = buffers;
+ trace_percpu_sirq_buffer = sirq_buffers;
+ trace_percpu_irq_buffer = irq_buffers;
+ trace_percpu_nmi_buffer = nmi_buffers;
+
+ return 0;
+
+ err_nmi:
+ free_percpu(irq_buffers);
+ err_irq:
+ free_percpu(sirq_buffers);
+ err_sirq:
+ free_percpu(buffers);
+ err_warn:
+ WARN(1, "Could not allocate percpu trace_printk buffer");
+ return -ENOMEM;
+}
+
+static int buffers_allocated;
+
+void trace_printk_init_buffers(void)
+{
+ if (buffers_allocated)
+ return;
+
+ if (alloc_percpu_trace_buffer())
+ return;
+
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warning("\n**********************************************************\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("** **\n");
+ pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warning("** **\n");
+ pr_warning("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** **\n");
+ pr_warning("** If you see this message and you are not debugging **\n");
+ pr_warning("** the kernel, report this immediately to your vendor! **\n");
+ pr_warning("** **\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("**********************************************************\n");
+
+ /* Expand the buffers to set size */
+ tracing_update_buffers();
+
+ buffers_allocated = 1;
+
+ /*
+ * trace_printk_init_buffers() can be called by modules.
+ * If that happens, then we need to start cmdline recording
+ * directly here. If the global_trace.buffer is already
+ * allocated here, then this was called by module code.
+ */
+ if (global_trace.trace_buffer.buffer)
+ tracing_start_cmdline_record();
+}
+
+void trace_printk_start_comm(void)
+{
+ /* Start tracing comms if trace printk is set */
+ if (!buffers_allocated)
+ return;
+ tracing_start_cmdline_record();
+}
+
+static void trace_printk_start_stop_comm(int enabled)
+{
+ if (!buffers_allocated)
+ return;
+
+ if (enabled)
+ tracing_start_cmdline_record();
+ else
+ tracing_stop_cmdline_record();
+}
+
/**
* trace_vbprintk - write binary msg to tracing buffer
*
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- static arch_spinlock_t trace_buf_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
- static u32 trace_buf[TRACE_BUF_SIZE];
-
struct ftrace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
struct bprint_entry *entry;
unsigned long flags;
- int disable;
- int cpu, len = 0, size, pc;
+ char *tbuffer;
+ int len = 0, size, pc;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -1472,43 +2116,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
pc = preempt_count();
preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disable = atomic_inc_return(&data->disabled);
- if (unlikely(disable != 1))
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
goto out;
+ }
- /* Lockdep uses trace_printk for lock tracing */
- local_irq_save(flags);
- arch_spin_lock(&trace_buf_lock);
- len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+ len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
- if (len > TRACE_BUF_SIZE || len < 0)
- goto out_unlock;
+ if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
+ goto out;
+ local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
- buffer = tr->buffer;
+ buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
flags, pc);
if (!event)
- goto out_unlock;
+ goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->fmt = fmt;
- memcpy(entry->buf, trace_buf, sizeof(u32) * len);
- if (!filter_check_discard(call, entry, buffer, event)) {
- ring_buffer_unlock_commit(buffer, event);
+ memcpy(entry->buf, tbuffer, sizeof(u32) * len);
+ if (!call_filter_check_discard(call, entry, buffer, event)) {
+ __buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
}
-out_unlock:
- arch_spin_unlock(&trace_buf_lock);
- local_irq_restore(flags);
-
out:
- atomic_dec_return(&data->disabled);
preempt_enable_notrace();
unpause_graph_tracing();
@@ -1516,80 +2153,95 @@ out:
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
-int trace_array_printk(struct trace_array *tr,
- unsigned long ip, const char *fmt, ...)
-{
- int ret;
- va_list ap;
-
- if (!(trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
- va_start(ap, fmt);
- ret = trace_array_vprintk(tr, ip, fmt, ap);
- va_end(ap);
- return ret;
-}
-
-int trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args)
+static int
+__trace_array_vprintk(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
{
- static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
- static char trace_buf[TRACE_BUF_SIZE];
-
struct ftrace_event_call *call = &event_print;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- struct trace_array_cpu *data;
- int cpu, len = 0, size, pc;
+ int len = 0, size, pc;
struct print_entry *entry;
- unsigned long irq_flags;
- int disable;
+ unsigned long flags;
+ char *tbuffer;
if (tracing_disabled || tracing_selftest_running)
return 0;
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
pc = preempt_count();
preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disable = atomic_inc_return(&data->disabled);
- if (unlikely(disable != 1))
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
goto out;
+ }
- pause_graph_tracing();
- raw_local_irq_save(irq_flags);
- arch_spin_lock(&trace_buf_lock);
- len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+ len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
+ if (len > TRACE_BUF_SIZE)
+ goto out;
+ local_save_flags(flags);
size = sizeof(*entry) + len + 1;
- buffer = tr->buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, pc);
+ flags, pc);
if (!event)
- goto out_unlock;
+ goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
- memcpy(&entry->buf, trace_buf, len);
+ memcpy(&entry->buf, tbuffer, len);
entry->buf[len] = '\0';
- if (!filter_check_discard(call, entry, buffer, event)) {
- ring_buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, irq_flags, 6, pc);
+ if (!call_filter_check_discard(call, entry, buffer, event)) {
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, flags, 6, pc);
}
-
- out_unlock:
- arch_spin_unlock(&trace_buf_lock);
- raw_local_irq_restore(irq_flags);
- unpause_graph_tracing();
out:
- atomic_dec_return(&data->disabled);
preempt_enable_notrace();
+ unpause_graph_tracing();
return len;
}
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
+}
+
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_array_printk_buf(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = __trace_array_vprintk(buffer, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -1598,14 +2250,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
static void trace_iterator_increment(struct trace_iterator *iter)
{
- /* Don't allow ftrace to trace into the ring buffers */
- ftrace_disable_cpu();
+ struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
iter->idx++;
- if (iter->buffer_iter[iter->cpu])
- ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
-
- ftrace_enable_cpu();
+ if (buf_iter)
+ ring_buffer_read(buf_iter, NULL);
}
static struct trace_entry *
@@ -1613,19 +2262,14 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
unsigned long *lost_events)
{
struct ring_buffer_event *event;
- struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
-
- /* Don't allow ftrace to trace into the ring buffers */
- ftrace_disable_cpu();
+ struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
- event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+ event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
lost_events);
- ftrace_enable_cpu();
-
if (event) {
iter->ent_size = ring_buffer_event_length(event);
return ring_buffer_event_data(event);
@@ -1638,19 +2282,20 @@ static struct trace_entry *
__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
unsigned long *missing_events, u64 *ent_ts)
{
- struct ring_buffer *buffer = iter->tr->buffer;
+ struct ring_buffer *buffer = iter->trace_buffer->buffer;
struct trace_entry *ent, *next = NULL;
unsigned long lost_events = 0, next_lost = 0;
int cpu_file = iter->cpu_file;
u64 next_ts = 0, ts;
int next_cpu = -1;
+ int next_size = 0;
int cpu;
/*
* If we are in a per_cpu trace file, don't bother by iterating over
* all cpu and peek directly.
*/
- if (cpu_file > TRACE_PIPE_ALL_CPU) {
+ if (cpu_file > RING_BUFFER_ALL_CPUS) {
if (ring_buffer_empty_cpu(buffer, cpu_file))
return NULL;
ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1675,9 +2320,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
next_cpu = cpu;
next_ts = ts;
next_lost = lost_events;
+ next_size = iter->ent_size;
}
}
+ iter->ent_size = next_size;
+
if (ent_cpu)
*ent_cpu = next_cpu;
@@ -1711,11 +2359,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
static void trace_consume(struct trace_iterator *iter)
{
- /* Don't allow ftrace to trace into the ring buffers */
- ftrace_disable_cpu();
- ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+ ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
&iter->lost_events);
- ftrace_enable_cpu();
}
static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1747,18 +2392,17 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
- struct trace_array *tr = iter->tr;
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter;
unsigned long entries = 0;
u64 ts;
- tr->data[cpu]->skipped_entries = 0;
+ per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
- if (!iter->buffer_iter[cpu])
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (!buf_iter)
return;
- buf_iter = iter->buffer_iter[cpu];
ring_buffer_iter_reset(buf_iter);
/*
@@ -1767,13 +2411,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* by the timestamp being before the start of the buffer.
*/
while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
- if (ts >= iter->tr->time_start)
+ if (ts >= iter->trace_buffer->time_start)
break;
entries++;
ring_buffer_read(buf_iter, NULL);
}
- tr->data[cpu]->skipped_entries = entries;
+ per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
}
/*
@@ -1783,37 +2427,42 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct trace_iterator *iter = m->private;
- static struct tracer *old_tracer;
+ struct trace_array *tr = iter->tr;
int cpu_file = iter->cpu_file;
void *p = NULL;
loff_t l = 0;
int cpu;
- /* copy the tracer to avoid using a global lock all around */
+ /*
+ * copy the tracer to avoid using a global lock all around.
+ * iter->trace is a copy of current_trace, the pointer to the
+ * name may be used instead of a strcmp(), as iter->trace->name
+ * will point to the same string as current_trace->name.
+ */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
- *iter->trace = *current_trace;
- }
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
mutex_unlock(&trace_types_lock);
- atomic_inc(&trace_record_cmdline_disabled);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return ERR_PTR(-EBUSY);
+#endif
+
+ if (!iter->snapshot)
+ atomic_inc(&trace_record_cmdline_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
iter->idx = -1;
- ftrace_disable_cpu();
-
- if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu)
tracing_iter_reset(iter, cpu);
} else
tracing_iter_reset(iter, cpu_file);
- ftrace_enable_cpu();
-
iter->leftover = 0;
for (p = iter; p && l < *pos; p = s_next(m, p, &l))
;
@@ -1840,13 +2489,21 @@ static void s_stop(struct seq_file *m, void *p)
{
struct trace_iterator *iter = m->private;
- atomic_dec(&trace_record_cmdline_disabled);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return;
+#endif
+
+ if (!iter->snapshot)
+ atomic_dec(&trace_record_cmdline_disabled);
+
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
static void
-get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
+get_total_entries(struct trace_buffer *buf,
+ unsigned long *total, unsigned long *entries)
{
unsigned long count;
int cpu;
@@ -1855,19 +2512,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
*entries = 0;
for_each_tracing_cpu(cpu) {
- count = ring_buffer_entries_cpu(tr->buffer, cpu);
+ count = ring_buffer_entries_cpu(buf->buffer, cpu);
/*
* If this buffer has skipped entries, then we hold all
* entries for the trace and we need to ignore the
* ones before the time stamp.
*/
- if (tr->data[cpu]->skipped_entries) {
- count -= tr->data[cpu]->skipped_entries;
+ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
+ count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
/* total is the same as the entries */
*total += count;
} else
*total += count +
- ring_buffer_overrun_cpu(tr->buffer, cpu);
+ ring_buffer_overrun_cpu(buf->buffer, cpu);
*entries += count;
}
}
@@ -1884,27 +2541,27 @@ static void print_lat_help_header(struct seq_file *m)
seq_puts(m, "# \\ / ||||| \\ | / \n");
}
-static void print_event_info(struct trace_array *tr, struct seq_file *m)
+static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
{
unsigned long total;
unsigned long entries;
- get_total_entries(tr, &total, &entries);
+ get_total_entries(buf, &total, &entries);
seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
entries, total, num_online_cpus());
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
{
- print_event_info(tr, m);
+ print_event_info(buf, m);
seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
seq_puts(m, "# | | | | |\n");
}
-static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
- print_event_info(tr, m);
+ print_event_info(buf, m);
seq_puts(m, "# _-----=> irqs-off\n");
seq_puts(m, "# / _----=> need-resched\n");
seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -1918,17 +2575,16 @@ void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_array *tr = iter->tr;
- struct trace_array_cpu *data = tr->data[tr->cpu];
- struct tracer *type = current_trace;
+ struct trace_buffer *buf = iter->trace_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
+ struct tracer *type = iter->trace;
unsigned long entries;
unsigned long total;
const char *name = "preemption";
- if (type)
- name = type->name;
+ name = type->name;
- get_total_entries(tr, &total, &entries);
+ get_total_entries(buf, &total, &entries);
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
@@ -1939,7 +2595,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
nsecs_to_usecs(data->saved_latency),
entries,
total,
- tr->cpu,
+ buf->cpu,
#if defined(CONFIG_PREEMPT_NONE)
"server",
#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -1959,7 +2615,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
seq_puts(m, "# -----------------\n");
seq_printf(m, "# | task: %.16s-%d "
"(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
- data->comm, data->pid, data->uid, data->nice,
+ data->comm, data->pid,
+ from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
data->policy, data->rt_priority);
seq_puts(m, "# -----------------\n");
@@ -1989,7 +2646,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
if (cpumask_test_cpu(iter->cpu, iter->started))
return;
- if (iter->tr->data[iter->cpu]->skipped_entries)
+ if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
cpumask_set_cpu(iter->cpu, iter->started);
@@ -2108,27 +2765,30 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
int trace_empty(struct trace_iterator *iter)
{
+ struct ring_buffer_iter *buf_iter;
int cpu;
/* If we are looking at one CPU buffer, only check that one */
- if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
cpu = iter->cpu_file;
- if (iter->buffer_iter[cpu]) {
- if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (buf_iter) {
+ if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
return 0;
}
return 1;
}
for_each_tracing_cpu(cpu) {
- if (iter->buffer_iter[cpu]) {
- if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (buf_iter) {
+ if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
return 0;
}
}
@@ -2152,6 +2812,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
return ret;
}
+ if (iter->ent->type == TRACE_BPUTS &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return trace_print_bputs_msg_only(iter);
+
if (iter->ent->type == TRACE_BPRINT &&
trace_flags & TRACE_ITER_PRINTK &&
trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2206,9 +2871,9 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->tr, m);
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->tr, m);
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -2221,6 +2886,50 @@ static void test_ftrace_alive(struct seq_file *m)
seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+static void show_snapshot_main_help(struct seq_file *m)
+{
+ seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
+ seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+ seq_printf(m, "# Takes a snapshot of the main buffer.\n");
+ seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
+ seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
+ seq_printf(m, "# is not a '0' or '1')\n");
+}
+
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+ seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+ seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
+#else
+ seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
+ seq_printf(m, "# Must use main snapshot file to allocate.\n");
+#endif
+ seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
+ seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
+ seq_printf(m, "# is not a '0' or '1')\n");
+}
+
+static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ if (iter->tr->allocated_snapshot)
+ seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+ else
+ seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+
+ seq_printf(m, "# Snapshot commands:\n");
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ show_snapshot_main_help(m);
+ else
+ show_snapshot_percpu_help(m);
+}
+#else
+/* Should never be called */
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
+#endif
+
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -2232,7 +2941,9 @@ static int s_show(struct seq_file *m, void *v)
seq_puts(m, "#\n");
test_ftrace_alive(m);
}
- if (iter->trace && iter->trace->print_header)
+ if (iter->snapshot && trace_empty(iter))
+ print_snapshot_help(m, iter);
+ else if (iter->trace && iter->trace->print_header)
iter->trace->print_header(m);
else
trace_default_header(m);
@@ -2263,6 +2974,17 @@ static int s_show(struct seq_file *m, void *v)
return 0;
}
+/*
+ * Should be used after trace_array_get(), trace_types_lock
+ * ensures that i_cdev was already initialized.
+ */
+static inline int tracing_get_cpu(struct inode *inode)
+{
+ if (inode->i_cdev) /* See trace_create_cpu_file() */
+ return (long)inode->i_cdev - 1;
+ return RING_BUFFER_ALL_CPUS;
+}
+
static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
@@ -2271,21 +2993,24 @@ static const struct seq_operations tracer_seq_ops = {
};
static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file)
+__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
- long cpu_file = (long) inode->i_private;
- void *fail_ret = ERR_PTR(-ENOMEM);
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
- struct seq_file *m;
- int cpu, ret;
+ int cpu;
if (tracing_disabled)
return ERR_PTR(-ENODEV);
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
if (!iter)
return ERR_PTR(-ENOMEM);
+ iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+ GFP_KERNEL);
+ if (!iter->buffer_iter)
+ goto release;
+
/*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
@@ -2295,35 +3020,45 @@ __tracing_open(struct inode *inode, struct file *file)
if (!iter->trace)
goto fail;
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *tr->current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
- if (current_trace && current_trace->print_max)
- iter->tr = &max_tr;
+ iter->tr = tr;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Currently only the top directory has a snapshot */
+ if (tr->current_trace->print_max || snapshot)
+ iter->trace_buffer = &tr->max_buffer;
else
- iter->tr = &global_trace;
+#endif
+ iter->trace_buffer = &tr->trace_buffer;
+ iter->snapshot = snapshot;
iter->pos = -1;
+ iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
- iter->cpu_file = cpu_file;
/* Notify the tracer early; before we stop tracing. */
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->tr->buffer))
+ if (ring_buffer_overruns(iter->trace_buffer->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
- /* stop the trace while dumping */
- tracing_stop();
+ /* Output in nanoseconds only if we are using a clock in nanoseconds. */
+ if (trace_clocks[tr->clock_id].in_ns)
+ iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ /* stop the trace while dumping if we are not opening "snapshot" */
+ if (!iter->snapshot)
+ tracing_stop_tr(tr);
- if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
}
ring_buffer_read_prepare_sync();
for_each_tracing_cpu(cpu) {
@@ -2333,38 +3068,23 @@ __tracing_open(struct inode *inode, struct file *file)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
}
- ret = seq_open(file, &tracer_seq_ops);
- if (ret < 0) {
- fail_ret = ERR_PTR(ret);
- goto fail_buffer;
- }
-
- m = file->private_data;
- m->private = iter;
-
mutex_unlock(&trace_types_lock);
return iter;
- fail_buffer:
- for_each_tracing_cpu(cpu) {
- if (iter->buffer_iter[cpu])
- ring_buffer_read_finish(iter->buffer_iter[cpu]);
- }
- free_cpumask_var(iter->started);
- tracing_start();
fail:
mutex_unlock(&trace_types_lock);
kfree(iter->trace);
- kfree(iter);
-
- return fail_ret;
+ kfree(iter->buffer_iter);
+release:
+ seq_release_private(inode, file);
+ return ERR_PTR(-ENOMEM);
}
int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2376,18 +3096,46 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
return 0;
}
+bool tracing_is_disabled(void)
+{
+ return (tracing_disabled) ? true: false;
+}
+
+/*
+ * Open and update trace_array ref count.
+ * Must have the current trace_array passed to it.
+ */
+static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
static int tracing_release(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
struct seq_file *m = file->private_data;
struct trace_iterator *iter;
int cpu;
- if (!(file->f_mode & FMODE_READ))
+ if (!(file->f_mode & FMODE_READ)) {
+ trace_array_put(tr);
return 0;
+ }
+ /* Writes do not use seq_file */
iter = m->private;
-
mutex_lock(&trace_types_lock);
+
for_each_tracing_cpu(cpu) {
if (iter->buffer_iter[cpu])
ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2396,65 +3144,119 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- /* reenable tracing if it was previously enabled */
- tracing_start();
+ if (!iter->snapshot)
+ /* reenable tracing if it was previously enabled */
+ tracing_start_tr(tr);
+
+ __trace_array_put(tr);
+
mutex_unlock(&trace_types_lock);
- seq_release(inode, file);
mutex_destroy(&iter->mutex);
free_cpumask_var(iter->started);
kfree(iter->trace);
- kfree(iter);
+ kfree(iter->buffer_iter);
+ seq_release_private(inode, file);
+
return 0;
}
+static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+ return 0;
+}
+
+static int tracing_single_release_tr(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ return single_release(inode, file);
+}
+
static int tracing_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
/* If this file was open for write, then erase contents */
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC)) {
- long cpu = (long) inode->i_private;
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ int cpu = tracing_get_cpu(inode);
- if (cpu == TRACE_PIPE_ALL_CPU)
- tracing_reset_online_cpus(&global_trace);
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->trace_buffer);
else
- tracing_reset(&global_trace, cpu);
+ tracing_reset(&tr->trace_buffer, cpu);
}
if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file);
+ iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
else if (trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
}
+
+ if (ret < 0)
+ trace_array_put(tr);
+
return ret;
}
+/*
+ * Some tracers are not suitable for instance buffers.
+ * A tracer is always available for the global array (toplevel)
+ * or if it explicitly states that it is.
+ */
+static bool
+trace_ok_for_array(struct tracer *t, struct trace_array *tr)
+{
+ return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
+}
+
+/* Find the next tracer that this trace array may use */
+static struct tracer *
+get_tracer_for_array(struct trace_array *tr, struct tracer *t)
+{
+ while (t && !trace_ok_for_array(t, tr))
+ t = t->next;
+
+ return t;
+}
+
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
+ struct trace_array *tr = m->private;
struct tracer *t = v;
(*pos)++;
if (t)
- t = t->next;
+ t = get_tracer_for_array(tr, t->next);
return t;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_array *tr = m->private;
struct tracer *t;
loff_t l = 0;
mutex_lock(&trace_types_lock);
- for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
- ;
+
+ t = get_tracer_for_array(tr, trace_types);
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
return t;
}
@@ -2489,10 +3291,21 @@ static const struct seq_operations show_traces_seq_ops = {
static int show_traces_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
if (tracing_disabled)
return -ENODEV;
- return seq_open(file, &show_traces_seq_ops);
+ ret = seq_open(file, &show_traces_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = tr;
+
+ return 0;
}
static ssize_t
@@ -2502,19 +3315,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
return count;
}
-static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
+loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
{
+ int ret;
+
if (file->f_mode & FMODE_READ)
- return seq_lseek(file, offset, origin);
+ ret = seq_lseek(file, offset, whence);
else
- return 0;
+ file->f_pos = ret = 0;
+
+ return ret;
}
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.write = tracing_write_stub,
- .llseek = tracing_seek,
+ .llseek = tracing_lseek,
.release = tracing_release,
};
@@ -2526,11 +3343,6 @@ static const struct file_operations show_traces_fops = {
};
/*
- * Only trace on a CPU if the bitmask is set:
- */
-static cpumask_var_t tracing_cpumask;
-
-/*
* The tracer itself will not take this lock, but still we want
* to provide a consistent cpumask to user-space:
*/
@@ -2546,11 +3358,12 @@ static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
+ struct trace_array *tr = file_inode(filp)->i_private;
int len;
mutex_lock(&tracing_cpumask_update_lock);
- len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+ len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
if (count - len < 2) {
count = -EINVAL;
goto out_err;
@@ -2568,8 +3381,9 @@ static ssize_t
tracing_cpumask_write(struct file *filp, const char __user *ubuf,
size_t count, loff_t *ppos)
{
- int err, cpu;
+ struct trace_array *tr = file_inode(filp)->i_private;
cpumask_var_t tracing_cpumask_new;
+ int err, cpu;
if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
@@ -2581,25 +3395,27 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
mutex_lock(&tracing_cpumask_update_lock);
local_irq_disable();
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
/*
* Increase/decrease the disabled counter if we are
* about to flip a bit in the cpumask:
*/
- if (cpumask_test_cpu(cpu, tracing_cpumask) &&
+ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&global_trace.data[cpu]->disabled);
+ atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+ ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
}
- if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
+ if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&global_trace.data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+ ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
}
}
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&tr->max_lock);
local_irq_enable();
- cpumask_copy(tracing_cpumask, tracing_cpumask_new);
+ cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
mutex_unlock(&tracing_cpumask_update_lock);
free_cpumask_var(tracing_cpumask_new);
@@ -2613,21 +3429,23 @@ err_unlock:
}
static const struct file_operations tracing_cpumask_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_cpumask_read,
.write = tracing_cpumask_write,
+ .release = tracing_release_generic_tr,
.llseek = generic_file_llseek,
};
static int tracing_trace_options_show(struct seq_file *m, void *v)
{
struct tracer_opt *trace_opts;
+ struct trace_array *tr = m->private;
u32 tracer_flags;
int i;
mutex_lock(&trace_types_lock);
- tracer_flags = current_trace->flags->val;
- trace_opts = current_trace->flags->opts;
+ tracer_flags = tr->current_trace->flags->val;
+ trace_opts = tr->current_trace->flags->opts;
for (i = 0; trace_options[i]; i++) {
if (trace_flags & (1 << i))
@@ -2647,13 +3465,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
return 0;
}
-static int __set_tracer_option(struct tracer *trace,
+static int __set_tracer_option(struct trace_array *tr,
struct tracer_flags *tracer_flags,
struct tracer_opt *opts, int neg)
{
+ struct tracer *trace = tr->current_trace;
int ret;
- ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
+ ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
if (ret)
return ret;
@@ -2665,8 +3484,9 @@ static int __set_tracer_option(struct tracer *trace,
}
/* Try to assign a tracer specific option */
-static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
{
+ struct tracer *trace = tr->current_trace;
struct tracer_flags *tracer_flags = trace->flags;
struct tracer_opt *opts = NULL;
int i;
@@ -2675,18 +3495,31 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
opts = &tracer_flags->opts[i];
if (strcmp(cmp, opts->name) == 0)
- return __set_tracer_option(trace, trace->flags,
- opts, neg);
+ return __set_tracer_option(tr, trace->flags, opts, neg);
}
return -EINVAL;
}
-static void set_tracer_flags(unsigned int mask, int enabled)
+/* Some tracers require overwrite to stay enabled */
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+{
+ if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ return -1;
+
+ return 0;
+}
+
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
if (!!(trace_flags & mask) == !!enabled)
- return;
+ return 0;
+
+ /* Give the tracer a chance to approve the change */
+ if (tr->current_trace->flag_changed)
+ if (tr->current_trace->flag_changed(tr, mask, !!enabled))
+ return -EINVAL;
if (enabled)
trace_flags |= mask;
@@ -2696,49 +3529,71 @@ static void set_tracer_flags(unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
- if (mask == TRACE_ITER_OVERWRITE)
- ring_buffer_change_overwrite(global_trace.buffer, enabled);
+ if (mask == TRACE_ITER_OVERWRITE) {
+ ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
+#endif
+ }
+
+ if (mask == TRACE_ITER_PRINTK)
+ trace_printk_start_stop_comm(enabled);
+
+ return 0;
}
-static ssize_t
-tracing_trace_options_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int trace_set_options(struct trace_array *tr, char *option)
{
- char buf[64];
char *cmp;
int neg = 0;
- int ret;
+ int ret = -ENODEV;
int i;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ cmp = strstrip(option);
if (strncmp(cmp, "no", 2) == 0) {
neg = 1;
cmp += 2;
}
+ mutex_lock(&trace_types_lock);
+
for (i = 0; trace_options[i]; i++) {
if (strcmp(cmp, trace_options[i]) == 0) {
- set_tracer_flags(1 << i, !neg);
+ ret = set_tracer_flag(tr, 1 << i, !neg);
break;
}
}
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i]) {
- mutex_lock(&trace_types_lock);
- ret = set_tracer_option(current_trace, cmp, neg);
- mutex_unlock(&trace_types_lock);
- if (ret)
- return ret;
- }
+ if (!trace_options[i])
+ ret = set_tracer_option(tr, cmp, neg);
+
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_trace_options_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ char buf[64];
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = trace_set_options(tr, buf);
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -2747,35 +3602,156 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
static int tracing_trace_options_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
if (tracing_disabled)
return -ENODEV;
- return single_open(file, tracing_trace_options_show, NULL);
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ ret = single_open(file, tracing_trace_options_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
}
static const struct file_operations tracing_iter_fops = {
.open = tracing_trace_options_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_tr,
.write = tracing_trace_options_write,
};
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
- "# mount -t debugfs nodev /sys/kernel/debug\n\n"
- "# cat /sys/kernel/debug/tracing/available_tracers\n"
- "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
- "# cat /sys/kernel/debug/tracing/current_tracer\n"
- "nop\n"
- "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
- "# cat /sys/kernel/debug/tracing/current_tracer\n"
- "sched_switch\n"
- "# cat /sys/kernel/debug/tracing/trace_options\n"
- "noprint-parent nosym-offset nosym-addr noverbose\n"
- "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
- "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
- "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
- "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
+ "# echo 0 > tracing_on : quick way to disable tracing\n"
+ "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
+ " Important files:\n"
+ " trace\t\t\t- The static contents of the buffer\n"
+ "\t\t\t To clear the buffer write into this file: echo > trace\n"
+ " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
+ " current_tracer\t- function and latency tracers\n"
+ " available_tracers\t- list of configured tracers for current_tracer\n"
+ " buffer_size_kb\t- view and modify size of per cpu buffer\n"
+ " buffer_total_size_kb - view total size of all cpu buffers\n\n"
+ " trace_clock\t\t-change the clock used to order events\n"
+ " local: Per cpu clock but may not be synced across CPUs\n"
+ " global: Synced across CPUs but slows tracing down.\n"
+ " counter: Not a clock, but just an increment\n"
+ " uptime: Jiffy counter from time of boot\n"
+ " perf: Same clock that perf events use\n"
+#ifdef CONFIG_X86_64
+ " x86-tsc: TSC cycle counter\n"
+#endif
+ "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
+ " tracing_cpumask\t- Limit which CPUs to trace\n"
+ " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
+ "\t\t\t Remove sub-buffer with rmdir\n"
+ " trace_options\t\t- Set format or modify how tracing happens\n"
+ "\t\t\t Disable an option by adding a suffix 'no' to the\n"
+ "\t\t\t option name\n"
+ " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+ "\n available_filter_functions - list of functions that can be filtered on\n"
+ " set_ftrace_filter\t- echo function name in here to only trace these\n"
+ "\t\t\t functions\n"
+ "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ "\t modules: Can select a group via module\n"
+ "\t Format: :mod:<module-name>\n"
+ "\t example: echo :mod:ext3 > set_ftrace_filter\n"
+ "\t triggers: a command to perform when function is hit\n"
+ "\t Format: <function>:<trigger>[:count]\n"
+ "\t trigger: traceon, traceoff\n"
+ "\t\t enable_event:<system>:<event>\n"
+ "\t\t disable_event:<system>:<event>\n"
+#ifdef CONFIG_STACKTRACE
+ "\t\t stacktrace\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\t\t snapshot\n"
+#endif
+ "\t\t dump\n"
+ "\t\t cpudump\n"
+ "\t example: echo do_fault:traceoff > set_ftrace_filter\n"
+ "\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
+ "\t The first one will disable tracing every time do_fault is hit\n"
+ "\t The second will disable tracing at most 3 times when do_trap is hit\n"
+ "\t The first time do trap is hit and it disables tracing, the\n"
+ "\t counter will decrement to 2. If tracing is already disabled,\n"
+ "\t the counter will not decrement. It only decrements when the\n"
+ "\t trigger did work\n"
+ "\t To remove trigger without count:\n"
+ "\t echo '!<function>:<trigger> > set_ftrace_filter\n"
+ "\t To remove trigger with a count:\n"
+ "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
+ " set_ftrace_notrace\t- echo function name in here to never trace.\n"
+ "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ "\t modules: Can select a group via module command :mod:\n"
+ "\t Does not accept triggers\n"
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#ifdef CONFIG_FUNCTION_TRACER
+ " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
+ "\t\t (function)\n"
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\n snapshot\t\t- Like 'trace' but shows the content of the static\n"
+ "\t\t\t snapshot buffer. Read the contents for more\n"
+ "\t\t\t information\n"
+#endif
+#ifdef CONFIG_STACK_TRACER
+ " stack_trace\t\t- Shows the max stack trace when active\n"
+ " stack_max_size\t- Shows current max stack size that was traced\n"
+ "\t\t\t Write into this file to reset the max size (trigger a\n"
+ "\t\t\t new trace)\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+ " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n"
+ "\t\t\t traces\n"
+#endif
+#endif /* CONFIG_STACK_TRACER */
+ " events/\t\t- Directory containing all trace event subsystems:\n"
+ " enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
+ " events/<system>/\t- Directory containing all trace events for <system>:\n"
+ " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n"
+ "\t\t\t events\n"
+ " filter\t\t- If set, only events passing filter are traced\n"
+ " events/<system>/<event>/\t- Directory containing control files for\n"
+ "\t\t\t <event>:\n"
+ " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n"
+ " filter\t\t- If set, only events passing filter are traced\n"
+ " trigger\t\t- If set, a command to perform when event is hit\n"
+ "\t Format: <trigger>[:count][if <filter>]\n"
+ "\t trigger: traceon, traceoff\n"
+ "\t enable_event:<system>:<event>\n"
+ "\t disable_event:<system>:<event>\n"
+#ifdef CONFIG_STACKTRACE
+ "\t\t stacktrace\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\t\t snapshot\n"
+#endif
+ "\t example: echo traceoff > events/block/block_unplug/trigger\n"
+ "\t echo traceoff:3 > events/block/block_unplug/trigger\n"
+ "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
+ "\t events/block/block_unplug/trigger\n"
+ "\t The first disables tracing every time block_unplug is hit.\n"
+ "\t The second disables tracing the first 3 times block_unplug is hit.\n"
+ "\t The third enables the kmalloc event the first 3 times block_unplug\n"
+ "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n"
+ "\t Like function triggers, the counter is only decremented if it\n"
+ "\t enabled or disabled tracing.\n"
+ "\t To remove a trigger without a count:\n"
+ "\t echo '!<trigger> > <system>/<event>/trigger\n"
+ "\t To remove a trigger with a count:\n"
+ "\t echo '!<trigger>:0 > <system>/<event>/trigger\n"
+ "\t Filters can be ignored when removing a trigger.\n"
;
static ssize_t
@@ -2792,73 +3768,129 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
-static ssize_t
-tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
- char *buf_comm;
- char *file_buf;
- char *buf;
- int len = 0;
- int pid;
- int i;
+ unsigned int *ptr = v;
- file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
- if (!file_buf)
- return -ENOMEM;
+ if (*pos || m->count)
+ ptr++;
- buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
- if (!buf_comm) {
- kfree(file_buf);
- return -ENOMEM;
+ (*pos)++;
+
+ for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
+ ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
+ continue;
+
+ return ptr;
}
- buf = file_buf;
+ return NULL;
+}
- for (i = 0; i < SAVED_CMDLINES; i++) {
- int r;
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
- pid = map_cmdline_to_pid[i];
- if (pid == -1 || pid == NO_CMDLINE_MAP)
- continue;
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
- trace_find_cmdline(pid, buf_comm);
- r = sprintf(buf, "%d %s\n", pid, buf_comm);
- buf += r;
- len += r;
+ v = &savedcmd->map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
}
- len = simple_read_from_buffer(ubuf, cnt, ppos,
- file_buf, len);
+ return v;
+}
- kfree(file_buf);
- kfree(buf_comm);
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
- return len;
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}
static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_open_generic,
- .read = tracing_saved_cmdlines_read,
- .llseek = generic_file_llseek,
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
static ssize_t
-tracing_ctrl_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
char buf[64];
int r;
- r = sprintf(buf, "%u\n", tracer_enabled);
+ arch_spin_lock(&trace_cmdline_lock);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ arch_spin_unlock(&trace_cmdline_lock);
+
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ kfree(s->saved_cmdlines);
+ kfree(s->map_cmdline_to_pid);
+ kfree(s);
+}
+
+static int tracing_resize_saved_cmdlines(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s, *savedcmd_temp;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ if (allocate_cmdlines_buffer(val, s) < 0) {
+ kfree(s);
+ return -ENOMEM;
+ }
+
+ arch_spin_lock(&trace_cmdline_lock);
+ savedcmd_temp = savedcmd;
+ savedcmd = s;
+ arch_spin_unlock(&trace_cmdline_lock);
+ free_saved_cmdlines_buffer(savedcmd_temp);
+
+ return 0;
+}
+
static ssize_t
-tracing_ctrl_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- struct trace_array *tr = filp->private_data;
unsigned long val;
int ret;
@@ -2866,45 +3898,35 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
if (ret)
return ret;
- val = !!val;
-
- mutex_lock(&trace_types_lock);
- if (tracer_enabled ^ val) {
-
- /* Only need to warn if this is used to change the state */
- WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
+ /* must have at least 1 entry or less than PID_MAX_DEFAULT */
+ if (!val || val > PID_MAX_DEFAULT)
+ return -EINVAL;
- if (val) {
- tracer_enabled = 1;
- if (current_trace->start)
- current_trace->start(tr);
- tracing_start();
- } else {
- tracer_enabled = 0;
- tracing_stop();
- if (current_trace->stop)
- current_trace->stop(tr);
- }
- }
- mutex_unlock(&trace_types_lock);
+ ret = tracing_resize_saved_cmdlines((unsigned int)val);
+ if (ret < 0)
+ return ret;
*ppos += cnt;
return cnt;
}
+static const struct file_operations tracing_saved_cmdlines_size_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_size_read,
+ .write = tracing_saved_cmdlines_size_write,
+};
+
static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+2];
int r;
mutex_lock(&trace_types_lock);
- if (current_trace)
- r = sprintf(buf, "%s\n", current_trace->name);
- else
- r = sprintf(buf, "\n");
+ r = sprintf(buf, "%s\n", tr->current_trace->name);
mutex_unlock(&trace_types_lock);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -2912,11 +3934,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int tracer_init(struct tracer *t, struct trace_array *tr)
{
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
return t->init(tr);
}
-static int __tracing_resize_ring_buffer(unsigned long size)
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
+{
+ int cpu;
+
+ for_each_tracing_cpu(cpu)
+ per_cpu_ptr(buf->data, cpu)->entries = val;
+}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+/* resize @tr's buffer to the size of @size_tr's entries */
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+ struct trace_buffer *size_buf, int cpu_id)
+{
+ int cpu, ret = 0;
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
+ if (ret < 0)
+ break;
+ per_cpu_ptr(trace_buf->data, cpu)->entries =
+ per_cpu_ptr(size_buf->data, cpu)->entries;
+ }
+ } else {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
+ if (ret == 0)
+ per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+ per_cpu_ptr(size_buf->data, cpu_id)->entries;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+static int __tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu)
{
int ret;
@@ -2925,21 +3984,25 @@ static int __tracing_resize_ring_buffer(unsigned long size)
* we use the size that was given, and we can forget about
* expanding it later.
*/
- ring_buffer_expanded = 1;
+ ring_buffer_expanded = true;
- ret = ring_buffer_resize(global_trace.buffer, size);
+ /* May be called before buffers are initialized */
+ if (!tr->trace_buffer.buffer)
+ return 0;
+
+ ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
if (ret < 0)
return ret;
- if (!current_trace->use_max_tr)
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
+ !tr->current_trace->use_max_tr)
goto out;
- ret = ring_buffer_resize(max_tr.buffer, size);
+ ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
if (ret < 0) {
- int r;
-
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.entries);
+ int r = resize_buffer_duplicate_size(&tr->trace_buffer,
+ &tr->trace_buffer, cpu);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -2961,43 +4024,42 @@ static int __tracing_resize_ring_buffer(unsigned long size)
return ret;
}
- max_tr.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&tr->max_buffer, size);
+ else
+ per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
+
out:
- global_trace.entries = size;
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&tr->trace_buffer, size);
+ else
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
return ret;
}
-static ssize_t tracing_resize_ring_buffer(unsigned long size)
+static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id)
{
- int cpu, ret = size;
+ int ret = size;
mutex_lock(&trace_types_lock);
- tracing_stop();
-
- /* disable all cpu buffers */
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_inc(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_inc(&max_tr.data[cpu]->disabled);
+ if (cpu_id != RING_BUFFER_ALL_CPUS) {
+ /* make sure, this cpu is enabled in the mask */
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
}
- if (size != global_trace.entries)
- ret = __tracing_resize_ring_buffer(size);
-
+ ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
if (ret < 0)
ret = -ENOMEM;
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_dec(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_dec(&max_tr.data[cpu]->disabled);
- }
-
- tracing_start();
+out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -3020,7 +4082,8 @@ int tracing_update_buffers(void)
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);
return ret;
@@ -3029,22 +4092,42 @@ int tracing_update_buffers(void)
struct trace_option_dentry;
static struct trace_option_dentry *
-create_trace_option_files(struct tracer *tracer);
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
static void
destroy_trace_option_files(struct trace_option_dentry *topts);
-static int tracing_set_tracer(const char *buf)
+/*
+ * Used to clear out the tracer before deletion of an instance.
+ * Must have trace_types_lock held.
+ */
+static void tracing_set_nop(struct trace_array *tr)
+{
+ if (tr->current_trace == &nop_trace)
+ return;
+
+ tr->current_trace->enabled--;
+
+ if (tr->current_trace->reset)
+ tr->current_trace->reset(tr);
+
+ tr->current_trace = &nop_trace;
+}
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
static struct trace_option_dentry *topts;
- struct trace_array *tr = &global_trace;
struct tracer *t;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ bool had_max_tr;
+#endif
int ret = 0;
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded) {
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
ret = 0;
@@ -3058,32 +4141,53 @@ static int tracing_set_tracer(const char *buf)
ret = -EINVAL;
goto out;
}
- if (t == current_trace)
+ if (t == tr->current_trace)
goto out;
+ /* Some tracers are only allowed for the top level buffer */
+ if (!trace_ok_for_array(t, tr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trace_branch_disable();
- if (current_trace && current_trace->reset)
- current_trace->reset(tr);
- if (current_trace && current_trace->use_max_tr) {
+
+ tr->current_trace->enabled--;
+
+ if (tr->current_trace->reset)
+ tr->current_trace->reset(tr);
+
+ /* Current trace needs to be nop_trace before synchronize_sched */
+ tr->current_trace = &nop_trace;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ had_max_tr = tr->allocated_snapshot;
+
+ if (had_max_tr && !t->use_max_tr) {
/*
- * We don't free the ring buffer. instead, resize it because
- * The max_tr ring buffer has some state (e.g. ring->clock) and
- * we want preserve it.
+ * We need to make sure that the update_max_tr sees that
+ * current_trace changed to nop_trace to keep it from
+ * swapping the buffers after we resize it.
+ * The update_max_tr is called from interrupts disabled
+ * so a synchronized_sched() is sufficient.
*/
- ring_buffer_resize(max_tr.buffer, 1);
- max_tr.entries = 1;
+ synchronize_sched();
+ free_snapshot(tr);
+ }
+#endif
+ /* Currently, only the top instance has options */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+ destroy_trace_option_files(topts);
+ topts = create_trace_option_files(tr, t);
}
- destroy_trace_option_files(topts);
-
- current_trace = t;
- topts = create_trace_option_files(current_trace);
- if (current_trace->use_max_tr) {
- ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (t->use_max_tr && !had_max_tr) {
+ ret = alloc_snapshot(tr);
if (ret < 0)
goto out;
- max_tr.entries = global_trace.entries;
}
+#endif
if (t->init) {
ret = tracer_init(t, tr);
@@ -3091,6 +4195,8 @@ static int tracing_set_tracer(const char *buf)
goto out;
}
+ tr->current_trace = t;
+ tr->current_trace->enabled++;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
@@ -3102,6 +4208,7 @@ static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
int i;
size_t ret;
@@ -3121,7 +4228,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
buf[i] = 0;
- err = tracing_set_tracer(buf);
+ err = tracing_set_tracer(tr, buf);
if (err)
return err;
@@ -3164,19 +4271,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
- long cpu_file = (long) inode->i_private;
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret = 0;
if (tracing_disabled)
return -ENODEV;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
mutex_lock(&trace_types_lock);
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
+ __trace_array_put(tr);
goto out;
}
@@ -3189,8 +4300,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
ret = -ENOMEM;
goto fail;
}
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *tr->current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
ret = -ENOMEM;
@@ -3203,8 +4313,13 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
- iter->cpu_file = cpu_file;
- iter->tr = &global_trace;
+ /* Output in nanoseconds only if we are using a clock in nanoseconds. */
+ if (trace_clocks[tr->clock_id].in_ns)
+ iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ iter->tr = tr;
+ iter->trace_buffer = &tr->trace_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -3219,6 +4334,7 @@ out:
fail:
kfree(iter->trace);
kfree(iter);
+ __trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
}
@@ -3226,6 +4342,7 @@ fail:
static int tracing_release_pipe(struct inode *inode, struct file *file)
{
struct trace_iterator *iter = file->private_data;
+ struct trace_array *tr = inode->i_private;
mutex_lock(&trace_types_lock);
@@ -3239,66 +4356,41 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
kfree(iter->trace);
kfree(iter);
+ trace_array_put(tr);
+
return 0;
}
static unsigned int
-tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
- struct trace_iterator *iter = filp->private_data;
+ /* Iterators are static, they should be filled or empty */
+ if (trace_buffer_iter(iter, iter->cpu_file))
+ return POLLIN | POLLRDNORM;
- if (trace_flags & TRACE_ITER_BLOCK) {
+ if (trace_flags & TRACE_ITER_BLOCK)
/*
* Always select as readable when in blocking mode
*/
return POLLIN | POLLRDNORM;
- } else {
- if (!trace_empty(iter))
- return POLLIN | POLLRDNORM;
- poll_wait(filp, &trace_wait, poll_table);
- if (!trace_empty(iter))
- return POLLIN | POLLRDNORM;
-
- return 0;
- }
+ else
+ return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ filp, poll_table);
}
-
-void default_wait_pipe(struct trace_iterator *iter)
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
{
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
-
- if (trace_empty(iter))
- schedule();
-
- finish_wait(&trace_wait, &wait);
-}
+ struct trace_iterator *iter = filp->private_data;
-/*
- * This is a make-shift waitqueue.
- * A tracer might use this callback on some rare cases:
- *
- * 1) the current tracer might hold the runqueue lock when it wakes up
- * a reader, hence a deadlock (sched, function, and function graph tracers)
- * 2) the function tracers, trace all functions, we don't want
- * the overhead of calling wake_up and friends
- * (and tracing them too)
- *
- * Anyway, this is really very primitive wakeup.
- */
-void poll_wait_pipe(struct trace_iterator *iter)
-{
- set_current_state(TASK_INTERRUPTIBLE);
- /* sleep for 100 msecs, and try again. */
- schedule_timeout(HZ / 10);
+ return trace_poll(iter, filp, poll_table);
}
/* Must be called with trace_types_lock mutex held. */
static int tracing_wait_pipe(struct file *filp)
{
struct trace_iterator *iter = filp->private_data;
+ int ret;
while (trace_empty(iter)) {
@@ -3306,15 +4398,6 @@ static int tracing_wait_pipe(struct file *filp)
return -EAGAIN;
}
- mutex_unlock(&iter->mutex);
-
- iter->trace->wait_pipe(iter);
-
- mutex_lock(&iter->mutex);
-
- if (signal_pending(current))
- return -EINTR;
-
/*
* We block until we read something and tracing is disabled.
* We still block if tracing is disabled, but we have never
@@ -3324,8 +4407,20 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracer_enabled && iter->pos)
+ if (!tracing_is_on() && iter->pos)
break;
+
+ mutex_unlock(&iter->mutex);
+
+ ret = wait_on_pipe(iter);
+
+ mutex_lock(&iter->mutex);
+
+ if (ret)
+ return ret;
+
+ if (signal_pending(current))
+ return -EINTR;
}
return 1;
@@ -3339,7 +4434,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- static struct tracer *old_tracer;
+ struct trace_array *tr = iter->tr;
ssize_t sret;
/* return any leftover data */
@@ -3351,10 +4446,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
- *iter->trace = *current_trace;
- }
+ if (unlikely(iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
mutex_unlock(&trace_types_lock);
/*
@@ -3387,6 +4480,7 @@ waitagain:
memset(&iter->seq, 0,
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
+ cpumask_clear(iter->started);
iter->pos = -1;
trace_event_read_lock();
@@ -3436,12 +4530,6 @@ out:
return sret;
}
-static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- __free_page(buf->page);
-}
-
static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
unsigned int idx)
{
@@ -3450,10 +4538,8 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
static const struct pipe_buf_operations tracing_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
- .release = tracing_pipe_buf_release,
+ .release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
@@ -3505,11 +4591,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.pages = pages_def,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
- static struct tracer *old_tracer;
+ struct trace_array *tr = iter->tr;
ssize_t ret;
size_t rem;
unsigned int i;
@@ -3519,10 +4606,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
- *iter->trace = *current_trace;
- }
+ if (unlikely(iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
mutex_unlock(&trace_types_lock);
mutex_lock(&iter->mutex);
@@ -3547,7 +4632,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
spd.pages[i] = alloc_page(GFP_KERNEL);
if (!spd.pages[i])
break;
@@ -3576,7 +4661,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
ret = splice_to_pipe(pipe, &spd);
out:
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
return ret;
out_err:
@@ -3588,26 +4673,56 @@ static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_array *tr = filp->private_data;
- char buf[96];
- int r;
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ char buf[64];
+ int r = 0;
+ ssize_t ret;
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- r = sprintf(buf, "%lu (expanded: %lu)\n",
- tr->entries >> 10,
- trace_buf_size >> 10);
- else
- r = sprintf(buf, "%lu\n", tr->entries >> 10);
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ int cpu, buf_size_same;
+ unsigned long size;
+
+ size = 0;
+ buf_size_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_tracing_cpu(cpu) {
+ /* fill in the size from first enabled cpu */
+ if (size == 0)
+ size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
+ if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
+ buf_size_same = 0;
+ break;
+ }
+ }
+
+ if (buf_size_same) {
+ if (!ring_buffer_expanded)
+ r = sprintf(buf, "%lu (expanded: %lu)\n",
+ size >> 10,
+ trace_buf_size >> 10);
+ else
+ r = sprintf(buf, "%lu\n", size >> 10);
+ } else
+ r = sprintf(buf, "X\n");
+ } else
+ r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
+
mutex_unlock(&trace_types_lock);
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ return ret;
}
static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
unsigned long val;
int ret;
@@ -3621,8 +4736,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
/* value is in KB */
val <<= 10;
-
- ret = tracing_resize_ring_buffer(val);
+ ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
if (ret < 0)
return ret;
@@ -3642,7 +4756,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += tr->entries >> 10;
+ size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -3672,11 +4786,15 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
static int
tracing_free_buffer_release(struct inode *inode, struct file *filp)
{
+ struct trace_array *tr = inode->i_private;
+
/* disable tracing ? */
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
- tracing_off();
+ tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
- tracing_resize_ring_buffer(0);
+ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
+
+ trace_array_put(tr);
return 0;
}
@@ -3686,23 +4804,27 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
unsigned long addr = (unsigned long)ubuf;
+ struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
struct page *pages[2];
+ void *map_page[2];
int nr_pages = 1;
ssize_t written;
- void *page1;
- void *page2;
int offset;
int size;
int len;
int ret;
+ int i;
if (tracing_disabled)
return -EINVAL;
+ if (!(trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
@@ -3737,13 +4859,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
goto out;
}
- page1 = kmap_atomic(pages[0]);
- if (nr_pages == 2)
- page2 = kmap_atomic(pages[1]);
+ for (i = 0; i < nr_pages; i++)
+ map_page[i] = kmap_atomic(pages[i]);
local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* possible \n added */
- buffer = global_trace.buffer;
+ buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
if (!event) {
@@ -3757,10 +4878,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (nr_pages == 2) {
len = PAGE_SIZE - offset;
- memcpy(&entry->buf, page1 + offset, len);
- memcpy(&entry->buf[len], page2, cnt - len);
+ memcpy(&entry->buf, map_page[0] + offset, len);
+ memcpy(&entry->buf[len], map_page[1], cnt - len);
} else
- memcpy(&entry->buf, page1 + offset, cnt);
+ memcpy(&entry->buf, map_page[0] + offset, cnt);
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
@@ -3768,42 +4889,78 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
} else
entry->buf[cnt] = '\0';
- ring_buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
written = cnt;
*fpos += written;
out_unlock:
- if (nr_pages == 2)
- kunmap_atomic(page2);
- kunmap_atomic(page1);
- while (nr_pages > 0)
- put_page(pages[--nr_pages]);
+ for (i = 0; i < nr_pages; i++){
+ kunmap_atomic(map_page[i]);
+ put_page(pages[i]);
+ }
out:
return written;
}
static int tracing_clock_show(struct seq_file *m, void *v)
{
+ struct trace_array *tr = m->private;
int i;
for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
seq_printf(m,
"%s%s%s%s", i ? " " : "",
- i == trace_clock_id ? "[" : "", trace_clocks[i].name,
- i == trace_clock_id ? "]" : "");
+ i == tr->clock_id ? "[" : "", trace_clocks[i].name,
+ i == tr->clock_id ? "]" : "");
seq_putc(m, '\n');
return 0;
}
+static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
+ if (strcmp(trace_clocks[i].name, clockstr) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(trace_clocks))
+ return -EINVAL;
+
+ mutex_lock(&trace_types_lock);
+
+ tr->clock_id = i;
+
+ ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
+
+ /*
+ * New clock may not be consistent with the previous clock.
+ * Reset the buffer so that it doesn't have incomparable timestamps.
+ */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
+ tracing_reset_online_cpus(&tr->max_buffer);
+#endif
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
char buf[64];
const char *clockstr;
- int i;
+ int ret;
if (cnt >= sizeof(buf))
return -EINVAL;
@@ -3815,35 +4972,204 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
clockstr = strstrip(buf);
- for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
- if (strcmp(trace_clocks[i].name, clockstr) == 0)
- break;
+ ret = tracing_set_clock(tr, clockstr);
+ if (ret)
+ return ret;
+
+ *fpos += cnt;
+
+ return cnt;
+}
+
+static int tracing_clock_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr))
+ return -ENODEV;
+
+ ret = single_open(file, tracing_clock_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+struct ftrace_buffer_info {
+ struct trace_iterator iter;
+ void *spare;
+ unsigned int read;
+};
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ struct seq_file *m;
+ int ret = 0;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ } else {
+ /* Writes still need the seq_file to hold the private data */
+ ret = -ENOMEM;
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ goto out;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter) {
+ kfree(m);
+ goto out;
+ }
+ ret = 0;
+
+ iter->tr = tr;
+ iter->trace_buffer = &tr->max_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
+ m->private = iter;
+ file->private_data = m;
}
- if (i == ARRAY_SIZE(trace_clocks))
- return -EINVAL;
+out:
+ if (ret < 0)
+ trace_array_put(tr);
- trace_clock_id = i;
+ return ret;
+}
+
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
mutex_lock(&trace_types_lock);
- ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
- if (max_tr.buffer)
- ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
+ if (tr->current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (val) {
+ case 0:
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ ret = -EINVAL;
+ break;
+ }
+ if (tr->allocated_snapshot)
+ free_snapshot(tr);
+ break;
+ case 1:
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ ret = -EINVAL;
+ break;
+ }
+#endif
+ if (!tr->allocated_snapshot) {
+ ret = alloc_snapshot(tr);
+ if (ret < 0)
+ break;
+ }
+ local_irq_disable();
+ /* Now, we're going to swap */
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ update_max_tr(tr, current, smp_processor_id());
+ else
+ update_max_tr_single(tr, current, iter->cpu_file);
+ local_irq_enable();
+ break;
+ default:
+ if (tr->allocated_snapshot) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->max_buffer);
+ else
+ tracing_reset(&tr->max_buffer, iter->cpu_file);
+ }
+ break;
+ }
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+out:
mutex_unlock(&trace_types_lock);
+ return ret;
+}
- *fpos += cnt;
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ int ret;
- return cnt;
+ ret = tracing_release(inode, file);
+
+ if (file->f_mode & FMODE_READ)
+ return ret;
+
+ /* If write only, the seq_file is just a stub */
+ if (m)
+ kfree(m->private);
+ kfree(m);
+
+ return 0;
}
-static int tracing_clock_open(struct inode *inode, struct file *file)
+static int tracing_buffers_open(struct inode *inode, struct file *filp);
+static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos);
+static int tracing_buffers_release(struct inode *inode, struct file *file);
+static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
{
- if (tracing_disabled)
- return -ENODEV;
- return single_open(file, tracing_clock_show, NULL);
+ struct ftrace_buffer_info *info;
+ int ret;
+
+ ret = tracing_buffers_open(inode, filp);
+ if (ret < 0)
+ return ret;
+
+ info = filp->private_data;
+
+ if (info->iter.trace->use_max_tr) {
+ tracing_buffers_release(inode, filp);
+ return -EBUSY;
+ }
+
+ info->iter.snapshot = true;
+ info->iter.trace_buffer = &info->iter.tr->max_buffer;
+
+ return ret;
}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -3851,13 +5177,6 @@ static const struct file_operations tracing_max_lat_fops = {
.llseek = generic_file_llseek,
};
-static const struct file_operations tracing_ctrl_fops = {
- .open = tracing_open_generic,
- .read = tracing_ctrl_read,
- .write = tracing_ctrl_write,
- .llseek = generic_file_llseek,
-};
-
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.read = tracing_set_trace_read,
@@ -3875,65 +5194,106 @@ static const struct file_operations tracing_pipe_fops = {
};
static const struct file_operations tracing_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_entries_read,
.write = tracing_entries_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations tracing_total_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations tracing_free_buffer_fops = {
+ .open = tracing_open_generic_tr,
.write = tracing_free_buffer_write,
.release = tracing_free_buffer_release,
};
static const struct file_operations tracing_mark_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.write = tracing_mark_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations trace_clock_fops = {
.open = tracing_clock_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_tr,
.write = tracing_clock_write,
};
-struct ftrace_buffer_info {
- struct trace_array *tr;
- void *spare;
- int cpu;
- unsigned int read;
+#ifdef CONFIG_TRACER_SNAPSHOT
+static const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_lseek,
+ .release = tracing_snapshot_release,
};
+static const struct file_operations snapshot_raw_fops = {
+ .open = snapshot_raw_open,
+ .read = tracing_buffers_read,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+ .llseek = no_llseek,
+};
+
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
static int tracing_buffers_open(struct inode *inode, struct file *filp)
{
- int cpu = (int)(long)inode->i_private;
+ struct trace_array *tr = inode->i_private;
struct ftrace_buffer_info *info;
+ int ret;
if (tracing_disabled)
return -ENODEV;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
+ if (!info) {
+ trace_array_put(tr);
return -ENOMEM;
+ }
+
+ mutex_lock(&trace_types_lock);
- info->tr = &global_trace;
- info->cpu = cpu;
- info->spare = NULL;
+ info->iter.tr = tr;
+ info->iter.cpu_file = tracing_get_cpu(inode);
+ info->iter.trace = tr->current_trace;
+ info->iter.trace_buffer = &tr->trace_buffer;
+ info->spare = NULL;
/* Force reading ring buffer for first read */
- info->read = (unsigned int)-1;
+ info->read = (unsigned int)-1;
filp->private_data = info;
- return nonseekable_open(inode, filp);
+ mutex_unlock(&trace_types_lock);
+
+ ret = nonseekable_open(inode, filp);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+static unsigned int
+tracing_buffers_poll(struct file *filp, poll_table *poll_table)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ return trace_poll(iter, filp, poll_table);
}
static ssize_t
@@ -3941,56 +5301,101 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
ssize_t ret;
- size_t size;
+ ssize_t size;
if (!count)
return 0;
+ mutex_lock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
+ size = -EBUSY;
+ goto out_unlock;
+ }
+#endif
+
if (!info->spare)
- info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
+ info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
+ iter->cpu_file);
+ size = -ENOMEM;
if (!info->spare)
- return -ENOMEM;
+ goto out_unlock;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
goto read;
- trace_access_lock(info->cpu);
- ret = ring_buffer_read_page(info->tr->buffer,
+ again:
+ trace_access_lock(iter->cpu_file);
+ ret = ring_buffer_read_page(iter->trace_buffer->buffer,
&info->spare,
count,
- info->cpu, 0);
- trace_access_unlock(info->cpu);
- if (ret < 0)
- return 0;
+ iter->cpu_file, 0);
+ trace_access_unlock(iter->cpu_file);
- info->read = 0;
+ if (ret < 0) {
+ if (trace_empty(iter)) {
+ if ((filp->f_flags & O_NONBLOCK)) {
+ size = -EAGAIN;
+ goto out_unlock;
+ }
+ mutex_unlock(&trace_types_lock);
+ ret = wait_on_pipe(iter);
+ mutex_lock(&trace_types_lock);
+ if (ret) {
+ size = ret;
+ goto out_unlock;
+ }
+ if (signal_pending(current)) {
+ size = -EINTR;
+ goto out_unlock;
+ }
+ goto again;
+ }
+ size = 0;
+ goto out_unlock;
+ }
-read:
+ info->read = 0;
+ read:
size = PAGE_SIZE - info->read;
if (size > count)
size = count;
ret = copy_to_user(ubuf, info->spare + info->read, size);
- if (ret == size)
- return -EFAULT;
+ if (ret == size) {
+ size = -EFAULT;
+ goto out_unlock;
+ }
size -= ret;
*ppos += size;
info->read += size;
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
return size;
}
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ mutex_lock(&trace_types_lock);
+
+ __trace_array_put(iter->tr);
if (info->spare)
- ring_buffer_free_read_page(info->tr->buffer, info->spare);
+ ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
kfree(info);
+ mutex_unlock(&trace_types_lock);
+
return 0;
}
@@ -4013,12 +5418,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
-static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- return 1;
-}
-
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -4030,11 +5429,9 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = buffer_pipe_buf_steal,
+ .steal = generic_pipe_buf_steal,
.get = buffer_pipe_buf_get,
};
@@ -4061,30 +5458,41 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
struct partial_page partial_def[PIPE_DEF_BUFFERS];
struct page *pages_def[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages_def,
.partial = partial_def,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
int entries, size, i;
- size_t ret;
+ ssize_t ret;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
+ mutex_lock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+#endif
+
+ if (splice_grow_spd(pipe, &spd)) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (*ppos & (PAGE_SIZE - 1)) {
- WARN_ONCE(1, "Ftrace: previous read must page-align\n");
ret = -EINVAL;
goto out;
}
if (len & (PAGE_SIZE - 1)) {
- WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
if (len < PAGE_SIZE) {
ret = -EINVAL;
goto out;
@@ -4092,10 +5500,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
- trace_access_lock(info->cpu);
- entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ again:
+ trace_access_lock(iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
- for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
@@ -4104,15 +5513,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
ref->ref = 1;
- ref->buffer = info->tr->buffer;
- ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
+ ref->buffer = iter->trace_buffer->buffer;
+ ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (!ref->page) {
kfree(ref);
break;
}
r = ring_buffer_read_page(ref->buffer, &ref->page,
- len, info->cpu, 1);
+ len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->page);
kfree(ref);
@@ -4136,31 +5545,42 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
spd.nr_pages++;
*ppos += PAGE_SIZE;
- entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
}
- trace_access_unlock(info->cpu);
+ trace_access_unlock(iter->cpu_file);
spd.nr_pages = i;
/* did we read anything? */
if (!spd.nr_pages) {
- if (flags & SPLICE_F_NONBLOCK)
+ if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
ret = -EAGAIN;
- else
- ret = 0;
- /* TODO: block */
- goto out;
+ goto out;
+ }
+ mutex_unlock(&trace_types_lock);
+ ret = wait_on_pipe(iter);
+ mutex_lock(&trace_types_lock);
+ if (ret)
+ goto out;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ goto again;
}
ret = splice_to_pipe(pipe, &spd);
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
out:
+ mutex_unlock(&trace_types_lock);
+
return ret;
}
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
+ .poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
.llseek = no_llseek,
@@ -4170,8 +5590,10 @@ static ssize_t
tracing_stats_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
- unsigned long cpu = (unsigned long)filp->private_data;
- struct trace_array *tr = &global_trace;
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+ int cpu = tracing_get_cpu(inode);
struct trace_seq *s;
unsigned long cnt;
unsigned long long t;
@@ -4183,25 +5605,42 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
trace_seq_init(s);
- cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "entries: %ld\n", cnt);
- cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "overrun: %ld\n", cnt);
- cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "commit overrun: %ld\n", cnt);
- cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "bytes: %ld\n", cnt);
- t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
- usec_rem = do_div(t, USEC_PER_SEC);
- trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
+ if (trace_clocks[tr->clock_id].in_ns) {
+ /* local or global for trace_clock */
+ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
+ usec_rem = do_div(t, USEC_PER_SEC);
+ trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
+ t, usec_rem);
+
+ t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ usec_rem = do_div(t, USEC_PER_SEC);
+ trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
+ } else {
+ /* counter or tsc mode for trace_clock */
+ trace_seq_printf(s, "oldest event ts: %llu\n",
+ ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
+
+ trace_seq_printf(s, "now ts: %llu\n",
+ ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ }
+
+ cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "dropped events: %ld\n", cnt);
- t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
- usec_rem = do_div(t, USEC_PER_SEC);
- trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
+ cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "read events: %ld\n", cnt);
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4211,9 +5650,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
}
static const struct file_operations tracing_stats_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_stats_read,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -4252,63 +5692,177 @@ static const struct file_operations tracing_dyn_info_fops = {
.read = tracing_read_dyn_info,
.llseek = generic_file_llseek,
};
-#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
-static struct dentry *d_tracer;
+#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
+static void
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ tracing_snapshot();
+}
-struct dentry *tracing_init_dentry(void)
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
{
- static int once;
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return;
+
+ if (*count != -1)
+ (*count)--;
- if (d_tracer)
- return d_tracer;
+ tracing_snapshot();
+}
+
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ long count = (long)data;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_printf(m, "snapshot");
+
+ if (count == -1)
+ seq_printf(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", count);
+
+ return 0;
+}
+
+static struct ftrace_probe_ops snapshot_probe_ops = {
+ .func = ftrace_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+ .func = ftrace_count_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static int
+ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ return 0;
+ }
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = register_ftrace_function_probe(glob, ops, count);
+
+ if (ret >= 0)
+ alloc_snapshot(&global_trace);
+
+ return ret < 0 ? ret : 0;
+}
+
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+ .name = "snapshot",
+ .func = ftrace_trace_snapshot_callback,
+};
+
+static __init int register_snapshot_cmd(void)
+{
+ return register_ftrace_command(&ftrace_snapshot_cmd);
+}
+#else
+static inline __init int register_snapshot_cmd(void) { return 0; }
+#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
+
+struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
+{
+ if (tr->dir)
+ return tr->dir;
if (!debugfs_initialized())
return NULL;
- d_tracer = debugfs_create_dir("tracing", NULL);
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ tr->dir = debugfs_create_dir("tracing", NULL);
- if (!d_tracer && !once) {
- once = 1;
- pr_warning("Could not create debugfs directory 'tracing'\n");
- return NULL;
- }
+ if (!tr->dir)
+ pr_warn_once("Could not create debugfs directory 'tracing'\n");
- return d_tracer;
+ return tr->dir;
}
-static struct dentry *d_percpu;
+struct dentry *tracing_init_dentry(void)
+{
+ return tracing_init_dentry_tr(&global_trace);
+}
-struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
{
- static int once;
struct dentry *d_tracer;
- if (d_percpu)
- return d_percpu;
-
- d_tracer = tracing_init_dentry();
+ if (tr->percpu_dir)
+ return tr->percpu_dir;
+ d_tracer = tracing_init_dentry_tr(tr);
if (!d_tracer)
return NULL;
- d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+ tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
- if (!d_percpu && !once) {
- once = 1;
- pr_warning("Could not create debugfs directory 'per_cpu'\n");
- return NULL;
- }
+ WARN_ONCE(!tr->percpu_dir,
+ "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
- return d_percpu;
+ return tr->percpu_dir;
}
-static void tracing_init_debugfs_percpu(long cpu)
+static struct dentry *
+trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
+ void *data, long cpu, const struct file_operations *fops)
{
- struct dentry *d_percpu = tracing_dentry_percpu();
+ struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
+
+ if (ret) /* See tracing_get_cpu() */
+ ret->d_inode->i_cdev = (void *)(cpu + 1);
+ return ret;
+}
+
+static void
+tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
+{
+ struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
char cpu_dir[30]; /* 30 characters should be more than enough */
+ if (!d_percpu)
+ return;
+
snprintf(cpu_dir, 30, "cpu%ld", cpu);
d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
@@ -4317,18 +5871,29 @@ static void tracing_init_debugfs_percpu(long cpu)
}
/* per cpu trace_pipe */
- trace_create_file("trace_pipe", 0444, d_cpu,
- (void *) cpu, &tracing_pipe_fops);
+ trace_create_cpu_file("trace_pipe", 0444, d_cpu,
+ tr, cpu, &tracing_pipe_fops);
/* per cpu trace */
- trace_create_file("trace", 0644, d_cpu,
- (void *) cpu, &tracing_fops);
+ trace_create_cpu_file("trace", 0644, d_cpu,
+ tr, cpu, &tracing_fops);
+
+ trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
+ tr, cpu, &tracing_buffers_fops);
+
+ trace_create_cpu_file("stats", 0444, d_cpu,
+ tr, cpu, &tracing_stats_fops);
- trace_create_file("trace_pipe_raw", 0444, d_cpu,
- (void *) cpu, &tracing_buffers_fops);
+ trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
+ tr, cpu, &tracing_entries_fops);
- trace_create_file("stats", 0444, d_cpu,
- (void *) cpu, &tracing_stats_fops);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_cpu_file("snapshot", 0644, d_cpu,
+ tr, cpu, &snapshot_fops);
+
+ trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+#endif
}
#ifdef CONFIG_FTRACE_SELFTEST
@@ -4339,6 +5904,7 @@ static void tracing_init_debugfs_percpu(long cpu)
struct trace_option_dentry {
struct tracer_opt *opt;
struct tracer_flags *flags;
+ struct trace_array *tr;
struct dentry *entry;
};
@@ -4374,7 +5940,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (!!(topt->flags->val & topt->opt->bit) != val) {
mutex_lock(&trace_types_lock);
- ret = __set_tracer_option(current_trace, topt->flags,
+ ret = __set_tracer_option(topt->tr, topt->flags,
topt->opt, !val);
mutex_unlock(&trace_types_lock);
if (ret)
@@ -4413,6 +5979,7 @@ static ssize_t
trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
+ struct trace_array *tr = &global_trace;
long index = (long)filp->private_data;
unsigned long val;
int ret;
@@ -4423,7 +5990,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
- set_tracer_flags(1 << index, val);
+
+ mutex_lock(&trace_types_lock);
+ ret = set_tracer_flag(tr, 1 << index, val);
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -4453,40 +6026,41 @@ struct dentry *trace_create_file(const char *name,
}
-static struct dentry *trace_options_init_dentry(void)
+static struct dentry *trace_options_init_dentry(struct trace_array *tr)
{
struct dentry *d_tracer;
- static struct dentry *t_options;
- if (t_options)
- return t_options;
+ if (tr->options)
+ return tr->options;
- d_tracer = tracing_init_dentry();
+ d_tracer = tracing_init_dentry_tr(tr);
if (!d_tracer)
return NULL;
- t_options = debugfs_create_dir("options", d_tracer);
- if (!t_options) {
+ tr->options = debugfs_create_dir("options", d_tracer);
+ if (!tr->options) {
pr_warning("Could not create debugfs directory 'options'\n");
return NULL;
}
- return t_options;
+ return tr->options;
}
static void
-create_trace_option_file(struct trace_option_dentry *topt,
+create_trace_option_file(struct trace_array *tr,
+ struct trace_option_dentry *topt,
struct tracer_flags *flags,
struct tracer_opt *opt)
{
struct dentry *t_options;
- t_options = trace_options_init_dentry();
+ t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
topt->flags = flags;
topt->opt = opt;
+ topt->tr = tr;
topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
&trace_options_fops);
@@ -4494,7 +6068,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
}
static struct trace_option_dentry *
-create_trace_option_files(struct tracer *tracer)
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
struct trace_option_dentry *topts;
struct tracer_flags *flags;
@@ -4519,7 +6093,7 @@ create_trace_option_files(struct tracer *tracer)
return NULL;
for (cnt = 0; opts[cnt].name; cnt++)
- create_trace_option_file(&topts[cnt], flags,
+ create_trace_option_file(tr, &topts[cnt], flags,
&opts[cnt]);
return topts;
@@ -4542,11 +6116,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
}
static struct dentry *
-create_trace_option_core_file(const char *option, long index)
+create_trace_option_core_file(struct trace_array *tr,
+ const char *option, long index)
{
struct dentry *t_options;
- t_options = trace_options_init_dentry();
+ t_options = trace_options_init_dentry(tr);
if (!t_options)
return NULL;
@@ -4554,87 +6129,429 @@ create_trace_option_core_file(const char *option, long index)
&trace_options_core_fops);
}
-static __init void create_trace_options_dir(void)
+static __init void create_trace_options_dir(struct trace_array *tr)
{
struct dentry *t_options;
int i;
- t_options = trace_options_init_dentry();
+ t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
for (i = 0; trace_options[i]; i++)
- create_trace_option_core_file(trace_options[i], i);
+ create_trace_option_core_file(tr, trace_options[i], i);
}
-static __init int tracer_init_debugfs(void)
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- struct dentry *d_tracer;
- int cpu;
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r;
- trace_access_lock_init();
+ r = tracer_tracing_is_on(tr);
+ r = sprintf(buf, "%d\n", r);
- d_tracer = tracing_init_dentry();
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
- trace_create_file("tracing_enabled", 0644, d_tracer,
- &global_trace, &tracing_ctrl_fops);
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ unsigned long val;
+ int ret;
- trace_create_file("trace_options", 0644, d_tracer,
- NULL, &tracing_iter_fops);
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
- trace_create_file("tracing_cpumask", 0644, d_tracer,
- NULL, &tracing_cpumask_fops);
+ if (buffer) {
+ mutex_lock(&trace_types_lock);
+ if (val) {
+ tracer_tracing_on(tr);
+ if (tr->current_trace->start)
+ tr->current_trace->start(tr);
+ } else {
+ tracer_tracing_off(tr);
+ if (tr->current_trace->stop)
+ tr->current_trace->stop(tr);
+ }
+ mutex_unlock(&trace_types_lock);
+ }
- trace_create_file("trace", 0644, d_tracer,
- (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+ (*ppos)++;
- trace_create_file("available_tracers", 0444, d_tracer,
- &global_trace, &show_traces_fops);
+ return cnt;
+}
- trace_create_file("current_tracer", 0644, d_tracer,
- &global_trace, &set_tracer_fops);
+static const struct file_operations rb_simple_fops = {
+ .open = tracing_open_generic_tr,
+ .read = rb_simple_read,
+ .write = rb_simple_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
+struct dentry *trace_instance_dir;
+
+static void
+init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+
+static int
+allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
+{
+ enum ring_buffer_flags rb_flags;
+
+ rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
+ buf->tr = tr;
+
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (!buf->buffer)
+ return -ENOMEM;
+
+ buf->data = alloc_percpu(struct trace_array_cpu);
+ if (!buf->data) {
+ ring_buffer_free(buf->buffer);
+ return -ENOMEM;
+ }
+
+ /* Allocate the first page for all buffers */
+ set_buffer_entries(&tr->trace_buffer,
+ ring_buffer_size(tr->trace_buffer.buffer, 0));
+
+ return 0;
+}
+
+static int allocate_trace_buffers(struct trace_array *tr, int size)
+{
+ int ret;
+
+ ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
+ if (ret)
+ return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
- trace_create_file("tracing_max_latency", 0644, d_tracer,
- &tracing_max_latency, &tracing_max_lat_fops);
+ ret = allocate_trace_buffer(tr, &tr->max_buffer,
+ allocate_snapshot ? size : 1);
+ if (WARN_ON(ret)) {
+ ring_buffer_free(tr->trace_buffer.buffer);
+ free_percpu(tr->trace_buffer.data);
+ return -ENOMEM;
+ }
+ tr->allocated_snapshot = allocate_snapshot;
+
+ /*
+ * Only the top level trace array gets its snapshot allocated
+ * from the kernel command line.
+ */
+ allocate_snapshot = false;
#endif
+ return 0;
+}
- trace_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
+static void free_trace_buffer(struct trace_buffer *buf)
+{
+ if (buf->buffer) {
+ ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
+ free_percpu(buf->data);
+ buf->data = NULL;
+ }
+}
- trace_create_file("README", 0444, d_tracer,
- NULL, &tracing_readme_fops);
+static void free_trace_buffers(struct trace_array *tr)
+{
+ if (!tr)
+ return;
+
+ free_trace_buffer(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ free_trace_buffer(&tr->max_buffer);
+#endif
+}
+
+static int new_instance_create(const char *name)
+{
+ struct trace_array *tr;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = -EEXIST;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0)
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+ if (!tr)
+ goto out_unlock;
+
+ tr->name = kstrdup(name, GFP_KERNEL);
+ if (!tr->name)
+ goto out_free_tr;
+
+ if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
+ cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
+
+ raw_spin_lock_init(&tr->start_lock);
+
+ tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+ tr->current_trace = &nop_trace;
+
+ INIT_LIST_HEAD(&tr->systems);
+ INIT_LIST_HEAD(&tr->events);
+
+ if (allocate_trace_buffers(tr, trace_buf_size) < 0)
+ goto out_free_tr;
+
+ tr->dir = debugfs_create_dir(name, trace_instance_dir);
+ if (!tr->dir)
+ goto out_free_tr;
+
+ ret = event_trace_add_tracer(tr->dir, tr);
+ if (ret) {
+ debugfs_remove_recursive(tr->dir);
+ goto out_free_tr;
+ }
+
+ init_tracer_debugfs(tr, tr->dir);
+
+ list_add(&tr->list, &ftrace_trace_arrays);
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+
+ out_free_tr:
+ free_trace_buffers(tr);
+ free_cpumask_var(tr->tracing_cpumask);
+ kfree(tr->name);
+ kfree(tr);
+
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+
+}
+
+static int instance_delete(const char *name)
+{
+ struct trace_array *tr;
+ int found = 0;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = -ENODEV;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ goto out_unlock;
+
+ ret = -EBUSY;
+ if (tr->ref)
+ goto out_unlock;
+
+ list_del(&tr->list);
+
+ tracing_set_nop(tr);
+ event_trace_del_tracer(tr);
+ ftrace_destroy_function_files(tr);
+ debugfs_remove_recursive(tr->dir);
+ free_trace_buffers(tr);
+
+ kfree(tr->name);
+ kfree(tr);
+
+ ret = 0;
+
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+ struct dentry *parent;
+ int ret;
+
+ /* Paranoid: Make sure the parent is the "instances" directory */
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ if (WARN_ON_ONCE(parent != trace_instance_dir))
+ return -ENOENT;
+
+ /*
+ * The inode mutex is locked, but debugfs_create_dir() will also
+ * take the mutex. As the instances directory can not be destroyed
+ * or changed in any other way, it is safe to unlock it, and
+ * let the dentry try. If two users try to make the same dir at
+ * the same time, then the new_instance_create() will determine the
+ * winner.
+ */
+ mutex_unlock(&inode->i_mutex);
+
+ ret = new_instance_create(dentry->d_iname);
+
+ mutex_lock(&inode->i_mutex);
+
+ return ret;
+}
+
+static int instance_rmdir(struct inode *inode, struct dentry *dentry)
+{
+ struct dentry *parent;
+ int ret;
+
+ /* Paranoid: Make sure the parent is the "instances" directory */
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ if (WARN_ON_ONCE(parent != trace_instance_dir))
+ return -ENOENT;
+
+ /* The caller did a dget() on dentry */
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ /*
+ * The inode mutex is locked, but debugfs_create_dir() will also
+ * take the mutex. As the instances directory can not be destroyed
+ * or changed in any other way, it is safe to unlock it, and
+ * let the dentry try. If two users try to make the same dir at
+ * the same time, then the instance_delete() will determine the
+ * winner.
+ */
+ mutex_unlock(&inode->i_mutex);
+
+ ret = instance_delete(dentry->d_iname);
+
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock(&dentry->d_inode->i_mutex);
+
+ return ret;
+}
+
+static const struct inode_operations instance_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = instance_mkdir,
+ .rmdir = instance_rmdir,
+};
+
+static __init void create_trace_instances(struct dentry *d_tracer)
+{
+ trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+ if (WARN_ON(!trace_instance_dir))
+ return;
+
+ /* Hijack the dir inode operations, to allow mkdir */
+ trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
+}
+
+static void
+init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+{
+ int cpu;
+
+ trace_create_file("available_tracers", 0444, d_tracer,
+ tr, &show_traces_fops);
+
+ trace_create_file("current_tracer", 0644, d_tracer,
+ tr, &set_tracer_fops);
+
+ trace_create_file("tracing_cpumask", 0644, d_tracer,
+ tr, &tracing_cpumask_fops);
+
+ trace_create_file("trace_options", 0644, d_tracer,
+ tr, &tracing_iter_fops);
+
+ trace_create_file("trace", 0644, d_tracer,
+ tr, &tracing_fops);
trace_create_file("trace_pipe", 0444, d_tracer,
- (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
+ tr, &tracing_pipe_fops);
trace_create_file("buffer_size_kb", 0644, d_tracer,
- &global_trace, &tracing_entries_fops);
+ tr, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", 0444, d_tracer,
- &global_trace, &tracing_total_entries_fops);
+ tr, &tracing_total_entries_fops);
- trace_create_file("free_buffer", 0644, d_tracer,
- &global_trace, &tracing_free_buffer_fops);
+ trace_create_file("free_buffer", 0200, d_tracer,
+ tr, &tracing_free_buffer_fops);
trace_create_file("trace_marker", 0220, d_tracer,
- NULL, &tracing_mark_fops);
+ tr, &tracing_mark_fops);
+
+ trace_create_file("trace_clock", 0644, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("tracing_on", 0644, d_tracer,
+ tr, &rb_simple_fops);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ trace_create_file("tracing_max_latency", 0644, d_tracer,
+ &tr->max_latency, &tracing_max_lat_fops);
+#endif
+
+ if (ftrace_create_function_files(tr, d_tracer))
+ WARN(1, "Could not allocate function filter files");
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ tr, &snapshot_fops);
+#endif
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_debugfs_percpu(tr, cpu);
+
+}
+
+static __init int tracer_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+
+ trace_access_lock_init();
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ init_tracer_debugfs(&global_trace, d_tracer);
+
+ trace_create_file("tracing_thresh", 0644, d_tracer,
+ &tracing_thresh, &tracing_max_lat_fops);
+
+ trace_create_file("README", 0444, d_tracer,
+ NULL, &tracing_readme_fops);
trace_create_file("saved_cmdlines", 0444, d_tracer,
NULL, &tracing_saved_cmdlines_fops);
- trace_create_file("trace_clock", 0644, d_tracer, NULL,
- &trace_clock_fops);
+ trace_create_file("saved_cmdlines_size", 0644, d_tracer,
+ NULL, &tracing_saved_cmdlines_size_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
- create_trace_options_dir();
+ create_trace_instances(d_tracer);
- for_each_tracing_cpu(cpu)
- tracing_init_debugfs_percpu(cpu);
+ create_trace_options_dir(&global_trace);
return 0;
}
@@ -4690,8 +6607,8 @@ void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
- if (s->len >= 1000)
- s->len = 1000;
+ if (s->len >= TRACE_MAX_PRINT)
+ s->len = TRACE_MAX_PRINT;
/* should be zero ended, but we are paranoid. */
s->buffer[s->len] = 0;
@@ -4704,45 +6621,54 @@ trace_printk_seq(struct trace_seq *s)
void trace_init_global_iter(struct trace_iterator *iter)
{
iter->tr = &global_trace;
- iter->trace = current_trace;
- iter->cpu_file = TRACE_PIPE_ALL_CPU;
+ iter->trace = iter->tr->current_trace;
+ iter->cpu_file = RING_BUFFER_ALL_CPUS;
+ iter->trace_buffer = &global_trace.trace_buffer;
+
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
+ /* Annotate start of buffers if we had overruns */
+ if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ iter->iter_flags |= TRACE_FILE_ANNOTATE;
+
+ /* Output in nanoseconds only if we are using a clock in nanoseconds. */
+ if (trace_clocks[iter->tr->clock_id].in_ns)
+ iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
}
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- static arch_spinlock_t ftrace_dump_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static atomic_t dump_running;
unsigned int old_userobj;
- static int dump_ran;
unsigned long flags;
int cnt = 0, cpu;
- /* only one dump */
- local_irq_save(flags);
- arch_spin_lock(&ftrace_dump_lock);
- if (dump_ran)
- goto out;
-
- dump_ran = 1;
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+ /*
+ * Always turn off tracing when we dump.
+ * We don't need to show trace output of what happens
+ * between multiple crashes.
+ *
+ * If the user does a sysrq-z, then they can re-enable
+ * tracing with echo 1 > tracing_on.
+ */
tracing_off();
- /* Did function tracer already get disabled? */
- if (ftrace_is_dead()) {
- printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- printk("# MAY BE MISSING FUNCTION EVENTS\n");
- }
-
- if (disable_tracing)
- ftrace_kill();
+ local_irq_save(flags);
+ /* Simulate the iterator */
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&iter.tr->data[cpu]->disabled);
+ atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4750,13 +6676,9 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- /* Simulate the iterator */
- iter.tr = &global_trace;
- iter.trace = current_trace;
-
switch (oops_dump_mode) {
case DUMP_ALL:
- iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ iter.cpu_file = RING_BUFFER_ALL_CPUS;
break;
case DUMP_ORIG:
iter.cpu_file = raw_smp_processor_id();
@@ -4765,11 +6687,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
goto out_enable;
default:
printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
- iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ iter.cpu_file = RING_BUFFER_ALL_CPUS;
}
printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ /* Did function tracer already get disabled? */
+ if (ftrace_is_dead()) {
+ printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ printk("# MAY BE MISSING FUNCTION EVENTS\n");
+ }
+
/*
* We need to stop all tracing on all CPUS to read the
* the next buffer. This is a bit expensive, but is
@@ -4798,6 +6726,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
}
+ touch_nmi_watchdog();
trace_printk_seq(&iter.seq);
}
@@ -4808,84 +6737,82 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- /* Re-enable tracing if requested */
- if (!disable_tracing) {
- trace_flags |= old_userobj;
+ trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
- }
- tracing_on();
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
-
- out:
- arch_spin_unlock(&ftrace_dump_lock);
+ atomic_dec(&dump_running);
local_irq_restore(flags);
}
-
-/* By default: disable tracing after the dump */
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
-{
- __ftrace_dump(true, oops_dump_mode);
-}
EXPORT_SYMBOL_GPL(ftrace_dump);
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
- enum ring_buffer_flags rb_flags;
- int i;
int ret = -ENOMEM;
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
- if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
+ /* Only allocate trace_printk buffers if a trace_printk exists */
+ if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
+ /* Must be called before global_trace.buffer is allocated */
+ trace_printk_init_buffers();
+
/* To save memory, keep the ring buffer size to its minimum */
if (ring_buffer_expanded)
ring_buf_size = trace_buf_size;
else
ring_buf_size = 1;
- rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
-
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
- cpumask_copy(tracing_cpumask, cpu_all_mask);
+ cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);
+
+ raw_spin_lock_init(&global_trace.start_lock);
+
+ /* Used for event triggers */
+ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
+ if (!temp_buffer)
+ goto out_free_cpumask;
+
+ if (trace_create_savedcmd() < 0)
+ goto out_free_temp_buffer;
/* TODO: make the number of buffers hot pluggable with CPUS */
- global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
- if (!global_trace.buffer) {
+ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
- goto out_free_cpumask;
+ goto out_free_savedcmd;
}
- global_trace.entries = ring_buffer_size(global_trace.buffer);
+ if (global_trace.buffer_disabled)
+ tracing_off();
-#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(1, rb_flags);
- if (!max_tr.buffer) {
- printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
- WARN_ON(1);
- ring_buffer_free(global_trace.buffer);
- goto out_free_cpumask;
+ if (trace_boot_clock) {
+ ret = tracing_set_clock(&global_trace, trace_boot_clock);
+ if (ret < 0)
+ pr_warning("Trace clock %s not defined, going back to default\n",
+ trace_boot_clock);
}
- max_tr.entries = 1;
-#endif
- /* Allocate the first page for all buffers */
- for_each_tracing_cpu(i) {
- global_trace.data[i] = &per_cpu(global_trace_cpu, i);
- max_tr.data[i] = &per_cpu(max_tr_data, i);
- }
+ /*
+ * register_tracer() might reference current_trace, so it
+ * needs to be set before we register anything. This is
+ * just a bootstrap of current_trace anyway.
+ */
+ global_trace.current_trace = &nop_trace;
+
+ global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
- trace_init_cmdlines();
+ ftrace_init_global_array_ops(&global_trace);
register_tracer(&nop_trace);
- current_trace = &nop_trace;
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -4894,10 +6821,29 @@ __init static int tracer_alloc_buffers(void)
register_die_notifier(&trace_die_notifier);
+ global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
+
+ INIT_LIST_HEAD(&global_trace.systems);
+ INIT_LIST_HEAD(&global_trace.events);
+ list_add(&global_trace.list, &ftrace_trace_arrays);
+
+ while (trace_boot_options) {
+ char *option;
+
+ option = strsep(&trace_boot_options, ",");
+ trace_set_options(&global_trace, option);
+ }
+
+ register_snapshot_cmd();
+
return 0;
+out_free_savedcmd:
+ free_saved_cmdlines_buffer(savedcmd);
+out_free_temp_buffer:
+ ring_buffer_free(temp_buffer);
out_free_cpumask:
- free_cpumask_var(tracing_cpumask);
+ free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
out:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b93ecbadad6..9258f5a815d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1,3 +1,4 @@
+
#ifndef _LINUX_KERNEL_TRACE_H
#define _LINUX_KERNEL_TRACE_H
@@ -12,6 +13,12 @@
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
+#include <linux/compiler.h>
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+#include <asm/unistd.h> /* For NR_SYSCALLS */
+#include <asm/syscall.h> /* some archs define it here */
+#endif
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -29,6 +36,7 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_BLK,
+ TRACE_BPUTS,
__TRACE_LAST_TYPE,
};
@@ -56,17 +64,23 @@ enum trace_type {
#define F_STRUCT(args...) args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
- struct struct_name { \
- struct trace_entry ent; \
- tstruct \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+ struct struct_name { \
+ struct trace_entry ent; \
+ tstruct \
}
#undef TP_ARGS
#define TP_ARGS(args...) args
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
+
+#undef FTRACE_ENTRY_REG
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
+ filter, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
#include "trace_entries.h"
@@ -112,10 +126,13 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_PREEMPT_RESCHED = 0x20,
};
#define TRACE_BUF_SIZE 1024
+struct trace_array;
+
/*
* The CPU trace array - it consists of thousands of trace entries
* plus some other descriptor data: (for example which task started
@@ -125,6 +142,7 @@ struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
+ unsigned long entries;
unsigned long saved_latency;
unsigned long critical_start;
unsigned long critical_end;
@@ -135,24 +153,114 @@ struct trace_array_cpu {
unsigned long skipped_entries;
cycle_t preempt_timestamp;
pid_t pid;
- uid_t uid;
+ kuid_t uid;
char comm[TASK_COMM_LEN];
};
+struct tracer;
+
+struct trace_buffer {
+ struct trace_array *tr;
+ struct ring_buffer *buffer;
+ struct trace_array_cpu __percpu *data;
+ cycle_t time_start;
+ int cpu;
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
* They have on/off state as well:
*/
struct trace_array {
- struct ring_buffer *buffer;
- unsigned long entries;
- int cpu;
- cycle_t time_start;
- struct task_struct *waiter;
- struct trace_array_cpu *data[NR_CPUS];
+ struct list_head list;
+ char *name;
+ struct trace_buffer trace_buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * The max_buffer is used to snapshot the trace when a maximum
+ * latency is reached, or when the user initiates a snapshot.
+ * Some tracers will use this to store a maximum trace while
+ * it continues examining live traces.
+ *
+ * The buffers for the max_buffer are set up the same as the trace_buffer
+ * When a snapshot is taken, the buffer of the max_buffer is swapped
+ * with the buffer of the trace_buffer and the buffers are reset for
+ * the trace_buffer so the tracing can continue.
+ */
+ struct trace_buffer max_buffer;
+ bool allocated_snapshot;
+ unsigned long max_latency;
+#endif
+ /*
+ * max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a arch_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ *
+ * It is also used in other places outside the update_max_tr
+ * so it needs to be defined outside of the
+ * CONFIG_TRACER_MAX_TRACE.
+ */
+ arch_spinlock_t max_lock;
+ int buffer_disabled;
+#ifdef CONFIG_FTRACE_SYSCALLS
+ int sys_refcount_enter;
+ int sys_refcount_exit;
+ struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
+ struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+#endif
+ int stop_count;
+ int clock_id;
+ struct tracer *current_trace;
+ unsigned int flags;
+ raw_spinlock_t start_lock;
+ struct dentry *dir;
+ struct dentry *options;
+ struct dentry *percpu_dir;
+ struct dentry *event_dir;
+ struct list_head systems;
+ struct list_head events;
+ cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ int ref;
+#ifdef CONFIG_FUNCTION_TRACER
+ struct ftrace_ops *ops;
+ /* function tracing enabled */
+ int function_enabled;
+#endif
};
+enum {
+ TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+};
+
+extern struct list_head ftrace_trace_arrays;
+
+extern struct mutex trace_types_lock;
+
+extern int trace_array_get(struct trace_array *tr);
+extern void trace_array_put(struct trace_array *tr);
+
+/*
+ * The global tracer (top) should be the first trace array added,
+ * but we check the flag anyway.
+ */
+static inline struct trace_array *top_trace_array(void)
+{
+ struct trace_array *tr;
+
+ if (list_empty(&ftrace_trace_arrays))
+ return NULL;
+
+ tr = list_entry(ftrace_trace_arrays.prev,
+ typeof(*tr), list);
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
+ return tr;
+}
+
#define FTRACE_CMP_TYPE(var, type) \
__builtin_types_compatible_p(typeof(var), type *)
@@ -188,6 +296,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
+ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -232,7 +341,6 @@ struct tracer_flags {
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
- * @wait_pipe: override how the user waits for traces on trace_pipe
* @close: called when the trace file is released
* @pipe_close: called when the trace_pipe file is released
* @read: override the default read callback on trace_pipe
@@ -251,7 +359,6 @@ struct tracer {
void (*stop)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
- void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
void (*pipe_close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
@@ -270,24 +377,69 @@ struct tracer {
void (*print_header)(struct seq_file *m);
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
- int (*set_flag)(u32 old_flags, u32 bit, int set);
+ int (*set_flag)(struct trace_array *tr,
+ u32 old_flags, u32 bit, int set);
+ /* Return 0 if OK with change, else return non-zero */
+ int (*flag_changed)(struct trace_array *tr,
+ u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
- int print_max;
- int use_max_tr;
+ int enabled;
+ bool print_max;
+ bool allow_instances;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ bool use_max_tr;
+#endif
};
/* Only current can touch trace_recursion */
-#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
-#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
-/* Ring buffer has the 10 LSB bits to count */
-#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+/*
+ * For function tracing recursion:
+ * The order of these bits are important.
+ *
+ * When function tracing occurs, the following steps are made:
+ * If arch does not support a ftrace feature:
+ * call internal function (uses INTERNAL bits) which calls...
+ * If callback is registered to the "global" list, the list
+ * function is called and recursion checks the GLOBAL bits.
+ * then this function calls...
+ * The function callback, which can use the FTRACE bits to
+ * check for recursion.
+ *
+ * Now if the arch does not suppport a feature, and it calls
+ * the global list function which calls the ftrace callback
+ * all three of these steps will do a recursion protection.
+ * There's no reason to do one if the previous caller already
+ * did. The recursion that we are protecting against will
+ * go through the same steps again.
+ *
+ * To prevent the multiple recursion checks, if a recursion
+ * bit is set that is higher than the MAX bit of the current
+ * check, then we know that the check was made by the previous
+ * caller, and we can skip the current check.
+ */
+enum {
+ TRACE_BUFFER_BIT,
+ TRACE_BUFFER_NMI_BIT,
+ TRACE_BUFFER_IRQ_BIT,
+ TRACE_BUFFER_SIRQ_BIT,
+
+ /* Start of function recursion bits */
+ TRACE_FTRACE_BIT,
+ TRACE_FTRACE_NMI_BIT,
+ TRACE_FTRACE_IRQ_BIT,
+ TRACE_FTRACE_SIRQ_BIT,
+
+ /* INTERNAL_BITs must be greater than FTRACE_BITs */
+ TRACE_INTERNAL_BIT,
+ TRACE_INTERNAL_NMI_BIT,
+ TRACE_INTERNAL_IRQ_BIT,
+ TRACE_INTERNAL_SIRQ_BIT,
+
+ TRACE_CONTROL_BIT,
-/* for function tracing recursion */
-#define TRACE_INTERNAL_BIT (1<<11)
-#define TRACE_GLOBAL_BIT (1<<12)
/*
* Abuse of the trace_recursion.
* As we need a way to maintain state if we are tracing the function
@@ -295,28 +447,98 @@ struct tracer {
* was called in irq context but we have irq tracing off. Since this
* can only be modified by current, we can reuse trace_recursion.
*/
-#define TRACE_IRQ_BIT (1<<13)
+ TRACE_IRQ_BIT,
+};
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+
+#define TRACE_CONTEXT_BITS 4
+
+#define TRACE_FTRACE_START TRACE_FTRACE_BIT
+#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_LIST_START TRACE_INTERNAL_BIT
+#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
+
+static __always_inline int trace_get_context_bit(void)
+{
+ int bit;
+
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
+
+ return bit;
+}
+
+static __always_inline int trace_test_and_set_recursion(int start, int max)
+{
+ unsigned int val = current->trace_recursion;
+ int bit;
+
+ /* A previous recursion check was made */
+ if ((val & TRACE_CONTEXT_MASK) > max)
+ return 0;
+
+ bit = trace_get_context_bit() + start;
+ if (unlikely(val & (1 << bit)))
+ return -1;
+
+ val |= 1 << bit;
+ current->trace_recursion = val;
+ barrier();
+
+ return bit;
+}
+
+static __always_inline void trace_clear_recursion(int bit)
+{
+ unsigned int val = current->trace_recursion;
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+ if (!bit)
+ return;
-#define TRACE_PIPE_ALL_CPU -1
+ bit = 1 << bit;
+ val &= ~bit;
+
+ barrier();
+ current->trace_recursion = val;
+}
+
+static inline struct ring_buffer_iter *
+trace_buffer_iter(struct trace_iterator *iter, int cpu)
+{
+ if (iter->buffer_iter && iter->buffer_iter[cpu])
+ return iter->buffer_iter[cpu];
+ return NULL;
+}
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
-void trace_wake_up(void);
-void tracing_reset(struct trace_array *tr, int cpu);
-void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset(struct trace_buffer *buf, int cpu);
+void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
-void tracing_reset_current_online_cpus(void);
+void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
+bool tracing_is_disabled(void);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
@@ -327,9 +549,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
unsigned long len,
unsigned long flags,
int pc);
-void trace_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -337,6 +556,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
+void __buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
+
int trace_empty(struct trace_iterator *iter);
void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -345,14 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
-void default_wait_pipe(struct trace_iterator *iter);
-void poll_wait_pipe(struct trace_iterator *iter);
-
-void ftrace(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long ip,
- unsigned long parent_ip,
- unsigned long flags, int pc);
void tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
@@ -385,12 +599,9 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
void tracing_stop_sched_switch_record(void);
void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
-void unregister_tracer(struct tracer *type);
int is_tracing_stopped(void);
-enum trace_file_type {
- TRACE_FILE_LAT_FMT = 1,
- TRACE_FILE_ANNOTATE = 2,
-};
+
+loff_t tracing_lseek(struct file *file, loff_t offset, int whence);
extern cpumask_var_t __read_mostly tracing_buffer_mask;
@@ -402,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
-extern unsigned long tracing_max_latency;
-
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
@@ -450,13 +659,13 @@ extern void trace_find_cmdline(int pid, char comm[]);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+#endif
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
-#endif
-extern int ring_buffer_expanded;
+extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;
DECLARE_PER_CPU(int, ftrace_cpu_disabled);
@@ -479,6 +688,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
+/*
+ * Tracer data references selftest functions that only occur
+ * on boot up. These can be __init functions. Thus, when selftests
+ * are enabled, then the tracers need to reference __init functions.
+ */
+#define __tracer_data __refdata
+#else
+/* Tracers are seldom changed. Optimize when selftests are disabled. */
+#define __tracer_data __read_mostly
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
@@ -492,13 +710,13 @@ trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...);
+int trace_array_printk_buf(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
-extern int trace_clock_id;
-
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -509,6 +727,10 @@ extern int trace_clock_id;
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+#define TRACE_GRAPH_PRINT_IRQS 0x40
+#define TRACE_GRAPH_PRINT_TAIL 0x80
+#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
+#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
@@ -528,15 +750,16 @@ extern void __trace_graph_return(struct trace_array *tr,
#ifdef CONFIG_DYNAMIC_FTRACE
/* TODO: make this variable */
#define FTRACE_GRAPH_MAX_FUNCS 32
-extern int ftrace_graph_filter_enabled;
extern int ftrace_graph_count;
extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
+extern int ftrace_graph_notrace_count;
+extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
static inline int ftrace_graph_addr(unsigned long addr)
{
int i;
- if (!ftrace_graph_filter_enabled)
+ if (!ftrace_graph_count)
return 1;
for (i = 0; i < ftrace_graph_count; i++) {
@@ -556,11 +779,31 @@ static inline int ftrace_graph_addr(unsigned long addr)
return 0;
}
+
+static inline int ftrace_graph_notrace_addr(unsigned long addr)
+{
+ int i;
+
+ if (!ftrace_graph_notrace_count)
+ return 0;
+
+ for (i = 0; i < ftrace_graph_notrace_count; i++) {
+ if (addr == ftrace_graph_notrace_funcs[i])
+ return 1;
+ }
+
+ return 0;
+}
#else
static inline int ftrace_graph_addr(unsigned long addr)
{
return 1;
}
+
+static inline int ftrace_graph_notrace_addr(unsigned long addr)
+{
+ return 0;
+}
#endif /* CONFIG_DYNAMIC_FTRACE */
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
@@ -573,6 +816,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
+extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct task_struct *task)
{
if (list_empty(&ftrace_pids))
@@ -581,13 +825,47 @@ static inline int ftrace_trace_task(struct task_struct *task)
return test_tsk_trace_trace(task);
}
extern int ftrace_is_dead(void);
+int ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent);
+void ftrace_destroy_function_files(struct trace_array *tr);
+void ftrace_init_global_array_ops(struct trace_array *tr);
+void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
+void ftrace_reset_array_ops(struct trace_array *tr);
+int using_ftrace_ops_list_func(void);
#else
static inline int ftrace_trace_task(struct task_struct *task)
{
return 1;
}
static inline int ftrace_is_dead(void) { return 0; }
-#endif
+static inline int
+ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent)
+{
+ return 0;
+}
+static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
+static inline __init void
+ftrace_init_global_array_ops(struct trace_array *tr) { }
+static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+/* ftace_func_t type is not defined, use macro instead of static inline */
+#define ftrace_init_array_ops(tr, func) do { } while (0)
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+void ftrace_create_filter_files(struct ftrace_ops *ops,
+ struct dentry *parent);
+void ftrace_destroy_filter_files(struct ftrace_ops *ops);
+#else
+/*
+ * The ops parameter passed in is usually undefined.
+ * This must be a macro.
+ */
+#define ftrace_create_filter_files(ops, parent) do { } while (0)
+#define ftrace_destroy_filter_files(ops) do { } while (0)
+#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
+
+int ftrace_event_is_function(struct ftrace_event_call *call);
/*
* struct trace_parser - servers for reading the user input separated by spaces
@@ -656,6 +934,8 @@ enum trace_iterator_flags {
TRACE_ITER_OVERWRITE = 0x200000,
TRACE_ITER_STOP_ON_FREE = 0x400000,
TRACE_ITER_IRQ_INFO = 0x800000,
+ TRACE_ITER_MARKERS = 0x1000000,
+ TRACE_ITER_FUNCTION = 0x2000000,
};
/*
@@ -694,16 +974,10 @@ static inline void trace_branch_disable(void)
/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);
-/* trace event type bit fields, not numeric */
-enum {
- TRACE_EVENT_TYPE_PRINTF = 1,
- TRACE_EVENT_TYPE_RAW = 2,
-};
-
struct ftrace_event_field {
struct list_head link;
- char *name;
- char *type;
+ const char *name;
+ const char *type;
int filter_type;
int offset;
int size;
@@ -721,12 +995,19 @@ struct event_filter {
struct event_subsystem {
struct list_head list;
const char *name;
- struct dentry *entry;
struct event_filter *filter;
- int nr_events;
int ref_count;
};
+struct ftrace_subsystem_dir {
+ struct list_head list;
+ struct event_subsystem *subsystem;
+ struct trace_array *tr;
+ struct dentry *entry;
+ int ref_count;
+ int nr_events;
+};
+
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
@@ -766,9 +1047,7 @@ struct filter_pred {
u64 val;
struct regex regex;
unsigned short *ops;
-#ifdef CONFIG_FTRACE_STARTUP_TEST
struct ftrace_event_field *field;
-#endif
int offset;
int not;
int op;
@@ -778,52 +1057,255 @@ struct filter_pred {
unsigned short right;
};
-extern struct list_head ftrace_common_fields;
-
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_call *call,
+extern void print_event_filter(struct ftrace_event_file *file,
struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_call *call,
+extern int apply_event_filter(struct ftrace_event_file *file,
char *filter_string);
-extern int apply_subsystem_event_filter(struct event_subsystem *system,
+extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
+extern int create_event_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp);
+extern void free_event_filter(struct event_filter *filter);
-struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call);
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name);
-static inline int
-filter_check_discard(struct ftrace_event_call *call, void *rec,
- struct ring_buffer *buffer,
- struct ring_buffer_event *event)
-{
- if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(call->filter, rec)) {
- ring_buffer_discard_commit(buffer, event);
- return 1;
- }
+extern void trace_event_enable_cmd_record(bool enable);
+extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
+extern int event_trace_del_tracer(struct trace_array *tr);
- return 0;
-}
+extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
+ const char *system,
+ const char *event);
-extern void trace_event_enable_cmd_record(bool enable);
+static inline void *event_file_data(struct file *filp)
+{
+ return ACCESS_ONCE(file_inode(filp)->i_private);
+}
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
+extern const struct file_operations event_trigger_fops;
+
+extern int register_trigger_cmds(void);
+extern void clear_event_triggers(struct trace_array *tr);
+
+struct event_trigger_data {
+ unsigned long count;
+ int ref;
+ struct event_trigger_ops *ops;
+ struct event_command *cmd_ops;
+ struct event_filter __rcu *filter;
+ char *filter_str;
+ void *private_data;
+ struct list_head list;
+};
+
+/**
+ * struct event_trigger_ops - callbacks for trace event triggers
+ *
+ * The methods in this structure provide per-event trigger hooks for
+ * various trigger operations.
+ *
+ * All the methods below, except for @init() and @free(), must be
+ * implemented.
+ *
+ * @func: The trigger 'probe' function called when the triggering
+ * event occurs. The data passed into this callback is the data
+ * that was supplied to the event_command @reg() function that
+ * registered the trigger (see struct event_command).
+ *
+ * @init: An optional initialization function called for the trigger
+ * when the trigger is registered (via the event_command reg()
+ * function). This can be used to perform per-trigger
+ * initialization such as incrementing a per-trigger reference
+ * count, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_init() (see
+ * trace_event_triggers.c).
+ *
+ * @free: An optional de-initialization function called for the
+ * trigger when the trigger is unregistered (via the
+ * event_command @reg() function). This can be used to perform
+ * per-trigger de-initialization such as decrementing a
+ * per-trigger reference count and freeing corresponding trigger
+ * data, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_free() (see
+ * trace_event_triggers.c).
+ *
+ * @print: The callback function invoked to have the trigger print
+ * itself. This is usually implemented by a wrapper function
+ * that calls the generic utility function @event_trigger_print()
+ * (see trace_event_triggers.c).
+ */
+struct event_trigger_ops {
+ void (*func)(struct event_trigger_data *data);
+ int (*init)(struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+ void (*free)(struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+ int (*print)(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+};
+
+/**
+ * struct event_command - callbacks and data members for event commands
+ *
+ * Event commands are invoked by users by writing the command name
+ * into the 'trigger' file associated with a trace event. The
+ * parameters associated with a specific invocation of an event
+ * command are used to create an event trigger instance, which is
+ * added to the list of trigger instances associated with that trace
+ * event. When the event is hit, the set of triggers associated with
+ * that event is invoked.
+ *
+ * The data members in this structure provide per-event command data
+ * for various event commands.
+ *
+ * All the data members below, except for @post_trigger, must be set
+ * for each event command.
+ *
+ * @name: The unique name that identifies the event command. This is
+ * the name used when setting triggers via trigger files.
+ *
+ * @trigger_type: A unique id that identifies the event command
+ * 'type'. This value has two purposes, the first to ensure that
+ * only one trigger of the same type can be set at a given time
+ * for a particular event e.g. it doesn't make sense to have both
+ * a traceon and traceoff trigger attached to a single event at
+ * the same time, so traceon and traceoff have the same type
+ * though they have different names. The @trigger_type value is
+ * also used as a bit value for deferring the actual trigger
+ * action until after the current event is finished. Some
+ * commands need to do this if they themselves log to the trace
+ * buffer (see the @post_trigger() member below). @trigger_type
+ * values are defined by adding new values to the trigger_type
+ * enum in include/linux/ftrace_event.h.
+ *
+ * @post_trigger: A flag that says whether or not this command needs
+ * to have its action delayed until after the current event has
+ * been closed. Some triggers need to avoid being invoked while
+ * an event is currently in the process of being logged, since
+ * the trigger may itself log data into the trace buffer. Thus
+ * we make sure the current event is committed before invoking
+ * those triggers. To do that, the trigger invocation is split
+ * in two - the first part checks the filter using the current
+ * trace record; if a command has the @post_trigger flag set, it
+ * sets a bit for itself in the return value, otherwise it
+ * directly invokes the trigger. Once all commands have been
+ * either invoked or set their return flag, the current record is
+ * either committed or discarded. At that point, if any commands
+ * have deferred their triggers, those commands are finally
+ * invoked following the close of the current event. In other
+ * words, if the event_trigger_ops @func() probe implementation
+ * itself logs to the trace buffer, this flag should be set,
+ * otherwise it can be left unspecified.
+ *
+ * All the methods below, except for @set_filter(), must be
+ * implemented.
+ *
+ * @func: The callback function responsible for parsing and
+ * registering the trigger written to the 'trigger' file by the
+ * user. It allocates the trigger instance and registers it with
+ * the appropriate trace event. It makes use of the other
+ * event_command callback functions to orchestrate this, and is
+ * usually implemented by the generic utility function
+ * @event_trigger_callback() (see trace_event_triggers.c).
+ *
+ * @reg: Adds the trigger to the list of triggers associated with the
+ * event, and enables the event trigger itself, after
+ * initializing it (via the event_trigger_ops @init() function).
+ * This is also where commands can use the @trigger_type value to
+ * make the decision as to whether or not multiple instances of
+ * the trigger should be allowed. This is usually implemented by
+ * the generic utility function @register_trigger() (see
+ * trace_event_triggers.c).
+ *
+ * @unreg: Removes the trigger from the list of triggers associated
+ * with the event, and disables the event trigger itself, after
+ * initializing it (via the event_trigger_ops @free() function).
+ * This is usually implemented by the generic utility function
+ * @unregister_trigger() (see trace_event_triggers.c).
+ *
+ * @set_filter: An optional function called to parse and set a filter
+ * for the trigger. If no @set_filter() method is set for the
+ * event command, filters set by the user for the command will be
+ * ignored. This is usually implemented by the generic utility
+ * function @set_trigger_filter() (see trace_event_triggers.c).
+ *
+ * @get_trigger_ops: The callback function invoked to retrieve the
+ * event_trigger_ops implementation associated with the command.
+ */
+struct event_command {
+ struct list_head list;
+ char *name;
+ enum event_trigger_type trigger_type;
+ bool post_trigger;
+ int (*func)(struct event_command *cmd_ops,
+ struct ftrace_event_file *file,
+ char *glob, char *cmd, char *params);
+ int (*reg)(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file);
+ void (*unreg)(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file);
+ int (*set_filter)(char *filter_str,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file);
+ struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
+};
+
+extern int trace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable);
+extern int tracing_alloc_snapshot(void);
+
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
+extern const char *__start___tracepoint_str[];
+extern const char *__stop___tracepoint_str[];
+
+void trace_printk_init_buffers(void);
+void trace_printk_start_comm(void);
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+
+/*
+ * Normal trace_printk() and friends allocates special buffers
+ * to do the manipulation, as well as saves the print formats
+ * into sections to display. But the trace infrastructure wants
+ * to use these without the added overhead at the price of being
+ * a bit slower (used mainly for warnings, where we don't care
+ * about performance). The internal_trace_puts() is for such
+ * a purpose.
+ */
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
+
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
extern struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_##call;
+ __aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
#include "trace_entries.h"
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
+int perf_ftrace_event_register(struct ftrace_event_call *call,
+ enum trace_reg type, void *data);
+#else
+#define perf_ftrace_event_register NULL
+#endif
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 00000000000..40a14cbcf8e
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/trace_clock.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_benchmark.h"
+
+static struct task_struct *bm_event_thread;
+
+static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
+
+static u64 bm_total;
+static u64 bm_totalsq;
+static u64 bm_last;
+static u64 bm_max;
+static u64 bm_min;
+static u64 bm_first;
+static u64 bm_cnt;
+static u64 bm_stddev;
+static unsigned int bm_avg;
+static unsigned int bm_std;
+
+/*
+ * This gets called in a loop recording the time it took to write
+ * the tracepoint. What it writes is the time statistics of the last
+ * tracepoint write. As there is nothing to write the first time
+ * it simply writes "START". As the first write is cold cache and
+ * the rest is hot, we save off that time in bm_first and it is
+ * reported as "first", which is shown in the second write to the
+ * tracepoint. The "first" field is writen within the statics from
+ * then on but never changes.
+ */
+static void trace_do_benchmark(void)
+{
+ u64 start;
+ u64 stop;
+ u64 delta;
+ u64 stddev;
+ u64 seed;
+ u64 last_seed;
+ unsigned int avg;
+ unsigned int std = 0;
+
+ /* Only run if the tracepoint is actually active */
+ if (!trace_benchmark_event_enabled())
+ return;
+
+ local_irq_disable();
+ start = trace_clock_local();
+ trace_benchmark_event(bm_str);
+ stop = trace_clock_local();
+ local_irq_enable();
+
+ bm_cnt++;
+
+ delta = stop - start;
+
+ /*
+ * The first read is cold cached, keep it separate from the
+ * other calculations.
+ */
+ if (bm_cnt == 1) {
+ bm_first = delta;
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "first=%llu [COLD CACHED]", bm_first);
+ return;
+ }
+
+ bm_last = delta;
+
+ if (delta > bm_max)
+ bm_max = delta;
+ if (!bm_min || delta < bm_min)
+ bm_min = delta;
+
+ /*
+ * When bm_cnt is greater than UINT_MAX, it breaks the statistics
+ * accounting. Freeze the statistics when that happens.
+ * We should have enough data for the avg and stddev anyway.
+ */
+ if (bm_cnt > UINT_MAX) {
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
+ return;
+ }
+
+ bm_total += delta;
+ bm_totalsq += delta * delta;
+
+
+ if (bm_cnt > 1) {
+ /*
+ * Apply Welford's method to calculate standard deviation:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ */
+ stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
+ do_div(stddev, (u32)bm_cnt);
+ do_div(stddev, (u32)bm_cnt - 1);
+ } else
+ stddev = 0;
+
+ delta = bm_total;
+ do_div(delta, bm_cnt);
+ avg = delta;
+
+ if (stddev > 0) {
+ int i = 0;
+ /*
+ * stddev is the square of standard deviation but
+ * we want the actualy number. Use the average
+ * as our seed to find the std.
+ *
+ * The next try is:
+ * x = (x + N/x) / 2
+ *
+ * Where N is the squared number to find the square
+ * root of.
+ */
+ seed = avg;
+ do {
+ last_seed = seed;
+ seed = stddev;
+ if (!last_seed)
+ break;
+ do_div(seed, last_seed);
+ seed += last_seed;
+ do_div(seed, 2);
+ } while (i++ < 10 && last_seed != seed);
+
+ std = seed;
+ }
+
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
+
+ bm_std = std;
+ bm_avg = avg;
+ bm_stddev = stddev;
+}
+
+static int benchmark_event_kthread(void *arg)
+{
+ /* sleep a bit to make sure the tracepoint gets activated */
+ msleep(100);
+
+ while (!kthread_should_stop()) {
+
+ trace_do_benchmark();
+
+ /*
+ * We don't go to sleep, but let others
+ * run as well.
+ */
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * When the benchmark tracepoint is enabled, it calls this
+ * function and the thread that calls the tracepoint is created.
+ */
+void trace_benchmark_reg(void)
+{
+ bm_event_thread = kthread_run(benchmark_event_kthread,
+ NULL, "event_benchmark");
+ WARN_ON(!bm_event_thread);
+}
+
+/*
+ * When the benchmark tracepoint is disabled, it calls this
+ * function and the thread that calls the tracepoint is deleted
+ * and all the numbers are reset.
+ */
+void trace_benchmark_unreg(void)
+{
+ if (!bm_event_thread)
+ return;
+
+ kthread_stop(bm_event_thread);
+
+ strcpy(bm_str, "START");
+ bm_total = 0;
+ bm_totalsq = 0;
+ bm_last = 0;
+ bm_max = 0;
+ bm_min = 0;
+ bm_cnt = 0;
+ /* These don't need to be reset but reset them anyway */
+ bm_first = 0;
+ bm_std = 0;
+ bm_avg = 0;
+ bm_stddev = 0;
+}
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 00000000000..3c1df1df4e2
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM benchmark
+
+#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BENCHMARK_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_benchmark_reg(void);
+extern void trace_benchmark_unreg(void);
+
+#define BENCHMARK_EVENT_STRLEN 128
+
+TRACE_EVENT_FN(benchmark_event,
+
+ TP_PROTO(const char *str),
+
+ TP_ARGS(str),
+
+ TP_STRUCT__entry(
+ __array( char, str, BENCHMARK_EVENT_STRLEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ ),
+
+ TP_printk("%s", __entry->str),
+
+ trace_benchmark_reg, trace_benchmark_unreg
+);
+
+#endif /* _TRACE_BENCHMARK_H */
+
+#undef TRACE_INCLUDE_FILE
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_benchmark
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8d3538b4ea5..697fb9bac8f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
struct ftrace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
+ struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ if (atomic_inc_return(&data->disabled) != 1)
goto out;
pc = preempt_count();
- buffer = tr->buffer;
+ buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
@@ -76,11 +78,11 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
- if (!filter_check_discard(call, entry, buffer, event))
- ring_buffer_unlock_commit(buffer, event);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
out:
- atomic_dec(&tr->data[cpu]->disabled);
+ atomic_dec(&data->disabled);
local_irq_restore(flags);
}
@@ -199,7 +201,7 @@ __init static int init_branch_tracer(void)
}
return register_tracer(&branch_trace);
}
-device_initcall(init_branch_tracer);
+core_initcall(init_branch_tracer);
#else
static inline
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cb..57b67b1f24d 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
#include <linux/ktime.h>
#include <linux/trace_clock.h>
-#include "trace.h"
-
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
return clock;
}
+EXPORT_SYMBOL_GPL(trace_clock_local);
/*
* trace_clock(): 'between' trace clock. Not completely serialized,
@@ -58,6 +57,17 @@ u64 notrace trace_clock(void)
return local_clock();
}
+/*
+ * trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
+ */
+u64 notrace trace_clock_jiffies(void)
+{
+ return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
+}
/*
* trace_clock_global(): special globally coherent trace clock
@@ -86,7 +96,7 @@ u64 notrace trace_clock_global(void)
local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = cpu_clock(this_cpu);
+ now = sched_clock_cpu(this_cpu);
/*
* If in an NMI context then dont risk lockups and return the
* cpu_clock() time:
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 93365907f21..e2d027ac66a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
/*
* Function trace entry - function address and parent function address:
*/
-FTRACE_ENTRY(function, ftrace_entry,
+FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
@@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,
__field( unsigned long, parent_ip )
),
- F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
+ F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
+
+ FILTER_TRACE_FN,
+
+ perf_ftrace_event_register
);
/* Function call entry */
@@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),
- F_printk("--> %lx (%d)", __entry->func, __entry->depth)
+ F_printk("--> %lx (%d)", __entry->func, __entry->depth),
+
+ FILTER_OTHER
);
/* Function return entry */
@@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
- __entry->depth)
+ __entry->depth),
+
+ FILTER_OTHER
);
/*
@@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu
- )
+ __entry->next_cpu),
+
+ FILTER_OTHER
);
/*
@@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu
- )
+ __entry->next_cpu),
+
+ FILTER_OTHER
);
/*
@@ -156,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
#define FTRACE_STACK_ENTRIES 8
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
FTRACE_ENTRY(kernel_stack, stack_entry,
TRACE_STACK,
@@ -165,11 +181,14 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
__dynamic_array(unsigned long, caller )
),
- F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
- "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+ F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
__entry->caller[0], __entry->caller[1], __entry->caller[2],
__entry->caller[3], __entry->caller[4], __entry->caller[5],
- __entry->caller[6], __entry->caller[7])
+ __entry->caller[6], __entry->caller[7]),
+
+ FILTER_OTHER
);
FTRACE_ENTRY(user_stack, userstack_entry,
@@ -181,11 +200,14 @@ FTRACE_ENTRY(user_stack, userstack_entry,
__array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
- F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
- "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+ F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
__entry->caller[0], __entry->caller[1], __entry->caller[2],
__entry->caller[3], __entry->caller[4], __entry->caller[5],
- __entry->caller[6], __entry->caller[7])
+ __entry->caller[6], __entry->caller[7]),
+
+ FILTER_OTHER
);
/*
@@ -201,8 +223,10 @@ FTRACE_ENTRY(bprint, bprint_entry,
__dynamic_array( u32, buf )
),
- F_printk("%08lx fmt:%p",
- __entry->ip, __entry->fmt)
+ F_printk("%pf: %s",
+ (void *)__entry->ip, __entry->fmt),
+
+ FILTER_OTHER
);
FTRACE_ENTRY(print, print_entry,
@@ -214,8 +238,25 @@ FTRACE_ENTRY(print, print_entry,
__dynamic_array( char, buf )
),
- F_printk("%08lx %s",
- __entry->ip, __entry->buf)
+ F_printk("%pf: %s",
+ (void *)__entry->ip, __entry->buf),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(bputs, bputs_entry,
+
+ TRACE_BPUTS,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( const char *, str )
+ ),
+
+ F_printk("%pf: %s",
+ (void *)__entry->ip, __entry->str),
+
+ FILTER_OTHER
);
FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -234,7 +275,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
F_printk("%lx %lx %lx %d %x %x",
(unsigned long)__entry->phys, __entry->value, __entry->pc,
- __entry->map_id, __entry->opcode, __entry->width)
+ __entry->map_id, __entry->opcode, __entry->width),
+
+ FILTER_OTHER
);
FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -252,7 +295,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
F_printk("%lx %lx %lx %d %x",
(unsigned long)__entry->phys, __entry->virt, __entry->len,
- __entry->map_id, __entry->opcode)
+ __entry->map_id, __entry->opcode),
+
+ FILTER_OTHER
);
@@ -272,6 +317,8 @@ FTRACE_ENTRY(branch, trace_branch,
F_printk("%u:%s:%s (%u)",
__entry->line,
- __entry->func, __entry->file, __entry->correct)
+ __entry->func, __entry->file, __entry->correct),
+
+ FILTER_OTHER
);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 19a359d5e6d..5d12bb407b4 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,33 @@ static int total_ref_count;
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
+ if (tp_event->perf_perm) {
+ int ret = tp_event->perf_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+ }
+
+ /* The ftrace function trace is allowed only for root. */
+ if (ftrace_event_is_function(tp_event)) {
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * We don't allow user space callchains for function trace
+ * event, due to issues with page faults while tracing page
+ * fault handler and its overall trickiness nature.
+ */
+ if (!p_event->attr.exclude_callchain_user)
+ return -EINVAL;
+
+ /*
+ * Same reason to disable user stack dump as for user space
+ * callchains above.
+ */
+ if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
+ return -EINVAL;
+ }
+
/* No tracing, just counting, so no obvious leak */
if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
return 0;
@@ -44,23 +71,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
return 0;
}
-static int perf_trace_event_init(struct ftrace_event_call *tp_event,
- struct perf_event *p_event)
+static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
{
struct hlist_head __percpu *list;
- int ret;
+ int ret = -ENOMEM;
int cpu;
- ret = perf_trace_event_perm(tp_event, p_event);
- if (ret)
- return ret;
-
p_event->tp_event = tp_event;
if (tp_event->perf_refcount++ > 0)
return 0;
- ret = -ENOMEM;
-
list = alloc_percpu(struct hlist_head);
if (!list)
goto fail;
@@ -83,7 +104,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
}
}
- ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
+ ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
if (ret)
goto fail;
@@ -108,10 +129,73 @@ fail:
return ret;
}
+static void perf_trace_event_unreg(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ int i;
+
+ if (--tp_event->perf_refcount > 0)
+ goto out;
+
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
+
+ /*
+ * Ensure our callback won't be called anymore. The buffers
+ * will be freed after that.
+ */
+ tracepoint_synchronize_unregister();
+
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+
+ if (!--total_ref_count) {
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+out:
+ module_put(tp_event->mod);
+}
+
+static int perf_trace_event_open(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
+}
+
+static void perf_trace_event_close(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
+}
+
+static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ int ret;
+
+ ret = perf_trace_event_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+
+ ret = perf_trace_event_reg(tp_event, p_event);
+ if (ret)
+ return ret;
+
+ ret = perf_trace_event_open(p_event);
+ if (ret) {
+ perf_trace_event_unreg(p_event);
+ return ret;
+ }
+
+ return 0;
+}
+
int perf_trace_init(struct perf_event *p_event)
{
struct ftrace_event_call *tp_event;
- int event_id = p_event->attr.config;
+ u64 event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
@@ -130,6 +214,14 @@ int perf_trace_init(struct perf_event *p_event)
return ret;
}
+void perf_trace_destroy(struct perf_event *p_event)
+{
+ mutex_lock(&event_mutex);
+ perf_trace_event_close(p_event);
+ perf_trace_event_unreg(p_event);
+ mutex_unlock(&event_mutex);
+}
+
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -146,47 +238,18 @@ int perf_trace_add(struct perf_event *p_event, int flags)
list = this_cpu_ptr(pcpu_list);
hlist_add_head_rcu(&p_event->hlist_entry, list);
- return 0;
+ return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}
void perf_trace_del(struct perf_event *p_event, int flags)
{
- hlist_del_rcu(&p_event->hlist_entry);
-}
-
-void perf_trace_destroy(struct perf_event *p_event)
-{
struct ftrace_event_call *tp_event = p_event->tp_event;
- int i;
-
- mutex_lock(&event_mutex);
- if (--tp_event->perf_refcount > 0)
- goto out;
-
- tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
-
- /*
- * Ensure our callback won't be called anymore. The buffers
- * will be freed after that.
- */
- tracepoint_synchronize_unregister();
-
- free_percpu(tp_event->perf_events);
- tp_event->perf_events = NULL;
-
- if (!--total_ref_count) {
- for (i = 0; i < PERF_NR_CONTEXTS; i++) {
- free_percpu(perf_trace_buf[i]);
- perf_trace_buf[i] = NULL;
- }
- }
-out:
- module_put(tp_event->mod);
- mutex_unlock(&event_mutex);
+ hlist_del_rcu(&p_event->hlist_entry);
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
-__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs *regs, int *rctxp)
+void *perf_trace_buf_prepare(int size, unsigned short type,
+ struct pt_regs *regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -195,6 +258,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "perf buffer not large enough"))
+ return NULL;
+
pc = preempt_count();
*rctxp = perf_swevent_get_recursion_context();
@@ -214,3 +281,90 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void
+perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *pt_regs)
+{
+ struct ftrace_entry *entry;
+ struct hlist_head *head;
+ struct pt_regs regs;
+ int rctx;
+
+ head = this_cpu_ptr(event_function.perf_events);
+ if (hlist_empty(head))
+ return;
+
+#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
+ sizeof(u64)) - sizeof(u32))
+
+ BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
+
+ perf_fetch_caller_regs(&regs);
+
+ entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+ if (!entry)
+ return;
+
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+ perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+ 1, &regs, head, NULL);
+
+#undef ENTRY_SIZE
+}
+
+static int perf_ftrace_function_register(struct perf_event *event)
+{
+ struct ftrace_ops *ops = &event->ftrace_ops;
+
+ ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->func = perf_ftrace_function_call;
+ return register_ftrace_function(ops);
+}
+
+static int perf_ftrace_function_unregister(struct perf_event *event)
+{
+ struct ftrace_ops *ops = &event->ftrace_ops;
+ int ret = unregister_ftrace_function(ops);
+ ftrace_free_filter(ops);
+ return ret;
+}
+
+static void perf_ftrace_function_enable(struct perf_event *event)
+{
+ ftrace_function_local_enable(&event->ftrace_ops);
+}
+
+static void perf_ftrace_function_disable(struct perf_event *event)
+{
+ ftrace_function_local_disable(&event->ftrace_ops);
+}
+
+int perf_ftrace_event_register(struct ftrace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ case TRACE_REG_UNREGISTER:
+ break;
+ case TRACE_REG_PERF_REGISTER:
+ case TRACE_REG_PERF_UNREGISTER:
+ return 0;
+ case TRACE_REG_PERF_OPEN:
+ return perf_ftrace_function_register(data);
+ case TRACE_REG_PERF_CLOSE:
+ return perf_ftrace_function_unregister(data);
+ case TRACE_REG_PERF_ADD:
+ perf_ftrace_function_enable(data);
+ return 0;
+ case TRACE_REG_PERF_DEL:
+ perf_ftrace_function_disable(data);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c212a7f934e..2de53628689 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,16 +27,45 @@
DEFINE_MUTEX(event_mutex);
-DEFINE_MUTEX(event_storage_mutex);
-EXPORT_SYMBOL_GPL(event_storage_mutex);
+LIST_HEAD(ftrace_events);
+static LIST_HEAD(ftrace_common_fields);
-char event_storage[EVENT_STORAGE_SIZE];
-EXPORT_SYMBOL_GPL(event_storage);
+#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
-LIST_HEAD(ftrace_events);
-LIST_HEAD(ftrace_common_fields);
+static struct kmem_cache *field_cachep;
+static struct kmem_cache *file_cachep;
+
+#define SYSTEM_FL_FREE_NAME (1 << 31)
+
+static inline int system_refcount(struct event_subsystem *system)
+{
+ return system->ref_count & ~SYSTEM_FL_FREE_NAME;
+}
+
+static int system_refcount_inc(struct event_subsystem *system)
+{
+ return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
+}
+
+static int system_refcount_dec(struct event_subsystem *system)
+{
+ return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
+}
-struct list_head *
+/* Double loops, do not use break, only goto's work */
+#define do_for_each_event_file(tr, file) \
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
+ list_for_each_entry(file, &tr->events, list)
+
+#define do_for_each_event_file_safe(tr, file) \
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
+ struct ftrace_event_file *___n; \
+ list_for_each_entry_safe(file, ___n, &tr->events, list)
+
+#define while_for_each_event_file() \
+ }
+
+static struct list_head *
trace_get_fields(struct ftrace_event_call *event_call)
{
if (!event_call->class->get_fields)
@@ -44,23 +73,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
return event_call->class->get_fields(event_call);
}
+static struct ftrace_event_field *
+__find_event_field(struct list_head *head, char *name)
+{
+ struct ftrace_event_field *field;
+
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(field->name, name))
+ return field;
+ }
+
+ return NULL;
+}
+
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ field = __find_event_field(&ftrace_common_fields, name);
+ if (field)
+ return field;
+
+ head = trace_get_fields(call);
+ return __find_event_field(head, name);
+}
+
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
int is_signed, int filter_type)
{
struct ftrace_event_field *field;
- field = kzalloc(sizeof(*field), GFP_KERNEL);
+ field = kmem_cache_alloc(field_cachep, GFP_TRACE);
if (!field)
- goto err;
-
- field->name = kstrdup(name, GFP_KERNEL);
- if (!field->name)
- goto err;
+ return -ENOMEM;
- field->type = kstrdup(type, GFP_KERNEL);
- if (!field->type)
- goto err;
+ field->name = name;
+ field->type = type;
if (filter_type == FILTER_OTHER)
field->filter_type = filter_assign_type(type);
@@ -74,13 +125,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
list_add(&field->link, head);
return 0;
-
-err:
- if (field)
- kfree(field->name);
- kfree(field);
-
- return -ENOMEM;
}
int trace_define_field(struct ftrace_event_call *call, const char *type,
@@ -116,12 +160,11 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
- __common_field(int, padding);
return ret;
}
-void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
struct list_head *head;
@@ -129,9 +172,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
head = trace_get_fields(call);
list_for_each_entry_safe(field, next, head, link) {
list_del(&field->link);
- kfree(field->type);
- kfree(field->name);
- kfree(field);
+ kmem_cache_free(field_cachep, field);
}
}
@@ -147,29 +188,68 @@ int trace_event_raw_init(struct ftrace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
-int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
+void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
+ struct ftrace_event_file *ftrace_file,
+ unsigned long len)
{
+ struct ftrace_event_call *event_call = ftrace_file->event_call;
+
+ local_save_flags(fbuffer->flags);
+ fbuffer->pc = preempt_count();
+ fbuffer->ftrace_file = ftrace_file;
+
+ fbuffer->event =
+ trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+ event_call->event.type, len,
+ fbuffer->flags, fbuffer->pc);
+ if (!fbuffer->event)
+ return NULL;
+
+ fbuffer->entry = ring_buffer_event_data(fbuffer->event);
+ return fbuffer->entry;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+
+void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+{
+ event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+ fbuffer->event, fbuffer->entry,
+ fbuffer->flags, fbuffer->pc);
+}
+EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+
+int ftrace_event_reg(struct ftrace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ struct ftrace_event_file *file = data;
+
+ WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
case TRACE_REG_REGISTER:
- return tracepoint_probe_register(call->name,
+ return tracepoint_probe_register(call->tp,
call->class->probe,
- call);
+ file);
case TRACE_REG_UNREGISTER:
- tracepoint_probe_unregister(call->name,
+ tracepoint_probe_unregister(call->tp,
call->class->probe,
- call);
+ file);
return 0;
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return tracepoint_probe_register(call->name,
+ return tracepoint_probe_register(call->tp,
call->class->perf_probe,
call);
case TRACE_REG_PERF_UNREGISTER:
- tracepoint_probe_unregister(call->name,
+ tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
call);
return 0;
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
#endif
}
return 0;
@@ -178,54 +258,108 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
void trace_event_enable_cmd_record(bool enable)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
- if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+ do_for_each_event_file(tr, file) {
+
+ if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
continue;
if (enable) {
tracing_start_cmdline_record();
- call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
} else {
tracing_stop_cmdline_record();
- call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
}
- }
+ } while_for_each_event_file();
mutex_unlock(&event_mutex);
}
-static int ftrace_event_enable_disable(struct ftrace_event_call *call,
- int enable)
+static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable)
{
+ struct ftrace_event_call *call = file->event_call;
int ret = 0;
+ int disable;
switch (enable) {
case 0:
- if (call->flags & TRACE_EVENT_FL_ENABLED) {
- call->flags &= ~TRACE_EVENT_FL_ENABLED;
- if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+ /*
+ * When soft_disable is set and enable is cleared, the sm_ref
+ * reference counter is decremented. If it reaches 0, we want
+ * to clear the SOFT_DISABLED flag but leave the event in the
+ * state that it was. That is, if the event was enabled and
+ * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
+ * is set we do not want the event to be enabled before we
+ * clear the bit.
+ *
+ * When soft_disable is not set but the SOFT_MODE flag is,
+ * we do nothing. Do not disable the tracepoint, otherwise
+ * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+ */
+ if (soft_disable) {
+ if (atomic_dec_return(&file->sm_ref) > 0)
+ break;
+ disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
+ clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ } else
+ disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+
+ if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
+ clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+ if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
tracing_stop_cmdline_record();
- call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER);
+ call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
+ /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
+ if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ else
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
break;
case 1:
- if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
+ /*
+ * When soft_disable is set and enable is set, we want to
+ * register the tracepoint for the event, but leave the event
+ * as is. That means, if the event was already enabled, we do
+ * nothing (but set SOFT_MODE). If the event is disabled, we
+ * set SOFT_DISABLED before enabling the event tracepoint, so
+ * it still seems to be disabled.
+ */
+ if (!soft_disable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ else {
+ if (atomic_inc_return(&file->sm_ref) > 1)
+ break;
+ set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ }
+
+ if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+
+ /* Keep the event disabled, when going to SOFT_MODE. */
+ if (soft_disable)
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+
if (trace_flags & TRACE_ITER_RECORD_CMD) {
tracing_start_cmdline_record();
- call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
}
- ret = call->class->reg(call, TRACE_REG_REGISTER);
+ ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
- "%s\n", call->name);
+ "%s\n", ftrace_event_name(call));
break;
}
- call->flags |= TRACE_EVENT_FL_ENABLED;
+ set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+
+ /* WAS_ENABLED gets set but never cleared. */
+ call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
}
break;
}
@@ -233,13 +367,25 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
return ret;
}
-static void ftrace_clear_events(void)
+int trace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable)
{
- struct ftrace_event_call *call;
+ return __ftrace_event_enable_disable(file, enable, soft_disable);
+}
+
+static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+ int enable)
+{
+ return __ftrace_event_enable_disable(file, enable, 0);
+}
+
+static void ftrace_clear_events(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
- ftrace_event_enable_disable(call, 0);
+ list_for_each_entry(file, &tr->events, list) {
+ ftrace_event_enable_disable(file, 0);
}
mutex_unlock(&event_mutex);
}
@@ -248,67 +394,141 @@ static void __put_system(struct event_subsystem *system)
{
struct event_filter *filter = system->filter;
- WARN_ON_ONCE(system->ref_count == 0);
- if (--system->ref_count)
+ WARN_ON_ONCE(system_refcount(system) == 0);
+ if (system_refcount_dec(system))
return;
+ list_del(&system->list);
+
if (filter) {
kfree(filter->filter_string);
kfree(filter);
}
- kfree(system->name);
+ if (system->ref_count & SYSTEM_FL_FREE_NAME)
+ kfree(system->name);
kfree(system);
}
static void __get_system(struct event_subsystem *system)
{
- WARN_ON_ONCE(system->ref_count == 0);
- system->ref_count++;
+ WARN_ON_ONCE(system_refcount(system) == 0);
+ system_refcount_inc(system);
+}
+
+static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+{
+ WARN_ON_ONCE(dir->ref_count == 0);
+ dir->ref_count++;
+ __get_system(dir->subsystem);
}
-static void put_system(struct event_subsystem *system)
+static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+{
+ WARN_ON_ONCE(dir->ref_count == 0);
+ /* If the subsystem is about to be freed, the dir must be too */
+ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
+
+ __put_system(dir->subsystem);
+ if (!--dir->ref_count)
+ kfree(dir);
+}
+
+static void put_system(struct ftrace_subsystem_dir *dir)
{
mutex_lock(&event_mutex);
- __put_system(system);
+ __put_system_dir(dir);
mutex_unlock(&event_mutex);
}
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+{
+ if (!dir)
+ return;
+
+ if (!--dir->nr_events) {
+ debugfs_remove_recursive(dir->entry);
+ list_del(&dir->list);
+ __put_system_dir(dir);
+ }
+}
+
+static void remove_event_file_dir(struct ftrace_event_file *file)
+{
+ struct dentry *dir = file->dir;
+ struct dentry *child;
+
+ if (dir) {
+ spin_lock(&dir->d_lock); /* probably unneeded */
+ list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ if (child->d_inode) /* probably unneeded */
+ child->d_inode->i_private = NULL;
+ }
+ spin_unlock(&dir->d_lock);
+
+ debugfs_remove_recursive(dir);
+ }
+
+ list_del(&file->list);
+ remove_subsystem(file->system);
+ free_event_filter(file->filter);
+ kmem_cache_free(file_cachep, file);
+}
+
/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
-static int __ftrace_set_clr_event(const char *match, const char *sub,
- const char *event, int set)
+static int
+__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
{
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
+ const char *name;
int ret = -EINVAL;
- mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
+ name = ftrace_event_name(call);
- if (!call->name || !call->class || !call->class->reg)
+ if (!name || !call->class || !call->class->reg)
+ continue;
+
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
continue;
if (match &&
- strcmp(match, call->name) != 0 &&
+ strcmp(match, name) != 0 &&
strcmp(match, call->class->system) != 0)
continue;
if (sub && strcmp(sub, call->class->system) != 0)
continue;
- if (event && strcmp(event, call->name) != 0)
+ if (event && strcmp(event, name) != 0)
continue;
- ftrace_event_enable_disable(call, set);
+ ftrace_event_enable_disable(file, set);
ret = 0;
}
+
+ return ret;
+}
+
+static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
mutex_unlock(&event_mutex);
return ret;
}
-static int ftrace_set_clr_event(char *buf, int set)
+static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
@@ -336,7 +556,7 @@ static int ftrace_set_clr_event(char *buf, int set)
event = NULL;
}
- return __ftrace_set_clr_event(match, sub, event, set);
+ return __ftrace_set_clr_event(tr, match, sub, event, set);
}
/**
@@ -353,7 +573,12 @@ static int ftrace_set_clr_event(char *buf, int set)
*/
int trace_set_clr_event(const char *system, const char *event, int set)
{
- return __ftrace_set_clr_event(NULL, system, event, set);
+ struct trace_array *tr = top_trace_array();
+
+ if (!tr)
+ return -ENODEV;
+
+ return __ftrace_set_clr_event(tr, NULL, system, event, set);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -365,6 +590,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
+ struct seq_file *m = file->private_data;
+ struct trace_array *tr = m->private;
ssize_t read, ret;
if (!cnt)
@@ -387,7 +614,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
parser.buffer[parser.idx] = 0;
- ret = ftrace_set_clr_event(parser.buffer + !set, set);
+ ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
if (ret)
goto out_put;
}
@@ -403,17 +630,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = v;
+ struct ftrace_event_file *file = v;
+ struct ftrace_event_call *call;
+ struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(call, &ftrace_events, list) {
+ list_for_each_entry_continue(file, &tr->events, list) {
+ call = file->event_call;
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
if (call->class && call->class->reg)
- return call;
+ return file;
}
return NULL;
@@ -421,30 +651,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+ file = list_entry(&tr->events, struct ftrace_event_file, list);
for (l = 0; l <= *pos; ) {
- call = t_next(m, call, &l);
- if (!call)
+ file = t_next(m, file, &l);
+ if (!file)
break;
}
- return call;
+ return file;
}
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = v;
+ struct ftrace_event_file *file = v;
+ struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(call, &ftrace_events, list) {
- if (call->flags & TRACE_EVENT_FL_ENABLED)
- return call;
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return file;
}
return NULL;
@@ -452,27 +684,29 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+ file = list_entry(&tr->events, struct ftrace_event_file, list);
for (l = 0; l <= *pos; ) {
- call = s_next(m, call, &l);
- if (!call)
+ file = s_next(m, file, &l);
+ if (!file)
break;
}
- return call;
+ return file;
}
static int t_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = v;
+ struct ftrace_event_file *file = v;
+ struct ftrace_event_call *call = file->event_call;
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->class->system);
- seq_printf(m, "%s\n", call->name);
+ seq_printf(m, "%s\n", ftrace_event_name(call));
return 0;
}
@@ -482,39 +716,41 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
-static int
-ftrace_event_seq_open(struct inode *inode, struct file *file)
-{
- const struct seq_operations *seq_ops;
-
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC))
- ftrace_clear_events();
-
- seq_ops = inode->i_private;
- return seq_open(file, seq_ops);
-}
-
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
- char *buf;
+ struct ftrace_event_file *file;
+ unsigned long flags;
+ char buf[4] = "0";
- if (call->flags & TRACE_EVENT_FL_ENABLED)
- buf = "1\n";
- else
- buf = "0\n";
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (likely(file))
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+ if (!file)
+ return -ENODEV;
+
+ if (flags & FTRACE_EVENT_FL_ENABLED &&
+ !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ strcpy(buf, "1");
+
+ if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
+ flags & FTRACE_EVENT_FL_SOFT_MODE)
+ strcat(buf, "*");
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+ strcat(buf, "\n");
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
}
static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_file *file;
unsigned long val;
int ret;
@@ -529,8 +765,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
switch (val) {
case 0:
case 1:
+ ret = -ENODEV;
mutex_lock(&event_mutex);
- ret = ftrace_event_enable_disable(call, val);
+ file = event_file_data(filp);
+ if (likely(file))
+ ret = ftrace_event_enable_disable(file, val);
mutex_unlock(&event_mutex);
break;
@@ -548,15 +787,19 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = dir->tr;
char buf[2];
int set = 0;
int ret;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->class || !call->class->reg)
+ list_for_each_entry(file, &tr->events, list) {
+ call = file->event_call;
+ if (!ftrace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -567,7 +810,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
+ set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -589,7 +832,8 @@ static ssize_t
system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
const char *name = NULL;
unsigned long val;
ssize_t ret;
@@ -612,7 +856,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (system)
name = system->name;
- ret = __ftrace_set_clr_event(NULL, name, NULL, val);
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
if (ret)
goto out;
@@ -632,71 +876,45 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
- struct ftrace_event_field *field;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
+ struct list_head *node = v;
(*pos)++;
switch ((unsigned long)v) {
case FORMAT_HEADER:
- if (unlikely(list_empty(common_head)))
- return NULL;
-
- field = list_entry(common_head->prev,
- struct ftrace_event_field, link);
- return field;
+ node = common_head;
+ break;
case FORMAT_FIELD_SEPERATOR:
- if (unlikely(list_empty(head)))
- return NULL;
-
- field = list_entry(head->prev, struct ftrace_event_field, link);
- return field;
+ node = head;
+ break;
case FORMAT_PRINTFMT:
/* all done */
return NULL;
}
- field = v;
- if (field->link.prev == common_head)
+ node = node->prev;
+ if (node == common_head)
return (void *)FORMAT_FIELD_SEPERATOR;
- else if (field->link.prev == head)
+ else if (node == head)
return (void *)FORMAT_PRINTFMT;
-
- field = list_entry(field->link.prev, struct ftrace_event_field, link);
-
- return field;
-}
-
-static void *f_start(struct seq_file *m, loff_t *pos)
-{
- loff_t l = 0;
- void *p;
-
- /* Start by showing the header */
- if (!*pos)
- return (void *)FORMAT_HEADER;
-
- p = (void *)FORMAT_HEADER;
- do {
- p = f_next(m, p, &l);
- } while (p && l < *pos);
-
- return p;
+ else
+ return node;
}
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
const char *array_descriptor;
switch ((unsigned long)v) {
case FORMAT_HEADER:
- seq_printf(m, "name: %s\n", call->name);
+ seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_printf(m, "format:\n");
return 0;
@@ -711,8 +929,7 @@ static int f_show(struct seq_file *m, void *v)
return 0;
}
- field = v;
-
+ field = list_entry(v, struct ftrace_event_field, link);
/*
* Smartly shows the array type(except dynamic array).
* Normal:
@@ -739,8 +956,25 @@ static int f_show(struct seq_file *m, void *v)
return 0;
}
+static void *f_start(struct seq_file *m, loff_t *pos)
+{
+ void *p = (void *)FORMAT_HEADER;
+ loff_t l = 0;
+
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ if (!event_file_data(m->private))
+ return ERR_PTR(-ENODEV);
+
+ while (l < *pos && p)
+ p = f_next(m, p, &l);
+
+ return p;
+}
+
static void f_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&event_mutex);
}
static const struct seq_operations trace_format_seq_ops = {
@@ -752,7 +986,6 @@ static const struct seq_operations trace_format_seq_ops = {
static int trace_format_open(struct inode *inode, struct file *file)
{
- struct ftrace_event_call *call = inode->i_private;
struct seq_file *m;
int ret;
@@ -761,7 +994,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = call;
+ m->private = file;
return 0;
}
@@ -769,45 +1002,47 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
- struct trace_seq *s;
- int r;
+ int id = (long)event_file_data(filp);
+ char buf[32];
+ int len;
if (*ppos)
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
+ if (unlikely(!id))
+ return -ENODEV;
- trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->event.type);
+ len = sprintf(buf, "%d\n", id);
- r = simple_read_from_buffer(ubuf, cnt, ppos,
- s->buffer, s->len);
- kfree(s);
- return r;
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_file *file;
struct trace_seq *s;
- int r;
+ int r = -ENODEV;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
+
if (!s)
return -ENOMEM;
trace_seq_init(s);
- print_event_filter(call, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (file)
+ print_event_filter(file, s);
+ mutex_unlock(&event_mutex);
+
+ if (file)
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
@@ -818,9 +1053,9 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_file *file;
char *buf;
- int err;
+ int err = -ENODEV;
if (cnt >= PAGE_SIZE)
return -EINVAL;
@@ -835,7 +1070,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_event_filter(call, buf);
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (file)
+ err = apply_event_filter(file, buf);
+ mutex_unlock(&event_mutex);
+
free_page((unsigned long) buf);
if (err < 0)
return err;
@@ -850,43 +1090,101 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
struct event_subsystem *system = NULL;
+ struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+ struct trace_array *tr;
int ret;
- if (!inode->i_private)
- goto skip_search;
+ if (tracing_is_disabled())
+ return -ENODEV;
/* Make sure the system still exists */
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
- list_for_each_entry(system, &event_subsystems, list) {
- if (system == inode->i_private) {
- /* Don't open systems with no events */
- if (!system->nr_events) {
- system = NULL;
- break;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+ if (dir == inode->i_private) {
+ /* Don't open systems with no events */
+ if (dir->nr_events) {
+ __get_system_dir(dir);
+ system = dir->subsystem;
+ }
+ goto exit_loop;
}
- __get_system(system);
- break;
}
}
+ exit_loop:
mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+
+ if (!system)
+ return -ENODEV;
- if (system != inode->i_private)
+ /* Some versions of gcc think dir can be uninitialized here */
+ WARN_ON(!dir);
+
+ /* Still need to increment the ref count of the system */
+ if (trace_array_get(tr) < 0) {
+ put_system(dir);
return -ENODEV;
+ }
- skip_search:
ret = tracing_open_generic(inode, filp);
- if (ret < 0 && system)
- put_system(system);
+ if (ret < 0) {
+ trace_array_put(tr);
+ put_system(dir);
+ }
return ret;
}
+static int system_tr_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_subsystem_dir *dir;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_is_disabled())
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ /* Make a temporary dir that has no system but points to tr */
+ dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+ if (!dir) {
+ trace_array_put(tr);
+ return -ENOMEM;
+ }
+
+ dir->tr = tr;
+
+ ret = tracing_open_generic(inode, filp);
+ if (ret < 0) {
+ trace_array_put(tr);
+ kfree(dir);
+ return ret;
+ }
+
+ filp->private_data = dir;
+
+ return 0;
+}
+
static int subsystem_release(struct inode *inode, struct file *file)
{
- struct event_subsystem *system = inode->i_private;
+ struct ftrace_subsystem_dir *dir = file->private_data;
- if (system)
- put_system(system);
+ trace_array_put(dir->tr);
+
+ /*
+ * If dir->subsystem is NULL, then this is a temporary
+ * descriptor that was made for a trace_array to enable
+ * all subsystems.
+ */
+ if (dir->subsystem)
+ put_system(dir);
+ else
+ kfree(dir);
return 0;
}
@@ -895,7 +1193,8 @@ static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
struct trace_seq *s;
int r;
@@ -920,7 +1219,7 @@ static ssize_t
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
char *buf;
int err;
@@ -937,7 +1236,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_subsystem_event_filter(system, buf);
+ err = apply_subsystem_event_filter(dir, buf);
free_page((unsigned long) buf);
if (err < 0)
return err;
@@ -971,6 +1270,10 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return r;
}
+static int ftrace_event_avail_open(struct inode *inode, struct file *file);
+static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_release(struct inode *inode, struct file *file);
+
static const struct seq_operations show_event_seq_ops = {
.start = t_start,
.next = t_next,
@@ -986,18 +1289,18 @@ static const struct seq_operations show_set_event_seq_ops = {
};
static const struct file_operations ftrace_avail_fops = {
- .open = ftrace_event_seq_open,
+ .open = ftrace_event_avail_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct file_operations ftrace_set_event_fops = {
- .open = ftrace_event_seq_open,
+ .open = ftrace_event_set_open,
.read = seq_read,
.write = ftrace_event_write,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = ftrace_event_release,
};
static const struct file_operations ftrace_enable_fops = {
@@ -1015,7 +1318,6 @@ static const struct file_operations ftrace_event_format_fops = {
};
static const struct file_operations ftrace_event_id_fops = {
- .open = tracing_open_generic,
.read = event_id_read,
.llseek = default_llseek,
};
@@ -1043,129 +1345,226 @@ static const struct file_operations ftrace_system_enable_fops = {
.release = subsystem_release,
};
+static const struct file_operations ftrace_tr_enable_fops = {
+ .open = system_tr_open,
+ .read = system_enable_read,
+ .write = system_enable_write,
+ .llseek = default_llseek,
+ .release = subsystem_release,
+};
+
static const struct file_operations ftrace_show_header_fops = {
.open = tracing_open_generic,
.read = show_header,
.llseek = default_llseek,
};
-static struct dentry *event_trace_events_dir(void)
+static int
+ftrace_event_open(struct inode *inode, struct file *file,
+ const struct seq_operations *seq_ops)
{
- static struct dentry *d_tracer;
- static struct dentry *d_events;
+ struct seq_file *m;
+ int ret;
- if (d_events)
- return d_events;
+ ret = seq_open(file, seq_ops);
+ if (ret < 0)
+ return ret;
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = inode->i_private;
- d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ return ret;
+}
+
+static int ftrace_event_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
+}
+
+static int
+ftrace_event_avail_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_event_seq_ops;
+
+ return ftrace_event_open(inode, file, seq_ops);
+}
+
+static int
+ftrace_event_set_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_set_event_seq_ops;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_clear_events(tr);
+
+ ret = ftrace_event_open(inode, file, seq_ops);
+ if (ret < 0)
+ trace_array_put(tr);
+ return ret;
+}
+
+static struct event_subsystem *
+create_new_subsystem(const char *name)
+{
+ struct event_subsystem *system;
+
+ /* need to create new entry */
+ system = kmalloc(sizeof(*system), GFP_KERNEL);
+ if (!system)
return NULL;
- d_events = debugfs_create_dir("events", d_tracer);
- if (!d_events)
- pr_warning("Could not create debugfs "
- "'events' directory\n");
+ system->ref_count = 1;
+
+ /* Only allocate if dynamic (kprobes and modules) */
+ if (!core_kernel_data((unsigned long)name)) {
+ system->ref_count |= SYSTEM_FL_FREE_NAME;
+ system->name = kstrdup(name, GFP_KERNEL);
+ if (!system->name)
+ goto out_free;
+ } else
+ system->name = name;
+
+ system->filter = NULL;
+
+ system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+ if (!system->filter)
+ goto out_free;
+
+ list_add(&system->list, &event_subsystems);
- return d_events;
+ return system;
+
+ out_free:
+ if (system->ref_count & SYSTEM_FL_FREE_NAME)
+ kfree(system->name);
+ kfree(system);
+ return NULL;
}
static struct dentry *
-event_subsystem_dir(const char *name, struct dentry *d_events)
+event_subsystem_dir(struct trace_array *tr, const char *name,
+ struct ftrace_event_file *file, struct dentry *parent)
{
+ struct ftrace_subsystem_dir *dir;
struct event_subsystem *system;
struct dentry *entry;
/* First see if we did not already create this dir */
- list_for_each_entry(system, &event_subsystems, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+ system = dir->subsystem;
if (strcmp(system->name, name) == 0) {
- system->nr_events++;
- return system->entry;
+ dir->nr_events++;
+ file->system = dir;
+ return dir->entry;
}
}
- /* need to create new entry */
- system = kmalloc(sizeof(*system), GFP_KERNEL);
- if (!system) {
- pr_warning("No memory to create event subsystem %s\n",
- name);
- return d_events;
+ /* Now see if the system itself exists. */
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0)
+ break;
}
+ /* Reset system variable when not found */
+ if (&system->list == &event_subsystems)
+ system = NULL;
- system->entry = debugfs_create_dir(name, d_events);
- if (!system->entry) {
- pr_warning("Could not create event subsystem %s\n",
- name);
- kfree(system);
- return d_events;
- }
+ dir = kmalloc(sizeof(*dir), GFP_KERNEL);
+ if (!dir)
+ goto out_fail;
- system->nr_events = 1;
- system->ref_count = 1;
- system->name = kstrdup(name, GFP_KERNEL);
- if (!system->name) {
- debugfs_remove(system->entry);
- kfree(system);
- return d_events;
+ if (!system) {
+ system = create_new_subsystem(name);
+ if (!system)
+ goto out_free;
+ } else
+ __get_system(system);
+
+ dir->entry = debugfs_create_dir(name, parent);
+ if (!dir->entry) {
+ pr_warning("Failed to create system directory %s\n", name);
+ __put_system(system);
+ goto out_free;
}
- list_add(&system->list, &event_subsystems);
-
- system->filter = NULL;
+ dir->tr = tr;
+ dir->ref_count = 1;
+ dir->nr_events = 1;
+ dir->subsystem = system;
+ file->system = dir;
- system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
- if (!system->filter) {
- pr_warning("Could not allocate filter for subsystem "
- "'%s'\n", name);
- return system->entry;
- }
-
- entry = debugfs_create_file("filter", 0644, system->entry, system,
+ entry = debugfs_create_file("filter", 0644, dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs "
- "'%s/filter' entry\n", name);
+ pr_warning("Could not create debugfs '%s/filter' entry\n", name);
}
- trace_create_file("enable", 0644, system->entry, system,
+ trace_create_file("enable", 0644, dir->entry, dir,
&ftrace_system_enable_fops);
- return system->entry;
+ list_add(&dir->list, &tr->systems);
+
+ return dir->entry;
+
+ out_free:
+ kfree(dir);
+ out_fail:
+ /* Only print this message if failed on memory allocation */
+ if (!dir || !system)
+ pr_warning("No memory to create event subsystem %s\n",
+ name);
+ return NULL;
}
static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
- const struct file_operations *id,
- const struct file_operations *enable,
- const struct file_operations *filter,
- const struct file_operations *format)
+event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
{
+ struct ftrace_event_call *call = file->event_call;
+ struct trace_array *tr = file->tr;
struct list_head *head;
+ struct dentry *d_events;
+ const char *name;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
- d_events = event_subsystem_dir(call->class->system, d_events);
-
- call->dir = debugfs_create_dir(call->name, d_events);
- if (!call->dir) {
- pr_warning("Could not create debugfs "
- "'%s' directory\n", call->name);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
+ d_events = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!d_events)
+ return -ENOMEM;
+ } else
+ d_events = parent;
+
+ name = ftrace_event_name(call);
+ file->dir = debugfs_create_dir(name, d_events);
+ if (!file->dir) {
+ pr_warning("Could not create debugfs '%s' directory\n",
+ name);
return -1;
}
- if (call->class->reg)
- trace_create_file("enable", 0644, call->dir, call,
- enable);
+ if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ trace_create_file("enable", 0644, file->dir, file,
+ &ftrace_enable_fops);
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, call->dir, call,
- id);
+ trace_create_file("id", 0444, file->dir,
+ (void *)(long)call->event.type,
+ &ftrace_event_id_fops);
#endif
/*
@@ -1177,222 +1576,286 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
ret = call->class->define_fields(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
- return ret;
+ " events/%s\n", name);
+ return -1;
}
}
- trace_create_file("filter", 0644, call->dir, call,
- filter);
+ trace_create_file("filter", 0644, file->dir, file,
+ &ftrace_event_filter_fops);
- trace_create_file("format", 0444, call->dir, call,
- format);
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
+
+ trace_create_file("format", 0444, file->dir, call,
+ &ftrace_event_format_fops);
return 0;
}
-static int
-__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
- const struct file_operations *id,
- const struct file_operations *enable,
- const struct file_operations *filter,
- const struct file_operations *format)
+static void remove_event_from_tracers(struct ftrace_event_call *call)
{
- struct dentry *d_events;
- int ret;
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
+
+ do_for_each_event_file_safe(tr, file) {
+ if (file->event_call != call)
+ continue;
+
+ remove_event_file_dir(file);
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+}
+
+static void event_remove(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ ftrace_event_enable_disable(file, 0);
+ destroy_preds(file);
+ /*
+ * The do_for_each_event_file() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
+ remove_event_from_tracers(call);
+ list_del(&call->list);
+}
+
+static int event_init(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ const char *name;
- /* The linker may leave blanks */
- if (!call->name)
+ name = ftrace_event_name(call);
+ if (WARN_ON(!name))
return -EINVAL;
if (call->class->raw_init) {
ret = call->class->raw_init(call);
- if (ret < 0) {
- if (ret != -ENOSYS)
- pr_warning("Could not initialize trace events/%s\n",
- call->name);
- return ret;
- }
+ if (ret < 0 && ret != -ENOSYS)
+ pr_warn("Could not initialize trace events/%s\n",
+ name);
}
- d_events = event_trace_events_dir();
- if (!d_events)
- return -ENOENT;
+ return ret;
+}
+
+static int
+__register_event(struct ftrace_event_call *call, struct module *mod)
+{
+ int ret;
+
+ ret = event_init(call);
+ if (ret < 0)
+ return ret;
- ret = event_create_dir(call, d_events, id, enable, filter, format);
- if (!ret)
- list_add(&call->list, &ftrace_events);
+ list_add(&call->list, &ftrace_events);
call->mod = mod;
- return ret;
+ return 0;
}
-/* Add an additional event_call dynamically */
-int trace_add_event_call(struct ftrace_event_call *call)
+static struct ftrace_event_file *
+trace_create_new_event(struct ftrace_event_call *call,
+ struct trace_array *tr)
{
- int ret;
- mutex_lock(&event_mutex);
- ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
- &ftrace_enable_fops,
- &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
- mutex_unlock(&event_mutex);
- return ret;
+ struct ftrace_event_file *file;
+
+ file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+ if (!file)
+ return NULL;
+
+ file->event_call = call;
+ file->tr = tr;
+ atomic_set(&file->sm_ref, 0);
+ atomic_set(&file->tm_ref, 0);
+ INIT_LIST_HEAD(&file->triggers);
+ list_add(&file->list, &tr->events);
+
+ return file;
}
-static void remove_subsystem_dir(const char *name)
+/* Add an event to a trace directory */
+static int
+__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
{
- struct event_subsystem *system;
+ struct ftrace_event_file *file;
- if (strcmp(name, TRACE_SYSTEM) == 0)
- return;
+ file = trace_create_new_event(call, tr);
+ if (!file)
+ return -ENOMEM;
- list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0) {
- if (!--system->nr_events) {
- debugfs_remove_recursive(system->entry);
- list_del(&system->list);
- __put_system(system);
- }
- break;
- }
- }
+ return event_create_dir(tr->event_dir, file);
}
/*
- * Must be called under locking both of event_mutex and trace_event_mutex.
+ * Just create a decriptor for early init. A descriptor is required
+ * for enabling events at boot. We want to enable events before
+ * the filesystem is initialized.
*/
-static void __trace_remove_event_call(struct ftrace_event_call *call)
+static __init int
+__trace_early_add_new_event(struct ftrace_event_call *call,
+ struct trace_array *tr)
{
- ftrace_event_enable_disable(call, 0);
- if (call->event.funcs)
- __unregister_ftrace_event(&call->event);
- debugfs_remove_recursive(call->dir);
- list_del(&call->list);
- trace_destroy_fields(call);
- destroy_preds(call);
- remove_subsystem_dir(call->class->system);
+ struct ftrace_event_file *file;
+
+ file = trace_create_new_event(call, tr);
+ if (!file)
+ return -ENOMEM;
+
+ return 0;
}
-/* Remove an event_call */
-void trace_remove_event_call(struct ftrace_event_call *call)
+struct ftrace_module_file_ops;
+static void __add_event_to_tracers(struct ftrace_event_call *call);
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
{
+ int ret;
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
- down_write(&trace_event_mutex);
- __trace_remove_event_call(call);
- up_write(&trace_event_mutex);
- mutex_unlock(&event_mutex);
-}
-
-#define for_each_event(event, start, end) \
- for (event = start; \
- (unsigned long)event < (unsigned long)end; \
- event++)
-#ifdef CONFIG_MODULES
+ ret = __register_event(call, NULL);
+ if (ret >= 0)
+ __add_event_to_tracers(call);
-static LIST_HEAD(ftrace_module_file_list);
+ mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
/*
- * Modules must own their file_operations to keep up with
- * reference counting.
+ * Must be called under locking of trace_types_lock, event_mutex and
+ * trace_event_sem.
*/
-struct ftrace_module_file_ops {
- struct list_head list;
- struct module *mod;
- struct file_operations id;
- struct file_operations enable;
- struct file_operations format;
- struct file_operations filter;
-};
-
-static struct ftrace_module_file_ops *
-trace_create_file_ops(struct module *mod)
+static void __trace_remove_event_call(struct ftrace_event_call *call)
{
- struct ftrace_module_file_ops *file_ops;
+ event_remove(call);
+ trace_destroy_fields(call);
+ destroy_call_preds(call);
+}
- /*
- * This is a bit of a PITA. To allow for correct reference
- * counting, modules must "own" their file_operations.
- * To do this, we allocate the file operations that will be
- * used in the event directory.
- */
+static int probe_remove_event_call(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
- file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
- if (!file_ops)
- return NULL;
+#ifdef CONFIG_PERF_EVENTS
+ if (call->perf_refcount)
+ return -EBUSY;
+#endif
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ /*
+ * We can't rely on ftrace_event_enable_disable(enable => 0)
+ * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+ * TRACE_REG_UNREGISTER.
+ */
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return -EBUSY;
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
- file_ops->mod = mod;
+ __trace_remove_event_call(call);
- file_ops->id = ftrace_event_id_fops;
- file_ops->id.owner = mod;
+ return 0;
+}
- file_ops->enable = ftrace_enable_fops;
- file_ops->enable.owner = mod;
+/* Remove an event_call */
+int trace_remove_event_call(struct ftrace_event_call *call)
+{
+ int ret;
- file_ops->filter = ftrace_event_filter_fops;
- file_ops->filter.owner = mod;
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&event_mutex);
+ down_write(&trace_event_sem);
+ ret = probe_remove_event_call(call);
+ up_write(&trace_event_sem);
+ mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
- file_ops->format = ftrace_event_format_fops;
- file_ops->format.owner = mod;
+ return ret;
+}
- list_add(&file_ops->list, &ftrace_module_file_list);
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
- return file_ops;
-}
+#ifdef CONFIG_MODULES
static void trace_module_add_events(struct module *mod)
{
- struct ftrace_module_file_ops *file_ops = NULL;
struct ftrace_event_call **call, **start, **end;
- start = mod->trace_events;
- end = mod->trace_events + mod->num_trace_events;
-
- if (start == end)
+ if (!mod->num_trace_events)
return;
- file_ops = trace_create_file_ops(mod);
- if (!file_ops)
+ /* Don't add infrastructure for mods without tracepoints */
+ if (trace_module_has_bad_taint(mod)) {
+ pr_err("%s: module has bad taint, not creating trace events\n",
+ mod->name);
return;
+ }
+
+ start = mod->trace_events;
+ end = mod->trace_events + mod->num_trace_events;
for_each_event(call, start, end) {
- __trace_add_event_call(*call, mod,
- &file_ops->id, &file_ops->enable,
- &file_ops->filter, &file_ops->format);
+ __register_event(*call, mod);
+ __add_event_to_tracers(*call);
}
}
static void trace_module_remove_events(struct module *mod)
{
- struct ftrace_module_file_ops *file_ops;
struct ftrace_event_call *call, *p;
- bool found = false;
+ bool clear_trace = false;
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
if (call->mod == mod) {
- found = true;
+ if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
+ clear_trace = true;
__trace_remove_event_call(call);
}
}
-
- /* Now free the file_operations */
- list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
- if (file_ops->mod == mod)
- break;
- }
- if (&file_ops->list != &ftrace_module_file_list) {
- list_del(&file_ops->list);
- kfree(file_ops);
- }
+ up_write(&trace_event_sem);
/*
* It is safest to reset the ring buffer if the module being unloaded
- * registered any events.
+ * registered any events that were used. The only worry is if
+ * a new module gets loaded, and takes on the same id as the events
+ * of this module. When printing out the buffer, traced events left
+ * over from this module may be passed to the new module events and
+ * unexpected results may occur.
*/
- if (found)
- tracing_reset_current_online_cpus();
- up_write(&trace_event_mutex);
+ if (clear_trace)
+ tracing_reset_all_online_cpus();
}
static int trace_module_notify(struct notifier_block *self,
@@ -1400,6 +1863,7 @@ static int trace_module_notify(struct notifier_block *self,
{
struct module *mod = data;
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
switch (val) {
case MODULE_STATE_COMING:
@@ -1410,21 +1874,386 @@ static int trace_module_notify(struct notifier_block *self,
break;
}
mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
return 0;
}
-#else
-static int trace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- return 0;
-}
-#endif /* CONFIG_MODULES */
static struct notifier_block trace_module_nb = {
.notifier_call = trace_module_notify,
.priority = 0,
};
+#endif /* CONFIG_MODULES */
+
+/* Create a new event directory structure for a trace directory. */
+static void
+__trace_add_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_call *call;
+ int ret;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ ret = __trace_add_new_event(call, tr);
+ if (ret < 0)
+ pr_warning("Could not create directory for event %s\n",
+ ftrace_event_name(call));
+ }
+}
+
+struct ftrace_event_file *
+find_event_file(struct trace_array *tr, const char *system, const char *event)
+{
+ struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
+ const char *name;
+
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
+ name = ftrace_event_name(call);
+
+ if (!name || !call->class || !call->class->reg)
+ continue;
+
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ continue;
+
+ if (strcmp(event, name) == 0 &&
+ strcmp(system, call->class->system) == 0)
+ return file;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR "enable_event"
+#define DISABLE_EVENT_STR "disable_event"
+
+struct event_probe_data {
+ struct ftrace_event_file *file;
+ unsigned long count;
+ int ref;
+ bool enable;
+};
+
+static void
+event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (!data)
+ return;
+
+ if (data->enable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ else
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+}
+
+static void
+event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (!data)
+ return;
+
+ if (!data->count)
+ return;
+
+ /* Skip if the event is in a state we want to switch to */
+ if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ event_enable_probe(ip, parent_ip, _data);
+}
+
+static int
+event_enable_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *_data)
+{
+ struct event_probe_data *data = _data;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_printf(m, "%s:%s:%s",
+ data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ data->file->event_call->class->system,
+ ftrace_event_name(data->file->event_call));
+
+ if (data->count == -1)
+ seq_printf(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", data->count);
+
+ return 0;
+}
+
+static int
+event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
+ void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ data->ref++;
+ return 0;
+}
+
+static void
+event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
+ void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref) {
+ /* Remove the SOFT_MODE flag */
+ __ftrace_event_enable_disable(data->file, 0, 1);
+ module_put(data->file->event_call->mod);
+ kfree(data);
+ }
+ *pdata = NULL;
+}
+
+static struct ftrace_probe_ops event_enable_probe_ops = {
+ .func = event_enable_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_enable_count_probe_ops = {
+ .func = event_enable_count_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_probe_ops = {
+ .func = event_enable_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_count_probe_ops = {
+ .func = event_enable_count_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static int
+event_enable_func(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enabled)
+{
+ struct trace_array *tr = top_trace_array();
+ struct ftrace_event_file *file;
+ struct ftrace_probe_ops *ops;
+ struct event_probe_data *data;
+ const char *system;
+ const char *event;
+ char *number;
+ bool enable;
+ int ret;
+
+ if (!tr)
+ return -ENODEV;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enabled || !param)
+ return -EINVAL;
+
+ system = strsep(&param, ":");
+ if (!param)
+ return -EINVAL;
+
+ event = strsep(&param, ":");
+
+ mutex_lock(&event_mutex);
+
+ ret = -EINVAL;
+ file = find_event_file(tr, system, event);
+ if (!file)
+ goto out;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ if (enable)
+ ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
+ else
+ ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ ret = 0;
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ data->enable = enable;
+ data->count = -1;
+ data->file = file;
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &data->count);
+ if (ret)
+ goto out_free;
+
+ out_reg:
+ /* Don't let event modules unload while probe registered */
+ ret = try_module_get(file->event_call->mod);
+ if (!ret) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = __ftrace_event_enable_disable(file, 1, 1);
+ if (ret < 0)
+ goto out_put;
+ ret = register_ftrace_function_probe(glob, ops, data);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ ret = -ENOENT;
+ goto out_disable;
+ } else if (ret < 0)
+ goto out_disable;
+ /* Just return zero, not the number of enabled functions */
+ ret = 0;
+ out:
+ mutex_unlock(&event_mutex);
+ return ret;
+
+ out_disable:
+ __ftrace_event_enable_disable(file, 0, 1);
+ out_put:
+ module_put(file->event_call->mod);
+ out_free:
+ kfree(data);
+ goto out;
+}
+
+static struct ftrace_func_command event_enable_cmd = {
+ .name = ENABLE_EVENT_STR,
+ .func = event_enable_func,
+};
+
+static struct ftrace_func_command event_disable_cmd = {
+ .name = DISABLE_EVENT_STR,
+ .func = event_enable_func,
+};
+
+static __init int register_event_cmds(void)
+{
+ int ret;
+
+ ret = register_ftrace_command(&event_enable_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_ftrace_command(&event_disable_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_ftrace_command(&event_enable_cmd);
+ return ret;
+}
+#else
+static inline int register_event_cmds(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * The top level array has already had its ftrace_event_file
+ * descriptors created in order to allow for early events to
+ * be recorded. This function is called after the debugfs has been
+ * initialized, and we now have to create the files associated
+ * to the events.
+ */
+static __init void
+__trace_early_add_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+ int ret;
+
+
+ list_for_each_entry(file, &tr->events, list) {
+ ret = event_create_dir(tr->event_dir, file);
+ if (ret < 0)
+ pr_warning("Could not create directory for event %s\n",
+ ftrace_event_name(file->event_call));
+ }
+}
+
+/*
+ * For early boot up, the top trace array requires to have
+ * a list of events that can be enabled. This must be done before
+ * the filesystem is set up in order to allow events to be traced
+ * early.
+ */
+static __init void
+__trace_early_add_events(struct trace_array *tr)
+{
+ struct ftrace_event_call *call;
+ int ret;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ /* Early boot up should not have any modules loaded */
+ if (WARN_ON_ONCE(call->mod))
+ continue;
+
+ ret = __trace_early_add_new_event(call, tr);
+ if (ret < 0)
+ pr_warning("Could not create early event %s\n",
+ ftrace_event_name(call));
+ }
+}
+
+/* Remove the event directory structure for a trace directory. */
+static void
+__trace_remove_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_file *file, *next;
+
+ list_for_each_entry_safe(file, next, &tr->events, list)
+ remove_event_file_dir(file);
+}
+
+static void __add_event_to_tracers(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ __trace_add_new_event(call, tr);
+}
extern struct ftrace_event_call *__start_ftrace_events[];
extern struct ftrace_event_call *__stop_ftrace_events[];
@@ -1434,44 +2263,32 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = 1;
- tracing_selftest_disabled = 1;
+ ring_buffer_expanded = true;
+ tracing_selftest_disabled = true;
return 1;
}
__setup("trace_event=", setup_trace_event);
-static __init int event_trace_init(void)
+/* Expects to have event_mutex held when called */
+static int
+create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
- struct ftrace_event_call **call;
- struct dentry *d_tracer;
- struct dentry *entry;
struct dentry *d_events;
- int ret;
- char *buf = bootup_event_buf;
- char *token;
-
- d_tracer = tracing_init_dentry();
- if (!d_tracer)
- return 0;
-
- entry = debugfs_create_file("available_events", 0444, d_tracer,
- (void *)&show_event_seq_ops,
- &ftrace_avail_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'available_events' entry\n");
+ struct dentry *entry;
- entry = debugfs_create_file("set_event", 0644, d_tracer,
- (void *)&show_set_event_seq_ops,
- &ftrace_set_event_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_event' entry\n");
+ entry = debugfs_create_file("set_event", 0644, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry) {
+ pr_warning("Could not create debugfs 'set_event' entry\n");
+ return -ENOMEM;
+ }
- d_events = event_trace_events_dir();
- if (!d_events)
- return 0;
+ d_events = debugfs_create_dir("events", parent);
+ if (!d_events) {
+ pr_warning("Could not create debugfs 'events' directory\n");
+ return -ENOMEM;
+ }
/* ring buffer internal formats */
trace_create_file("header_page", 0444, d_events,
@@ -1483,18 +2300,128 @@ static __init int event_trace_init(void)
&ftrace_show_header_fops);
trace_create_file("enable", 0644, d_events,
- NULL, &ftrace_system_enable_fops);
+ tr, &ftrace_tr_enable_fops);
- if (trace_define_common_fields())
- pr_warning("tracing: Failed to allocate common fields");
+ tr->event_dir = d_events;
+
+ return 0;
+}
+
+/**
+ * event_trace_add_tracer - add a instance of a trace_array to events
+ * @parent: The parent dentry to place the files/directories for events in
+ * @tr: The trace array associated with these events
+ *
+ * When a new instance is created, it needs to set up its events
+ * directory, as well as other files associated with events. It also
+ * creates the event hierachry in the @parent/events directory.
+ *
+ * Returns 0 on success.
+ */
+int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
- for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
- __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
- &ftrace_enable_fops,
- &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
+ ret = create_event_toplevel_files(parent, tr);
+ if (ret)
+ goto out_unlock;
+
+ down_write(&trace_event_sem);
+ __trace_add_event_dirs(tr);
+ up_write(&trace_event_sem);
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+/*
+ * The top trace array already had its file descriptors created.
+ * Now the files themselves need to be created.
+ */
+static __init int
+early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = create_event_toplevel_files(parent, tr);
+ if (ret)
+ goto out_unlock;
+
+ down_write(&trace_event_sem);
+ __trace_early_add_event_dirs(tr);
+ up_write(&trace_event_sem);
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+int event_trace_del_tracer(struct trace_array *tr)
+{
+ mutex_lock(&event_mutex);
+
+ /* Disable any event triggers and associated soft-disabled events */
+ clear_event_triggers(tr);
+
+ /* Disable any running events */
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+
+ /* Access to events are within rcu_read_lock_sched() */
+ synchronize_sched();
+
+ down_write(&trace_event_sem);
+ __trace_remove_event_dirs(tr);
+ debugfs_remove_recursive(tr->event_dir);
+ up_write(&trace_event_sem);
+
+ tr->event_dir = NULL;
+
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+
+static __init int event_trace_memsetup(void)
+{
+ field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
+ file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+ return 0;
+}
+
+static __init int event_trace_enable(void)
+{
+ struct trace_array *tr = top_trace_array();
+ struct ftrace_event_call **iter, *call;
+ char *buf = bootup_event_buf;
+ char *token;
+ int ret;
+
+ if (!tr)
+ return -ENODEV;
+
+ for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
+
+ call = *iter;
+ ret = event_init(call);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
}
+ /*
+ * We need the top trace array to have a working set of trace
+ * points at early init, before the debug files and directories
+ * are created. Create the file entries now, and attach them
+ * to the actual file dentries later.
+ */
+ __trace_early_add_events(tr);
+
while (true) {
token = strsep(&buf, ",");
@@ -1503,17 +2430,57 @@ static __init int event_trace_init(void)
if (!*token)
continue;
- ret = ftrace_set_clr_event(token, 1);
+ ret = ftrace_set_clr_event(tr, token, 1);
if (ret)
- pr_warning("Failed to enable trace event: %s\n", token);
+ pr_warn("Failed to enable trace event: %s\n", token);
}
+ trace_printk_start_comm();
+
+ register_event_cmds();
+
+ register_trigger_cmds();
+
+ return 0;
+}
+
+static __init int event_trace_init(void)
+{
+ struct trace_array *tr;
+ struct dentry *d_tracer;
+ struct dentry *entry;
+ int ret;
+
+ tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ entry = debugfs_create_file("available_events", 0444, d_tracer,
+ tr, &ftrace_avail_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'available_events' entry\n");
+
+ if (trace_define_common_fields())
+ pr_warning("tracing: Failed to allocate common fields");
+
+ ret = early_event_add_tracer(d_tracer, tr);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MODULES
ret = register_module_notifier(&trace_module_nb);
if (ret)
pr_warning("Failed to register trace events module notifier\n");
-
+#endif
return 0;
}
+early_initcall(event_trace_memsetup);
+core_initcall(event_trace_enable);
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1572,13 +2539,22 @@ static __init void event_test_stuff(void)
*/
static __init void event_trace_self_tests(void)
{
+ struct ftrace_subsystem_dir *dir;
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
struct event_subsystem *system;
+ struct trace_array *tr;
int ret;
+ tr = top_trace_array();
+ if (!tr)
+ return;
+
pr_info("Running tests on trace events:\n");
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
/* Only test those that have a probe */
if (!call->class || !call->class->probe)
@@ -1596,21 +2572,21 @@ static __init void event_trace_self_tests(void)
continue;
#endif
- pr_info("Testing event %s: ", call->name);
+ pr_info("Testing event %s: ", ftrace_event_name(call));
/*
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (call->flags & TRACE_EVENT_FL_ENABLED) {
+ if (file->flags & FTRACE_EVENT_FL_ENABLED) {
pr_warning("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
}
- ftrace_event_enable_disable(call, 1);
+ ftrace_event_enable_disable(file, 1);
event_test_stuff();
- ftrace_event_enable_disable(call, 0);
+ ftrace_event_enable_disable(file, 0);
pr_cont("OK\n");
}
@@ -1619,7 +2595,9 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on trace event systems:\n");
- list_for_each_entry(system, &event_subsystems, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+
+ system = dir->subsystem;
/* the ftrace system is special, skip it */
if (strcmp(system->name, "ftrace") == 0)
@@ -1627,7 +2605,7 @@ static __init void event_trace_self_tests(void)
pr_info("Testing event system %s: ", system->name);
- ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
if (WARN_ON_ONCE(ret)) {
pr_warning("error enabling system %s\n",
system->name);
@@ -1636,10 +2614,12 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
- ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
- if (WARN_ON_ONCE(ret))
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
+ if (WARN_ON_ONCE(ret)) {
pr_warning("error disabling system %s\n",
system->name);
+ continue;
+ }
pr_cont("OK\n");
}
@@ -1649,7 +2629,7 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on all trace events:\n");
pr_info("Testing all events: ");
- ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
if (WARN_ON_ONCE(ret)) {
pr_warning("error enabling all events\n");
return;
@@ -1658,7 +2638,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
/* reset sysname */
- ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
if (WARN_ON_ONCE(ret)) {
pr_warning("error disabling all events\n");
return;
@@ -1672,7 +2652,8 @@ static __init void event_trace_self_tests(void)
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
static void
-function_test_events_call(unsigned long ip, unsigned long parent_ip)
+function_test_events_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct ring_buffer_event *event;
struct ring_buffer *buffer;
@@ -1701,7 +2682,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
entry->ip = ip;
entry->parent_ip = parent_ip;
- trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
@@ -1711,6 +2692,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __initdata =
{
.func = function_test_events_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 24aee712745..8a8631926a0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -44,6 +44,7 @@ enum filter_op_ids
OP_LE,
OP_GT,
OP_GE,
+ OP_BAND,
OP_NONE,
OP_OPEN_PAREN,
};
@@ -54,6 +55,7 @@ struct filter_op {
int precedence;
};
+/* Order must be the same as enum filter_op_ids above */
static struct filter_op filter_ops[] = {
{ OP_OR, "||", 1 },
{ OP_AND, "&&", 2 },
@@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = {
{ OP_LE, "<=", 5 },
{ OP_GT, ">", 5 },
{ OP_GE, ">=", 5 },
+ { OP_BAND, "&", 6 },
{ OP_NONE, "OP_NONE", 0 },
{ OP_OPEN_PAREN, "(", 0 },
};
@@ -81,6 +84,7 @@ enum {
FILT_ERR_TOO_MANY_PREDS,
FILT_ERR_MISSING_FIELD,
FILT_ERR_INVALID_FILTER,
+ FILT_ERR_IP_FIELD_ONLY,
};
static char *err_text[] = {
@@ -96,6 +100,7 @@ static char *err_text[] = {
"Too many terms in predicate expression",
"Missing field name and/or value",
"Meaningless filter expression",
+ "Only 'ip' field is supported for function trace",
};
struct opstack_op {
@@ -154,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
case OP_GE: \
match = (*addr >= val); \
break; \
+ case OP_BAND: \
+ match = (*addr & val); \
+ break; \
default: \
break; \
} \
@@ -629,17 +637,23 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
-void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
+static inline struct event_filter *event_filter(struct ftrace_event_file *file)
{
- struct event_filter *filter;
+ if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ return file->event_call->filter;
+ else
+ return file->filter;
+}
+
+/* caller must hold event_mutex */
+void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+{
+ struct event_filter *filter = event_filter(file);
- mutex_lock(&event_mutex);
- filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
- trace_seq_printf(s, "none\n");
- mutex_unlock(&event_mutex);
+ trace_seq_puts(s, "none\n");
}
void print_subsystem_event_filter(struct event_subsystem *system,
@@ -652,40 +666,13 @@ void print_subsystem_event_filter(struct event_subsystem *system,
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
- trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
+ trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
mutex_unlock(&event_mutex);
}
-static struct ftrace_event_field *
-__find_event_field(struct list_head *head, char *name)
-{
- struct ftrace_event_field *field;
-
- list_for_each_entry(field, head, link) {
- if (!strcmp(field->name, name))
- return field;
- }
-
- return NULL;
-}
-
-static struct ftrace_event_field *
-find_event_field(struct ftrace_event_call *call, char *name)
-{
- struct ftrace_event_field *field;
- struct list_head *head;
-
- field = __find_event_field(&ftrace_common_fields, name);
- if (field)
- return field;
-
- head = trace_get_fields(call);
- return __find_event_field(head, name);
-}
-
static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
{
- stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
+ stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
if (!stack->preds)
return -ENOMEM;
stack->index = n_preds;
@@ -775,7 +762,11 @@ static int filter_set_pred(struct event_filter *filter,
static void __free_preds(struct event_filter *filter)
{
+ int i;
+
if (filter->preds) {
+ for (i = 0; i < filter->n_preds; i++)
+ kfree(filter->preds[i].ops);
kfree(filter->preds);
filter->preds = NULL;
}
@@ -783,11 +774,21 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void filter_disable(struct ftrace_event_call *call)
+static void call_filter_disable(struct ftrace_event_call *call)
{
call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
+static void filter_disable(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call_filter_disable(call);
+ else
+ file->flags &= ~FTRACE_EVENT_FL_FILTERED;
+}
+
static void __free_filter(struct event_filter *filter)
{
if (!filter)
@@ -798,16 +799,35 @@ static void __free_filter(struct event_filter *filter)
kfree(filter);
}
+void free_event_filter(struct event_filter *filter)
+{
+ __free_filter(filter);
+}
+
+void destroy_call_preds(struct ftrace_event_call *call)
+{
+ __free_filter(call->filter);
+ call->filter = NULL;
+}
+
+static void destroy_file_preds(struct ftrace_event_file *file)
+{
+ __free_filter(file->filter);
+ file->filter = NULL;
+}
+
/*
- * Called when destroying the ftrace_event_call.
- * The call is being freed, so we do not need to worry about
- * the call being currently used. This is for module code removing
+ * Called when destroying the ftrace_event_file.
+ * The file is being freed, so we do not need to worry about
+ * the file being currently used. This is for module code removing
* the tracepoints from within it.
*/
-void destroy_preds(struct ftrace_event_call *call)
+void destroy_preds(struct ftrace_event_file *file)
{
- __free_filter(call->filter);
- call->filter = NULL;
+ if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ destroy_call_preds(file->event_call);
+ else
+ destroy_file_preds(file);
}
static struct event_filter *__alloc_filter(void)
@@ -826,8 +846,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
if (filter->preds)
__free_preds(filter);
- filter->preds =
- kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
+ filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
if (!filter->preds)
return -ENOMEM;
@@ -843,28 +862,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
return 0;
}
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+static inline void __remove_filter(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ filter_disable(file);
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ remove_filter_string(call->filter);
+ else
+ remove_filter_string(file->filter);
+}
+
+static void filter_free_subsystem_preds(struct event_subsystem *system,
+ struct trace_array *tr)
{
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+ call = file->event_call;
if (strcmp(call->class->system, system->name) != 0)
continue;
- filter_disable(call);
- remove_filter_string(call->filter);
+ __remove_filter(file);
+ }
+}
+
+static inline void __free_subsystem_filter(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
+ __free_filter(call->filter);
+ call->filter = NULL;
+ } else {
+ __free_filter(file->filter);
+ file->filter = NULL;
}
}
-static void filter_free_subsystem_filters(struct event_subsystem *system)
+static void filter_free_subsystem_filters(struct event_subsystem *system,
+ struct trace_array *tr)
{
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+ call = file->event_call;
if (strcmp(call->class->system, system->name) != 0)
continue;
- __free_filter(call->filter);
- call->filter = NULL;
+ __free_subsystem_filter(file);
}
}
@@ -900,6 +947,11 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
+static bool is_function_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_TRACE_FN;
+}
+
static bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
@@ -987,11 +1039,16 @@ static int init_pred(struct filter_parse_state *ps,
fn = filter_pred_strloc;
else
fn = filter_pred_pchar;
+ } else if (is_function_field(field)) {
+ if (strcmp(field->name, "ip")) {
+ parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
+ return -EINVAL;
+ }
} else {
if (field->is_signed)
- ret = strict_strtoll(pred->regex.pattern, 0, &val);
+ ret = kstrtoll(pred->regex.pattern, 0, &val);
else
- ret = strict_strtoull(pred->regex.pattern, 0, &val);
+ ret = kstrtoull(pred->regex.pattern, 0, &val);
if (ret) {
parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
return -EINVAL;
@@ -1326,7 +1383,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
return NULL;
}
- field = find_event_field(call, operand1);
+ field = trace_find_event_field(call, operand1);
if (!field) {
parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
return NULL;
@@ -1334,10 +1391,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
strcpy(pred.regex.pattern, operand2);
pred.regex.len = strlen(pred.regex.pattern);
-
-#ifdef CONFIG_FTRACE_STARTUP_TEST
pred.field = field;
-#endif
return init_pred(ps, field, &pred) ? NULL : &pred;
}
@@ -1486,7 +1540,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
children = count_leafs(preds, &preds[root->left]);
children += count_leafs(preds, &preds[root->right]);
- root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
+ root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
if (!root->ops)
return -ENOMEM;
@@ -1628,15 +1682,85 @@ fail:
return err;
}
+static inline void event_set_filtered_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags |= TRACE_EVENT_FL_FILTERED;
+ else
+ file->flags |= FTRACE_EVENT_FL_FILTERED;
+}
+
+static inline void event_set_filter(struct ftrace_event_file *file,
+ struct event_filter *filter)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ rcu_assign_pointer(call->filter, filter);
+ else
+ rcu_assign_pointer(file->filter, filter);
+}
+
+static inline void event_clear_filter(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ RCU_INIT_POINTER(call->filter, NULL);
+ else
+ RCU_INIT_POINTER(file->filter, NULL);
+}
+
+static inline void
+event_set_no_set_filter_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
+ else
+ file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
+}
+
+static inline void
+event_clear_no_set_filter_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
+ else
+ file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
+}
+
+static inline bool
+event_no_set_filter_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
+ return true;
+
+ if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
+ (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
+ return true;
+
+ return false;
+}
+
struct filter_list {
struct list_head list;
struct event_filter *filter;
};
static int replace_system_preds(struct event_subsystem *system,
+ struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
struct filter_list *filter_item;
struct filter_list *tmp;
@@ -1644,8 +1768,8 @@ static int replace_system_preds(struct event_subsystem *system,
bool fail = true;
int err;
- list_for_each_entry(call, &ftrace_events, list) {
-
+ list_for_each_entry(file, &tr->events, list) {
+ call = file->event_call;
if (strcmp(call->class->system, system->name) != 0)
continue;
@@ -1655,18 +1779,20 @@ static int replace_system_preds(struct event_subsystem *system,
*/
err = replace_preds(call, NULL, ps, filter_string, true);
if (err)
- call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
+ event_set_no_set_filter_flag(file);
else
- call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
+ event_clear_no_set_filter_flag(file);
}
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
struct event_filter *filter;
+ call = file->event_call;
+
if (strcmp(call->class->system, system->name) != 0)
continue;
- if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
+ if (event_no_set_filter_flag(file))
continue;
filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
@@ -1687,17 +1813,17 @@ static int replace_system_preds(struct event_subsystem *system,
err = replace_preds(call, filter, ps, filter_string, false);
if (err) {
- filter_disable(call);
+ filter_disable(file);
parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
append_filter_err(ps, filter);
} else
- call->flags |= TRACE_EVENT_FL_FILTERED;
+ event_set_filtered_flag(file);
/*
* Regardless of if this returned an error, we still
* replace the filter for the call.
*/
- filter = call->filter;
- rcu_assign_pointer(call->filter, filter_item->filter);
+ filter = event_filter(file);
+ event_set_filter(file, filter_item->filter);
filter_item->filter = filter;
fail = false;
@@ -1817,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call,
return err;
}
+int create_event_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp)
+{
+ return create_filter(call, filter_str, set_str, filterp);
+}
+
/**
* create_system_filter - create a filter for an event_subsystem
* @system: event_subsystem to create a filter for
@@ -1827,6 +1960,7 @@ static int create_filter(struct ftrace_event_call *call,
* and always remembers @filter_str.
*/
static int create_system_filter(struct event_subsystem *system,
+ struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
struct event_filter *filter = NULL;
@@ -1835,7 +1969,7 @@ static int create_system_filter(struct event_subsystem *system,
err = create_filter_start(filter_str, true, &ps, &filter);
if (!err) {
- err = replace_system_preds(system, ps, filter_str);
+ err = replace_system_preds(system, tr, ps, filter_str);
if (!err) {
/* System filters just show a default message */
kfree(filter->filter_string);
@@ -1850,23 +1984,27 @@ static int create_system_filter(struct event_subsystem *system,
return err;
}
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+/* caller must hold event_mutex */
+int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
{
+ struct ftrace_event_call *call = file->event_call;
struct event_filter *filter;
- int err = 0;
-
- mutex_lock(&event_mutex);
+ int err;
if (!strcmp(strstrip(filter_string), "0")) {
- filter_disable(call);
- filter = call->filter;
+ filter_disable(file);
+ filter = event_filter(file);
+
if (!filter)
- goto out_unlock;
- RCU_INIT_POINTER(call->filter, NULL);
+ return 0;
+
+ event_clear_filter(file);
+
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
- goto out_unlock;
+
+ return 0;
}
err = create_filter(call, filter_string, true, &filter);
@@ -1878,14 +2016,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
* string
*/
if (filter) {
- struct event_filter *tmp = call->filter;
+ struct event_filter *tmp;
+ tmp = event_filter(file);
if (!err)
- call->flags |= TRACE_EVENT_FL_FILTERED;
+ event_set_filtered_flag(file);
else
- filter_disable(call);
+ filter_disable(file);
- rcu_assign_pointer(call->filter, filter);
+ event_set_filter(file, filter);
if (tmp) {
/* Make sure the call is done with the filter */
@@ -1893,39 +2032,39 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
__free_filter(tmp);
}
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
-int apply_subsystem_event_filter(struct event_subsystem *system,
+int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string)
{
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_array *tr = dir->tr;
struct event_filter *filter;
int err = 0;
mutex_lock(&event_mutex);
/* Make sure the system still has events */
- if (!system->nr_events) {
+ if (!dir->nr_events) {
err = -ENODEV;
goto out_unlock;
}
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system);
+ filter_free_subsystem_preds(system, tr);
remove_filter_string(system->filter);
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
synchronize_sched();
- filter_free_subsystem_filters(system);
+ filter_free_subsystem_filters(system, tr);
__free_filter(filter);
goto out_unlock;
}
- err = create_system_filter(system, filter_string, &filter);
+ err = create_system_filter(system, tr, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
@@ -1950,6 +2089,148 @@ void ftrace_profile_free_filter(struct perf_event *event)
__free_filter(filter);
}
+struct function_filter_data {
+ struct ftrace_ops *ops;
+ int first_filter;
+ int first_notrace;
+};
+
+#ifdef CONFIG_FUNCTION_TRACER
+static char **
+ftrace_function_filter_re(char *buf, int len, int *count)
+{
+ char *str, *sep, **re;
+
+ str = kstrndup(buf, len, GFP_KERNEL);
+ if (!str)
+ return NULL;
+
+ /*
+ * The argv_split function takes white space
+ * as a separator, so convert ',' into spaces.
+ */
+ while ((sep = strchr(str, ',')))
+ *sep = ' ';
+
+ re = argv_split(GFP_KERNEL, str, count);
+ kfree(str);
+ return re;
+}
+
+static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
+ int reset, char *re, int len)
+{
+ int ret;
+
+ if (filter)
+ ret = ftrace_set_filter(ops, re, len, reset);
+ else
+ ret = ftrace_set_notrace(ops, re, len, reset);
+
+ return ret;
+}
+
+static int __ftrace_function_set_filter(int filter, char *buf, int len,
+ struct function_filter_data *data)
+{
+ int i, re_cnt, ret = -EINVAL;
+ int *reset;
+ char **re;
+
+ reset = filter ? &data->first_filter : &data->first_notrace;
+
+ /*
+ * The 'ip' field could have multiple filters set, separated
+ * either by space or comma. We first cut the filter and apply
+ * all pieces separatelly.
+ */
+ re = ftrace_function_filter_re(buf, len, &re_cnt);
+ if (!re)
+ return -EINVAL;
+
+ for (i = 0; i < re_cnt; i++) {
+ ret = ftrace_function_set_regexp(data->ops, filter, *reset,
+ re[i], strlen(re[i]));
+ if (ret)
+ break;
+
+ if (*reset)
+ *reset = 0;
+ }
+
+ argv_free(re);
+ return ret;
+}
+
+static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
+{
+ struct ftrace_event_field *field = pred->field;
+
+ if (leaf) {
+ /*
+ * Check the leaf predicate for function trace, verify:
+ * - only '==' and '!=' is used
+ * - the 'ip' field is used
+ */
+ if ((pred->op != OP_EQ) && (pred->op != OP_NE))
+ return -EINVAL;
+
+ if (strcmp(field->name, "ip"))
+ return -EINVAL;
+ } else {
+ /*
+ * Check the non leaf predicate for function trace, verify:
+ * - only '||' is used
+ */
+ if (pred->op != OP_OR)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ftrace_function_set_filter_cb(enum move_type move,
+ struct filter_pred *pred,
+ int *err, void *data)
+{
+ /* Checking the node is valid for function trace. */
+ if ((move != MOVE_DOWN) ||
+ (pred->left != FILTER_PRED_INVALID)) {
+ *err = ftrace_function_check_pred(pred, 0);
+ } else {
+ *err = ftrace_function_check_pred(pred, 1);
+ if (*err)
+ return WALK_PRED_ABORT;
+
+ *err = __ftrace_function_set_filter(pred->op == OP_EQ,
+ pred->regex.pattern,
+ pred->regex.len,
+ data);
+ }
+
+ return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
+}
+
+static int ftrace_function_set_filter(struct perf_event *event,
+ struct event_filter *filter)
+{
+ struct function_filter_data data = {
+ .first_filter = 1,
+ .first_notrace = 1,
+ .ops = &event->ftrace_ops,
+ };
+
+ return walk_pred_tree(filter->preds, filter->root,
+ ftrace_function_set_filter_cb, &data);
+}
+#else
+static int ftrace_function_set_filter(struct perf_event *event,
+ struct event_filter *filter)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str)
{
@@ -1970,9 +2251,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
goto out_unlock;
err = create_filter(call, filter_str, false, &filter);
- if (!err)
- event->filter = filter;
+ if (err)
+ goto free_filter;
+
+ if (ftrace_event_is_function(call))
+ err = ftrace_function_set_filter(event, filter);
else
+ event->filter = filter;
+
+free_filter:
+ if (err || ftrace_event_is_function(call))
__free_filter(filter);
out_unlock:
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
new file mode 100644
index 00000000000..4747b476a03
--- /dev/null
+++ b/kernel/trace/trace_events_trigger.c
@@ -0,0 +1,1437 @@
+/*
+ * trace_events_trigger - trace event triggers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include "trace.h"
+
+static LIST_HEAD(trigger_commands);
+static DEFINE_MUTEX(trigger_cmd_mutex);
+
+static void
+trigger_data_free(struct event_trigger_data *data)
+{
+ if (data->cmd_ops->set_filter)
+ data->cmd_ops->set_filter(NULL, data, NULL);
+
+ synchronize_sched(); /* make sure current triggers exit before free */
+ kfree(data);
+}
+
+/**
+ * event_triggers_call - Call triggers associated with a trace event
+ * @file: The ftrace_event_file associated with the event
+ * @rec: The trace entry for the event, NULL for unconditional invocation
+ *
+ * For each trigger associated with an event, invoke the trigger
+ * function registered with the associated trigger command. If rec is
+ * non-NULL, it means that the trigger requires further processing and
+ * shouldn't be unconditionally invoked. If rec is non-NULL and the
+ * trigger has a filter associated with it, rec will checked against
+ * the filter and if the record matches the trigger will be invoked.
+ * If the trigger is a 'post_trigger', meaning it shouldn't be invoked
+ * in any case until the current event is written, the trigger
+ * function isn't invoked but the bit associated with the deferred
+ * trigger is set in the return value.
+ *
+ * Returns an enum event_trigger_type value containing a set bit for
+ * any trigger that should be deferred, ETT_NONE if nothing to defer.
+ *
+ * Called from tracepoint handlers (with rcu_read_lock_sched() held).
+ *
+ * Return: an enum event_trigger_type value containing a set bit for
+ * any trigger that should be deferred, ETT_NONE if nothing to defer.
+ */
+enum event_trigger_type
+event_triggers_call(struct ftrace_event_file *file, void *rec)
+{
+ struct event_trigger_data *data;
+ enum event_trigger_type tt = ETT_NONE;
+ struct event_filter *filter;
+
+ if (list_empty(&file->triggers))
+ return tt;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (!rec) {
+ data->ops->func(data);
+ continue;
+ }
+ filter = rcu_dereference_sched(data->filter);
+ if (filter && !filter_match_preds(filter, rec))
+ continue;
+ if (data->cmd_ops->post_trigger) {
+ tt |= data->cmd_ops->trigger_type;
+ continue;
+ }
+ data->ops->func(data);
+ }
+ return tt;
+}
+EXPORT_SYMBOL_GPL(event_triggers_call);
+
+/**
+ * event_triggers_post_call - Call 'post_triggers' for a trace event
+ * @file: The ftrace_event_file associated with the event
+ * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
+ *
+ * For each trigger associated with an event, invoke the trigger
+ * function registered with the associated trigger command, if the
+ * corresponding bit is set in the tt enum passed into this function.
+ * See @event_triggers_call for details on how those bits are set.
+ *
+ * Called from tracepoint handlers (with rcu_read_lock_sched() held).
+ */
+void
+event_triggers_post_call(struct ftrace_event_file *file,
+ enum event_trigger_type tt)
+{
+ struct event_trigger_data *data;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->cmd_ops->trigger_type & tt)
+ data->ops->func(data);
+ }
+}
+EXPORT_SYMBOL_GPL(event_triggers_post_call);
+
+#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL)
+
+static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
+{
+ struct ftrace_event_file *event_file = event_file_data(m->private);
+
+ if (t == SHOW_AVAILABLE_TRIGGERS)
+ return NULL;
+
+ return seq_list_next(t, &event_file->triggers, pos);
+}
+
+static void *trigger_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_event_file *event_file;
+
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ event_file = event_file_data(m->private);
+ if (unlikely(!event_file))
+ return ERR_PTR(-ENODEV);
+
+ if (list_empty(&event_file->triggers))
+ return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
+
+ return seq_list_start(&event_file->triggers, *pos);
+}
+
+static void trigger_stop(struct seq_file *m, void *t)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static int trigger_show(struct seq_file *m, void *v)
+{
+ struct event_trigger_data *data;
+ struct event_command *p;
+
+ if (v == SHOW_AVAILABLE_TRIGGERS) {
+ seq_puts(m, "# Available triggers:\n");
+ seq_putc(m, '#');
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry_reverse(p, &trigger_commands, list)
+ seq_printf(m, " %s", p->name);
+ seq_putc(m, '\n');
+ mutex_unlock(&trigger_cmd_mutex);
+ return 0;
+ }
+
+ data = list_entry(v, struct event_trigger_data, list);
+ data->ops->print(m, data->ops, data);
+
+ return 0;
+}
+
+static const struct seq_operations event_triggers_seq_ops = {
+ .start = trigger_start,
+ .next = trigger_next,
+ .stop = trigger_stop,
+ .show = trigger_show,
+};
+
+static int event_trigger_regex_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&event_mutex);
+
+ if (unlikely(!event_file_data(file))) {
+ mutex_unlock(&event_mutex);
+ return -ENODEV;
+ }
+
+ if (file->f_mode & FMODE_READ) {
+ ret = seq_open(file, &event_triggers_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = file;
+ }
+ }
+
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
+{
+ char *command, *next = buff;
+ struct event_command *p;
+ int ret = -EINVAL;
+
+ command = strsep(&next, ": \t");
+ command = (command[0] != '!') ? command : command + 1;
+
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry(p, &trigger_commands, list) {
+ if (strcmp(p->name, command) == 0) {
+ ret = p->func(p, file, buff, command, next);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&trigger_cmd_mutex);
+
+ return ret;
+}
+
+static ssize_t event_trigger_regex_write(struct file *file,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct ftrace_event_file *event_file;
+ ssize_t ret;
+ char *buf;
+
+ if (!cnt)
+ return 0;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long)buf);
+ return -EFAULT;
+ }
+ buf[cnt] = '\0';
+ strim(buf);
+
+ mutex_lock(&event_mutex);
+ event_file = event_file_data(file);
+ if (unlikely(!event_file)) {
+ mutex_unlock(&event_mutex);
+ free_page((unsigned long)buf);
+ return -ENODEV;
+ }
+ ret = trigger_process_regex(event_file, buf);
+ mutex_unlock(&event_mutex);
+
+ free_page((unsigned long)buf);
+ if (ret < 0)
+ goto out;
+
+ *ppos += cnt;
+ ret = cnt;
+ out:
+ return ret;
+}
+
+static int event_trigger_regex_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&event_mutex);
+
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+
+static ssize_t
+event_trigger_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return event_trigger_regex_write(filp, ubuf, cnt, ppos);
+}
+
+static int
+event_trigger_open(struct inode *inode, struct file *filp)
+{
+ return event_trigger_regex_open(inode, filp);
+}
+
+static int
+event_trigger_release(struct inode *inode, struct file *file)
+{
+ return event_trigger_regex_release(inode, file);
+}
+
+const struct file_operations event_trigger_fops = {
+ .open = event_trigger_open,
+ .read = seq_read,
+ .write = event_trigger_write,
+ .llseek = tracing_lseek,
+ .release = event_trigger_release,
+};
+
+/*
+ * Currently we only register event commands from __init, so mark this
+ * __init too.
+ */
+static __init int register_event_command(struct event_command *cmd)
+{
+ struct event_command *p;
+ int ret = 0;
+
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry(p, &trigger_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ list_add(&cmd->list, &trigger_commands);
+ out_unlock:
+ mutex_unlock(&trigger_cmd_mutex);
+
+ return ret;
+}
+
+/*
+ * Currently we only unregister event commands from __init, so mark
+ * this __init too.
+ */
+static __init int unregister_event_command(struct event_command *cmd)
+{
+ struct event_command *p, *n;
+ int ret = -ENODEV;
+
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry_safe(p, n, &trigger_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = 0;
+ list_del_init(&p->list);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&trigger_cmd_mutex);
+
+ return ret;
+}
+
+/**
+ * event_trigger_print - Generic event_trigger_ops @print implementation
+ * @name: The name of the event trigger
+ * @m: The seq_file being printed to
+ * @data: Trigger-specific data
+ * @filter_str: filter_str to print, if present
+ *
+ * Common implementation for event triggers to print themselves.
+ *
+ * Usually wrapped by a function that simply sets the @name of the
+ * trigger command and then invokes this.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_print(const char *name, struct seq_file *m,
+ void *data, char *filter_str)
+{
+ long count = (long)data;
+
+ seq_printf(m, "%s", name);
+
+ if (count == -1)
+ seq_puts(m, ":unlimited");
+ else
+ seq_printf(m, ":count=%ld", count);
+
+ if (filter_str)
+ seq_printf(m, " if %s\n", filter_str);
+ else
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+/**
+ * event_trigger_init - Generic event_trigger_ops @init implementation
+ * @ops: The trigger ops associated with the trigger
+ * @data: Trigger-specific data
+ *
+ * Common implementation of event trigger initialization.
+ *
+ * Usually used directly as the @init method in event trigger
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ data->ref++;
+ return 0;
+}
+
+/**
+ * event_trigger_free - Generic event_trigger_ops @free implementation
+ * @ops: The trigger ops associated with the trigger
+ * @data: Trigger-specific data
+ *
+ * Common implementation of event trigger de-initialization.
+ *
+ * Usually used directly as the @free method in event trigger
+ * implementations.
+ */
+static void
+event_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref)
+ trigger_data_free(data);
+}
+
+static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
+ int trigger_enable)
+{
+ int ret = 0;
+
+ if (trigger_enable) {
+ if (atomic_inc_return(&file->tm_ref) > 1)
+ return ret;
+ set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ ret = trace_event_enable_disable(file, 1, 1);
+ } else {
+ if (atomic_dec_return(&file->tm_ref) > 0)
+ return ret;
+ clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ ret = trace_event_enable_disable(file, 0, 1);
+ }
+
+ return ret;
+}
+
+/**
+ * clear_event_triggers - Clear all triggers associated with a trace array
+ * @tr: The trace array to clear
+ *
+ * For each trigger, the triggering event has its tm_ref decremented
+ * via trace_event_trigger_enable_disable(), and any associated event
+ * (in the case of enable/disable_event triggers) will have its sm_ref
+ * decremented via free()->trace_event_enable_disable(). That
+ * combination effectively reverses the soft-mode/trigger state added
+ * by trigger registration.
+ *
+ * Must be called with event_mutex held.
+ */
+void
+clear_event_triggers(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list) {
+ struct event_trigger_data *data;
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ trace_event_trigger_enable_disable(file, 0);
+ if (data->ops->free)
+ data->ops->free(data->ops, data);
+ }
+ }
+}
+
+/**
+ * update_cond_flag - Set or reset the TRIGGER_COND bit
+ * @file: The ftrace_event_file associated with the event
+ *
+ * If an event has triggers and any of those triggers has a filter or
+ * a post_trigger, trigger invocation needs to be deferred until after
+ * the current event has logged its data, and the event should have
+ * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
+ * cleared.
+ */
+static void update_cond_flag(struct ftrace_event_file *file)
+{
+ struct event_trigger_data *data;
+ bool set_cond = false;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->filter || data->cmd_ops->post_trigger) {
+ set_cond = true;
+ break;
+ }
+ }
+
+ if (set_cond)
+ set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+ else
+ clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+}
+
+/**
+ * register_trigger - Generic event_command @reg implementation
+ * @glob: The raw string used to register the trigger
+ * @ops: The trigger ops associated with the trigger
+ * @data: Trigger-specific data to associate with the trigger
+ * @file: The ftrace_event_file associated with the event
+ *
+ * Common implementation for event trigger registration.
+ *
+ * Usually used directly as the @reg method in event command
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int register_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file)
+{
+ struct event_trigger_data *test;
+ int ret = 0;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (data->ops->init) {
+ ret = data->ops->init(data->ops, data);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_add_rcu(&data->list, &file->triggers);
+ ret++;
+
+ if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ list_del_rcu(&data->list);
+ ret--;
+ }
+ update_cond_flag(file);
+out:
+ return ret;
+}
+
+/**
+ * unregister_trigger - Generic event_command @unreg implementation
+ * @glob: The raw string used to register the trigger
+ * @ops: The trigger ops associated with the trigger
+ * @test: Trigger-specific data used to find the trigger to remove
+ * @file: The ftrace_event_file associated with the event
+ *
+ * Common implementation for event trigger unregistration.
+ *
+ * Usually used directly as the @unreg method in event command
+ * implementations.
+ */
+static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct ftrace_event_file *file)
+{
+ struct event_trigger_data *data;
+ bool unregistered = false;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
+ unregistered = true;
+ list_del_rcu(&data->list);
+ update_cond_flag(file);
+ trace_event_trigger_enable_disable(file, 0);
+ break;
+ }
+ }
+
+ if (unregistered && data->ops->free)
+ data->ops->free(data->ops, data);
+}
+
+/**
+ * event_trigger_callback - Generic event_command @func implementation
+ * @cmd_ops: The command ops, used for trigger registration
+ * @file: The ftrace_event_file associated with the event
+ * @glob: The raw string used to register the trigger
+ * @cmd: The cmd portion of the string used to register the trigger
+ * @param: The params portion of the string used to register the trigger
+ *
+ * Common implementation for event command parsing and trigger
+ * instantiation.
+ *
+ * Usually used directly as the @func method in event command
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_callback(struct event_command *cmd_ops,
+ struct ftrace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ struct event_trigger_data *trigger_data;
+ struct event_trigger_ops *trigger_ops;
+ char *trigger = NULL;
+ char *number;
+ int ret;
+
+ /* separate the trigger from the filter (t:n [if filter]) */
+ if (param && isdigit(param[0]))
+ trigger = strsep(&param, " \t");
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+ ret = -ENOMEM;
+ trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ if (!trigger_data)
+ goto out;
+
+ trigger_data->count = -1;
+ trigger_data->ops = trigger_ops;
+ trigger_data->cmd_ops = cmd_ops;
+ INIT_LIST_HEAD(&trigger_data->list);
+
+ if (glob[0] == '!') {
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ kfree(trigger_data);
+ ret = 0;
+ goto out;
+ }
+
+ if (trigger) {
+ number = strsep(&trigger, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &trigger_data->count);
+ if (ret)
+ goto out_free;
+ }
+
+ if (!param) /* if param is non-empty, it's supposed to be a filter */
+ goto out_reg;
+
+ if (!cmd_ops->set_filter)
+ goto out_reg;
+
+ ret = cmd_ops->set_filter(param, trigger_data, file);
+ if (ret < 0)
+ goto out_free;
+
+ out_reg:
+ ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ ret = -ENOENT;
+ goto out_free;
+ } else if (ret < 0)
+ goto out_free;
+ ret = 0;
+ out:
+ return ret;
+
+ out_free:
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+ kfree(trigger_data);
+ goto out;
+}
+
+/**
+ * set_trigger_filter - Generic event_command @set_filter implementation
+ * @filter_str: The filter string for the trigger, NULL to remove filter
+ * @trigger_data: Trigger-specific data
+ * @file: The ftrace_event_file associated with the event
+ *
+ * Common implementation for event command filter parsing and filter
+ * instantiation.
+ *
+ * Usually used directly as the @set_filter method in event command
+ * implementations.
+ *
+ * Also used to remove a filter (if filter_str = NULL).
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int set_trigger_filter(char *filter_str,
+ struct event_trigger_data *trigger_data,
+ struct ftrace_event_file *file)
+{
+ struct event_trigger_data *data = trigger_data;
+ struct event_filter *filter = NULL, *tmp;
+ int ret = -EINVAL;
+ char *s;
+
+ if (!filter_str) /* clear the current filter */
+ goto assign;
+
+ s = strsep(&filter_str, " \t");
+
+ if (!strlen(s) || strcmp(s, "if") != 0)
+ goto out;
+
+ if (!filter_str)
+ goto out;
+
+ /* The filter is for the 'trigger' event, not the triggered event */
+ ret = create_event_filter(file->event_call, filter_str, false, &filter);
+ if (ret)
+ goto out;
+ assign:
+ tmp = rcu_access_pointer(data->filter);
+
+ rcu_assign_pointer(data->filter, filter);
+
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ free_event_filter(tmp);
+ }
+
+ kfree(data->filter_str);
+ data->filter_str = NULL;
+
+ if (filter_str) {
+ data->filter_str = kstrdup(filter_str, GFP_KERNEL);
+ if (!data->filter_str) {
+ free_event_filter(rcu_access_pointer(data->filter));
+ data->filter = NULL;
+ ret = -ENOMEM;
+ }
+ }
+ out:
+ return ret;
+}
+
+static void
+traceon_trigger(struct event_trigger_data *data)
+{
+ if (tracing_is_on())
+ return;
+
+ tracing_on();
+}
+
+static void
+traceon_count_trigger(struct event_trigger_data *data)
+{
+ if (tracing_is_on())
+ return;
+
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ tracing_on();
+}
+
+static void
+traceoff_trigger(struct event_trigger_data *data)
+{
+ if (!tracing_is_on())
+ return;
+
+ tracing_off();
+}
+
+static void
+traceoff_count_trigger(struct event_trigger_data *data)
+{
+ if (!tracing_is_on())
+ return;
+
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ tracing_off();
+}
+
+static int
+traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("traceon", m, (void *)data->count,
+ data->filter_str);
+}
+
+static int
+traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("traceoff", m, (void *)data->count,
+ data->filter_str);
+}
+
+static struct event_trigger_ops traceon_trigger_ops = {
+ .func = traceon_trigger,
+ .print = traceon_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops traceon_count_trigger_ops = {
+ .func = traceon_count_trigger,
+ .print = traceon_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops traceoff_trigger_ops = {
+ .func = traceoff_trigger,
+ .print = traceoff_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops traceoff_count_trigger_ops = {
+ .func = traceoff_count_trigger,
+ .print = traceoff_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops *
+onoff_get_trigger_ops(char *cmd, char *param)
+{
+ struct event_trigger_ops *ops;
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = param ? &traceon_count_trigger_ops :
+ &traceon_trigger_ops;
+ else
+ ops = param ? &traceoff_count_trigger_ops :
+ &traceoff_trigger_ops;
+
+ return ops;
+}
+
+static struct event_command trigger_traceon_cmd = {
+ .name = "traceon",
+ .trigger_type = ETT_TRACE_ONOFF,
+ .func = event_trigger_callback,
+ .reg = register_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = onoff_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static struct event_command trigger_traceoff_cmd = {
+ .name = "traceoff",
+ .trigger_type = ETT_TRACE_ONOFF,
+ .func = event_trigger_callback,
+ .reg = register_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = onoff_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+static void
+snapshot_trigger(struct event_trigger_data *data)
+{
+ tracing_snapshot();
+}
+
+static void
+snapshot_count_trigger(struct event_trigger_data *data)
+{
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ snapshot_trigger(data);
+}
+
+static int
+register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file)
+{
+ int ret = register_trigger(glob, ops, data, file);
+
+ if (ret > 0 && tracing_alloc_snapshot() != 0) {
+ unregister_trigger(glob, ops, data, file);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int
+snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("snapshot", m, (void *)data->count,
+ data->filter_str);
+}
+
+static struct event_trigger_ops snapshot_trigger_ops = {
+ .func = snapshot_trigger,
+ .print = snapshot_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops snapshot_count_trigger_ops = {
+ .func = snapshot_count_trigger,
+ .print = snapshot_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops *
+snapshot_get_trigger_ops(char *cmd, char *param)
+{
+ return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
+}
+
+static struct event_command trigger_snapshot_cmd = {
+ .name = "snapshot",
+ .trigger_type = ETT_SNAPSHOT,
+ .func = event_trigger_callback,
+ .reg = register_snapshot_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = snapshot_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init int register_trigger_snapshot_cmd(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_snapshot_cmd);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+#else
+static __init int register_trigger_snapshot_cmd(void) { return 0; }
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+#ifdef CONFIG_STACKTRACE
+/*
+ * Skip 3:
+ * stacktrace_trigger()
+ * event_triggers_post_call()
+ * ftrace_raw_event_xxx()
+ */
+#define STACK_SKIP 3
+
+static void
+stacktrace_trigger(struct event_trigger_data *data)
+{
+ trace_dump_stack(STACK_SKIP);
+}
+
+static void
+stacktrace_count_trigger(struct event_trigger_data *data)
+{
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ stacktrace_trigger(data);
+}
+
+static int
+stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("stacktrace", m, (void *)data->count,
+ data->filter_str);
+}
+
+static struct event_trigger_ops stacktrace_trigger_ops = {
+ .func = stacktrace_trigger,
+ .print = stacktrace_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops stacktrace_count_trigger_ops = {
+ .func = stacktrace_count_trigger,
+ .print = stacktrace_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops *
+stacktrace_get_trigger_ops(char *cmd, char *param)
+{
+ return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
+}
+
+static struct event_command trigger_stacktrace_cmd = {
+ .name = "stacktrace",
+ .trigger_type = ETT_STACKTRACE,
+ .post_trigger = true,
+ .func = event_trigger_callback,
+ .reg = register_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = stacktrace_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init int register_trigger_stacktrace_cmd(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_stacktrace_cmd);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+#else
+static __init int register_trigger_stacktrace_cmd(void) { return 0; }
+#endif /* CONFIG_STACKTRACE */
+
+static __init void unregister_trigger_traceon_traceoff_cmds(void)
+{
+ unregister_event_command(&trigger_traceon_cmd);
+ unregister_event_command(&trigger_traceoff_cmd);
+}
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR "enable_event"
+#define DISABLE_EVENT_STR "disable_event"
+
+struct enable_trigger_data {
+ struct ftrace_event_file *file;
+ bool enable;
+};
+
+static void
+event_enable_trigger(struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ if (enable_data->enable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+ else
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+}
+
+static void
+event_enable_count_trigger(struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ if (!data->count)
+ return;
+
+ /* Skip if the event is in a state we want to switch to */
+ if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ event_enable_trigger(data);
+}
+
+static int
+event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ seq_printf(m, "%s:%s:%s",
+ enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ enable_data->file->event_call->class->system,
+ ftrace_event_name(enable_data->file->event_call));
+
+ if (data->count == -1)
+ seq_puts(m, ":unlimited");
+ else
+ seq_printf(m, ":count=%ld", data->count);
+
+ if (data->filter_str)
+ seq_printf(m, " if %s\n", data->filter_str);
+ else
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static void
+event_enable_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref) {
+ /* Remove the SOFT_MODE flag */
+ trace_event_enable_disable(enable_data->file, 0, 1);
+ module_put(enable_data->file->event_call->mod);
+ trigger_data_free(data);
+ kfree(enable_data);
+ }
+}
+
+static struct event_trigger_ops event_enable_trigger_ops = {
+ .func = event_enable_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops event_enable_count_trigger_ops = {
+ .func = event_enable_count_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops event_disable_trigger_ops = {
+ .func = event_enable_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops event_disable_count_trigger_ops = {
+ .func = event_enable_count_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static int
+event_enable_trigger_func(struct event_command *cmd_ops,
+ struct ftrace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ struct ftrace_event_file *event_enable_file;
+ struct enable_trigger_data *enable_data;
+ struct event_trigger_data *trigger_data;
+ struct event_trigger_ops *trigger_ops;
+ struct trace_array *tr = file->tr;
+ const char *system;
+ const char *event;
+ char *trigger;
+ char *number;
+ bool enable;
+ int ret;
+
+ if (!param)
+ return -EINVAL;
+
+ /* separate the trigger from the filter (s:e:n [if filter]) */
+ trigger = strsep(&param, " \t");
+ if (!trigger)
+ return -EINVAL;
+
+ system = strsep(&trigger, ":");
+ if (!trigger)
+ return -EINVAL;
+
+ event = strsep(&trigger, ":");
+
+ ret = -EINVAL;
+ event_enable_file = find_event_file(tr, system, event);
+ if (!event_enable_file)
+ goto out;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+ ret = -ENOMEM;
+ trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ if (!trigger_data)
+ goto out;
+
+ enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
+ if (!enable_data) {
+ kfree(trigger_data);
+ goto out;
+ }
+
+ trigger_data->count = -1;
+ trigger_data->ops = trigger_ops;
+ trigger_data->cmd_ops = cmd_ops;
+ INIT_LIST_HEAD(&trigger_data->list);
+ RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+ enable_data->enable = enable;
+ enable_data->file = event_enable_file;
+ trigger_data->private_data = enable_data;
+
+ if (glob[0] == '!') {
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ kfree(trigger_data);
+ kfree(enable_data);
+ ret = 0;
+ goto out;
+ }
+
+ if (trigger) {
+ number = strsep(&trigger, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &trigger_data->count);
+ if (ret)
+ goto out_free;
+ }
+
+ if (!param) /* if param is non-empty, it's supposed to be a filter */
+ goto out_reg;
+
+ if (!cmd_ops->set_filter)
+ goto out_reg;
+
+ ret = cmd_ops->set_filter(param, trigger_data, file);
+ if (ret < 0)
+ goto out_free;
+
+ out_reg:
+ /* Don't let event modules unload while probe registered */
+ ret = try_module_get(event_enable_file->event_call->mod);
+ if (!ret) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = trace_event_enable_disable(event_enable_file, 1, 1);
+ if (ret < 0)
+ goto out_put;
+ ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ ret = -ENOENT;
+ goto out_disable;
+ } else if (ret < 0)
+ goto out_disable;
+ /* Just return zero, not the number of enabled functions */
+ ret = 0;
+ out:
+ return ret;
+
+ out_disable:
+ trace_event_enable_disable(event_enable_file, 0, 1);
+ out_put:
+ module_put(event_enable_file->event_call->mod);
+ out_free:
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+ kfree(trigger_data);
+ kfree(enable_data);
+ goto out;
+}
+
+static int event_enable_register_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+ struct enable_trigger_data *test_enable_data;
+ struct event_trigger_data *test;
+ int ret = 0;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ test_enable_data = test->private_data;
+ if (test_enable_data &&
+ (test_enable_data->file == enable_data->file)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (data->ops->init) {
+ ret = data->ops->init(data->ops, data);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_add_rcu(&data->list, &file->triggers);
+ ret++;
+
+ if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ list_del_rcu(&data->list);
+ ret--;
+ }
+ update_cond_flag(file);
+out:
+ return ret;
+}
+
+static void event_enable_unregister_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct ftrace_event_file *file)
+{
+ struct enable_trigger_data *test_enable_data = test->private_data;
+ struct enable_trigger_data *enable_data;
+ struct event_trigger_data *data;
+ bool unregistered = false;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ enable_data = data->private_data;
+ if (enable_data &&
+ (enable_data->file == test_enable_data->file)) {
+ unregistered = true;
+ list_del_rcu(&data->list);
+ update_cond_flag(file);
+ trace_event_trigger_enable_disable(file, 0);
+ break;
+ }
+ }
+
+ if (unregistered && data->ops->free)
+ data->ops->free(data->ops, data);
+}
+
+static struct event_trigger_ops *
+event_enable_get_trigger_ops(char *cmd, char *param)
+{
+ struct event_trigger_ops *ops;
+ bool enable;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ if (enable)
+ ops = param ? &event_enable_count_trigger_ops :
+ &event_enable_trigger_ops;
+ else
+ ops = param ? &event_disable_count_trigger_ops :
+ &event_disable_trigger_ops;
+
+ return ops;
+}
+
+static struct event_command trigger_enable_cmd = {
+ .name = ENABLE_EVENT_STR,
+ .trigger_type = ETT_EVENT_ENABLE,
+ .func = event_enable_trigger_func,
+ .reg = event_enable_register_trigger,
+ .unreg = event_enable_unregister_trigger,
+ .get_trigger_ops = event_enable_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static struct event_command trigger_disable_cmd = {
+ .name = DISABLE_EVENT_STR,
+ .trigger_type = ETT_EVENT_ENABLE,
+ .func = event_enable_trigger_func,
+ .reg = event_enable_register_trigger,
+ .unreg = event_enable_unregister_trigger,
+ .get_trigger_ops = event_enable_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init void unregister_trigger_enable_disable_cmds(void)
+{
+ unregister_event_command(&trigger_enable_cmd);
+ unregister_event_command(&trigger_disable_cmd);
+}
+
+static __init int register_trigger_enable_disable_cmds(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_enable_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_event_command(&trigger_disable_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_trigger_enable_disable_cmds();
+
+ return ret;
+}
+
+static __init int register_trigger_traceon_traceoff_cmds(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_traceon_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_event_command(&trigger_traceoff_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_trigger_traceon_traceoff_cmds();
+
+ return ret;
+}
+
+__init int register_trigger_cmds(void)
+{
+ register_trigger_traceon_traceoff_cmds();
+ register_trigger_snapshot_cmd();
+ register_trigger_stacktrace_cmd();
+ register_trigger_enable_disable_cmds();
+
+ return 0;
+}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index bbeec31e0ae..d4ddde28a81 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,6 +18,16 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ftrace
+/*
+ * The FTRACE_ENTRY_REG macro allows ftrace entry to define register
+ * function and thus become accesible via perf.
+ */
+#undef FTRACE_ENTRY_REG
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
+ filter, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
/* not needed for this file */
#undef __field_struct
#define __field_struct(type, item)
@@ -44,21 +54,22 @@
#define F_printk(fmt, args...) fmt, args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
-struct ____ftrace_##name { \
- tstruct \
-}; \
-static void __always_unused ____ftrace_check_##name(void) \
-{ \
- struct ____ftrace_##name *__entry = NULL; \
- \
- /* force compile-time check on F_printk() */ \
- printk(print); \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+struct ____ftrace_##name { \
+ tstruct \
+}; \
+static void __always_unused ____ftrace_check_##name(void) \
+{ \
+ struct ____ftrace_##name *__entry = NULL; \
+ \
+ /* force compile-time check on F_printk() */ \
+ printk(print); \
}
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
#include "trace_entries.h"
@@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), filter_type); \
if (ret) \
return ret;
@@ -77,22 +88,19 @@ static void __always_unused ____ftrace_check_##name(void) \
offsetof(typeof(field), \
container.item), \
sizeof(field.container.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), filter_type); \
if (ret) \
return ret;
#undef __array
#define __array(type, item, len) \
do { \
+ char *type_str = #type"["__stringify(len)"]"; \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- mutex_lock(&event_storage_mutex); \
- snprintf(event_storage, sizeof(event_storage), \
- "%s[%d]", #type, len); \
- ret = trace_define_field(event_call, event_storage, #item, \
+ ret = trace_define_field(event_call, type_str, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
- is_signed_type(type), FILTER_OTHER); \
- mutex_unlock(&event_storage_mutex); \
+ is_signed_type(type), filter_type); \
if (ret) \
return ret; \
} while (0);
@@ -104,7 +112,7 @@ static void __always_unused ____ftrace_check_##name(void) \
offsetof(typeof(field), \
container.item), \
sizeof(field.container.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), filter_type); \
if (ret) \
return ret;
@@ -112,17 +120,18 @@ static void __always_unused ____ftrace_check_##name(void) \
#define __dynamic_array(type, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
- 0, is_signed_type(type), FILTER_OTHER);\
+ 0, is_signed_type(type), filter_type);\
if (ret) \
return ret;
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
-int \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+static int __init \
ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
{ \
struct struct_name field; \
int ret; \
+ int filter_type = filter; \
\
tstruct; \
\
@@ -150,24 +159,39 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#define __dynamic_array(type, item)
#undef F_printk
-#define F_printk(fmt, args...) #fmt ", " __stringify(args)
+#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
-#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
+#undef FTRACE_ENTRY_REG
+#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
+ regfn) \
\
-struct ftrace_event_class event_class_ftrace_##call = { \
+struct ftrace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
+ .reg = regfn, \
}; \
\
struct ftrace_event_call __used event_##call = { \
- .name = #call, \
- .event.type = etype, \
.class = &event_class_ftrace_##call, \
+ { \
+ .name = #call, \
+ }, \
+ .event.type = etype, \
.print_fmt = print, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
}; \
struct ftrace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
+ FTRACE_ENTRY_REG(call, struct_name, etype, \
+ PARAMS(tstruct), PARAMS(print), filter, NULL)
+
+int ftrace_event_is_function(struct ftrace_event_call *call)
+{
+ return call == &event_function;
+}
+
#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db0..57f0ec962d2 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,115 +7,164 @@
* Based on code from the latency_tracer, that is:
*
* Copyright (C) 2004-2006 Ingo Molnar
- * Copyright (C) 2004 William Lee Irwin III
+ * Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/ring_buffer.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/slab.h>
#include <linux/fs.h>
#include "trace.h"
-/* function tracing enabled */
-static int ftrace_function_enabled;
+static void tracing_start_function_trace(struct trace_array *tr);
+static void tracing_stop_function_trace(struct trace_array *tr);
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs);
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs);
+static struct tracer_flags func_flags;
+
+/* Our option */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static int allocate_ftrace_ops(struct trace_array *tr)
+{
+ struct ftrace_ops *ops;
+
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
-static struct trace_array *func_trace;
+ /* Currently only the non stack verision is supported */
+ ops->func = function_trace_call;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
-static void tracing_start_function_trace(void);
-static void tracing_stop_function_trace(void);
+ tr->ops = ops;
+ ops->private = tr;
+ return 0;
+}
+
+
+int ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent)
+{
+ int ret;
+
+ /*
+ * The top level array uses the "global_ops", and the files are
+ * created on boot up.
+ */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
+ ret = allocate_ftrace_ops(tr);
+ if (ret)
+ return ret;
+
+ ftrace_create_filter_files(tr->ops, parent);
+
+ return 0;
+}
+
+void ftrace_destroy_function_files(struct trace_array *tr)
+{
+ ftrace_destroy_filter_files(tr->ops);
+ kfree(tr->ops);
+ tr->ops = NULL;
+}
static int function_trace_init(struct trace_array *tr)
{
- func_trace = tr;
- tr->cpu = get_cpu();
+ ftrace_func_t func;
+
+ /*
+ * Instance trace_arrays get their ops allocated
+ * at instance creation. Unless it failed
+ * the allocation.
+ */
+ if (!tr->ops)
+ return -ENOMEM;
+
+ /* Currently only the global instance can do stack tracing */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
+ func_flags.val & TRACE_FUNC_OPT_STACK)
+ func = function_stack_trace_call;
+ else
+ func = function_trace_call;
+
+ ftrace_init_array_ops(tr, func);
+
+ tr->trace_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
- tracing_start_function_trace();
+ tracing_start_function_trace(tr);
return 0;
}
static void function_trace_reset(struct trace_array *tr)
{
- tracing_stop_function_trace();
+ tracing_stop_function_trace(tr);
tracing_stop_cmdline_record();
+ ftrace_reset_array_ops(tr);
}
static void function_trace_start(struct trace_array *tr)
{
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
}
static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
- struct trace_array *tr = func_trace;
+ struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
+ int bit;
int cpu;
int pc;
- if (unlikely(!ftrace_function_enabled))
+ if (unlikely(!tr->function_enabled))
return;
pc = preempt_count();
preempt_disable_notrace();
- local_save_flags(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- trace_function(tr, ip, parent_ip, flags, pc);
- atomic_dec(&data->disabled);
- preempt_enable_notrace();
-}
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ if (bit < 0)
+ goto out;
-static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
-{
- struct trace_array *tr = func_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- if (unlikely(!ftrace_function_enabled))
- return;
-
- /*
- * Need to use raw, since this must be called before the
- * recursive protection is performed.
- */
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1)) {
- pc = preempt_count();
+ cpu = smp_processor_id();
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ if (!atomic_read(&data->disabled)) {
+ local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
}
+ trace_clear_recursion(bit);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ out:
+ preempt_enable_notrace();
}
static void
-function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
- struct trace_array *tr = func_trace;
+ struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
- if (unlikely(!ftrace_function_enabled))
+ if (unlikely(!tr->function_enabled))
return;
/*
@@ -124,7 +173,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
@@ -145,24 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
local_irq_restore(flags);
}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = function_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
-};
-
-static struct ftrace_ops trace_stack_ops __read_mostly =
-{
- .func = function_stack_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
-};
-
-/* Our two options */
-enum {
- TRACE_FUNC_OPT_STACK = 0x1,
-};
-
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -175,129 +206,159 @@ static struct tracer_flags func_flags = {
.opts = func_opts
};
-static void tracing_start_function_trace(void)
+static void tracing_start_function_trace(struct trace_array *tr)
{
- ftrace_function_enabled = 0;
-
- if (trace_flags & TRACE_ITER_PREEMPTONLY)
- trace_ops.func = function_trace_call_preempt_only;
- else
- trace_ops.func = function_trace_call;
-
- if (func_flags.val & TRACE_FUNC_OPT_STACK)
- register_ftrace_function(&trace_stack_ops);
- else
- register_ftrace_function(&trace_ops);
-
- ftrace_function_enabled = 1;
+ tr->function_enabled = 0;
+ register_ftrace_function(tr->ops);
+ tr->function_enabled = 1;
}
-static void tracing_stop_function_trace(void)
+static void tracing_stop_function_trace(struct trace_array *tr)
{
- ftrace_function_enabled = 0;
-
- if (func_flags.val & TRACE_FUNC_OPT_STACK)
- unregister_ftrace_function(&trace_stack_ops);
- else
- unregister_ftrace_function(&trace_ops);
+ tr->function_enabled = 0;
+ unregister_ftrace_function(tr->ops);
}
-static int func_set_flag(u32 old_flags, u32 bit, int set)
+static int
+func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
- if (bit == TRACE_FUNC_OPT_STACK) {
+ switch (bit) {
+ case TRACE_FUNC_OPT_STACK:
/* do nothing if already set */
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
- return 0;
+ break;
+
+ unregister_ftrace_function(tr->ops);
if (set) {
- unregister_ftrace_function(&trace_ops);
- register_ftrace_function(&trace_stack_ops);
+ tr->ops->func = function_stack_trace_call;
+ register_ftrace_function(tr->ops);
} else {
- unregister_ftrace_function(&trace_stack_ops);
- register_ftrace_function(&trace_ops);
+ tr->ops->func = function_trace_call;
+ register_ftrace_function(tr->ops);
}
- return 0;
+ break;
+ default:
+ return -EINVAL;
}
- return -EINVAL;
+ return 0;
}
-static struct tracer function_trace __read_mostly =
+static struct tracer function_trace __tracer_data =
{
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
- .wait_pipe = poll_wait_pipe,
.flags = &func_flags,
.set_flag = func_set_flag,
+ .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
#endif
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static void
-ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+static int update_count(void **data)
{
- long *count = (long *)data;
-
- if (tracing_is_on())
- return;
+ unsigned long *count = (long *)data;
if (!*count)
- return;
+ return 0;
if (*count != -1)
(*count)--;
- tracing_on();
+ return 1;
}
static void
-ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- long *count = (long *)data;
+ if (tracing_is_on())
+ return;
+
+ if (update_count(data))
+ tracing_on();
+}
+static void
+ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
if (!tracing_is_on())
return;
- if (!*count)
+ if (update_count(data))
+ tracing_off();
+}
+
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (tracing_is_on())
return;
- if (*count != -1)
- (*count)--;
+ tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (!tracing_is_on())
+ return;
tracing_off();
}
-static int
-ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data);
+/*
+ * Skip 4:
+ * ftrace_stacktrace()
+ * function_trace_probe_call()
+ * ftrace_ops_list_func()
+ * ftrace_call()
+ */
+#define STACK_SKIP 4
-static struct ftrace_probe_ops traceon_probe_ops = {
- .func = ftrace_traceon,
- .print = ftrace_trace_onoff_print,
-};
+static void
+ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ trace_dump_stack(STACK_SKIP);
+}
-static struct ftrace_probe_ops traceoff_probe_ops = {
- .func = ftrace_traceoff,
- .print = ftrace_trace_onoff_print,
-};
+static void
+ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (!tracing_is_on())
+ return;
+
+ if (update_count(data))
+ trace_dump_stack(STACK_SKIP);
+}
+
+static void
+ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (update_count(data))
+ ftrace_dump(DUMP_ALL);
+}
+
+/* Only dump the current CPU buffer. */
+static void
+ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (update_count(data))
+ ftrace_dump(DUMP_ORIG);
+}
static int
-ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data)
+ftrace_probe_print(const char *name, struct seq_file *m,
+ unsigned long ip, void *data)
{
long count = (long)data;
- seq_printf(m, "%ps:", (void *)ip);
-
- if (ops == &traceon_probe_ops)
- seq_printf(m, "traceon");
- else
- seq_printf(m, "traceoff");
+ seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
seq_printf(m, ":unlimited\n");
@@ -308,26 +369,85 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
}
static int
-ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
+ftrace_traceon_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
{
- struct ftrace_probe_ops *ops;
+ return ftrace_probe_print("traceon", m, ip, data);
+}
- /* we register both traceon and traceoff to this callback */
- if (strcmp(cmd, "traceon") == 0)
- ops = &traceon_probe_ops;
- else
- ops = &traceoff_probe_ops;
+static int
+ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("traceoff", m, ip, data);
+}
- unregister_ftrace_function_probe_func(glob, ops);
+static int
+ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("stacktrace", m, ip, data);
+}
- return 0;
+static int
+ftrace_dump_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("dump", m, ip, data);
}
static int
-ftrace_trace_onoff_callback(struct ftrace_hash *hash,
- char *glob, char *cmd, char *param, int enable)
+ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("cpudump", m, ip, data);
+}
+
+static struct ftrace_probe_ops traceon_count_probe_ops = {
+ .func = ftrace_traceon_count,
+ .print = ftrace_traceon_print,
+};
+
+static struct ftrace_probe_ops traceoff_count_probe_ops = {
+ .func = ftrace_traceoff_count,
+ .print = ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_count_probe_ops = {
+ .func = ftrace_stacktrace_count,
+ .print = ftrace_stacktrace_print,
+};
+
+static struct ftrace_probe_ops dump_probe_ops = {
+ .func = ftrace_dump_probe,
+ .print = ftrace_dump_print,
+};
+
+static struct ftrace_probe_ops cpudump_probe_ops = {
+ .func = ftrace_cpudump_probe,
+ .print = ftrace_cpudump_print,
+};
+
+static struct ftrace_probe_ops traceon_probe_ops = {
+ .func = ftrace_traceon,
+ .print = ftrace_traceon_print,
+};
+
+static struct ftrace_probe_ops traceoff_probe_ops = {
+ .func = ftrace_traceoff,
+ .print = ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_probe_ops = {
+ .func = ftrace_stacktrace,
+ .print = ftrace_stacktrace_print,
+};
+
+static int
+ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
+ struct ftrace_hash *hash, char *glob,
+ char *cmd, char *param, int enable)
{
- struct ftrace_probe_ops *ops;
void *count = (void *)-1;
char *number;
int ret;
@@ -336,14 +456,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
if (!enable)
return -EINVAL;
- if (glob[0] == '!')
- return ftrace_trace_onoff_unreg(glob+1, cmd, param);
-
- /* we register both traceon and traceoff to this callback */
- if (strcmp(cmd, "traceon") == 0)
- ops = &traceon_probe_ops;
- else
- ops = &traceoff_probe_ops;
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ return 0;
+ }
if (!param)
goto out_reg;
@@ -357,7 +473,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
* We use the callback data field (which is a pointer)
* as our counter.
*/
- ret = strict_strtoul(number, 0, (unsigned long *)&count);
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
if (ret)
return ret;
@@ -367,6 +483,60 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
return ret < 0 ? ret : 0;
}
+static int
+ftrace_trace_onoff_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
+ else
+ ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
+
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ param, enable);
+}
+
+static int
+ftrace_stacktrace_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
+
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ param, enable);
+}
+
+static int
+ftrace_dump_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = &dump_probe_ops;
+
+ /* Only dump once. */
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ "1", enable);
+}
+
+static int
+ftrace_cpudump_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = &cpudump_probe_ops;
+
+ /* Only dump once. */
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ "1", enable);
+}
+
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
@@ -377,6 +547,21 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
.func = ftrace_trace_onoff_callback,
};
+static struct ftrace_func_command ftrace_stacktrace_cmd = {
+ .name = "stacktrace",
+ .func = ftrace_stacktrace_callback,
+};
+
+static struct ftrace_func_command ftrace_dump_cmd = {
+ .name = "dump",
+ .func = ftrace_dump_callback,
+};
+
+static struct ftrace_func_command ftrace_cpudump_cmd = {
+ .name = "cpudump",
+ .func = ftrace_cpudump_callback,
+};
+
static int __init init_func_cmd_traceon(void)
{
int ret;
@@ -387,7 +572,31 @@ static int __init init_func_cmd_traceon(void)
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
- unregister_ftrace_command(&ftrace_traceoff_cmd);
+ goto out_free_traceoff;
+
+ ret = register_ftrace_command(&ftrace_stacktrace_cmd);
+ if (ret)
+ goto out_free_traceon;
+
+ ret = register_ftrace_command(&ftrace_dump_cmd);
+ if (ret)
+ goto out_free_stacktrace;
+
+ ret = register_ftrace_command(&ftrace_cpudump_cmd);
+ if (ret)
+ goto out_free_dump;
+
+ return 0;
+
+ out_free_dump:
+ unregister_ftrace_command(&ftrace_dump_cmd);
+ out_free_stacktrace:
+ unregister_ftrace_command(&ftrace_stacktrace_cmd);
+ out_free_traceon:
+ unregister_ftrace_command(&ftrace_traceon_cmd);
+ out_free_traceoff:
+ unregister_ftrace_command(&ftrace_traceoff_cmd);
+
return ret;
}
#else
@@ -402,5 +611,4 @@ static __init int init_function_trace(void)
init_func_cmd_traceon();
return register_tracer(&function_trace);
}
-device_initcall(init_function_trace);
-
+core_initcall(init_function_trace);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d..4de3e57f723 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -38,14 +38,7 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
-/* Flag options */
-#define TRACE_GRAPH_PRINT_OVERRUN 0x1
-#define TRACE_GRAPH_PRINT_CPU 0x2
-#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
-#define TRACE_GRAPH_PRINT_PROC 0x8
-#define TRACE_GRAPH_PRINT_DURATION 0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
-#define TRACE_GRAPH_PRINT_IRQS 0x40
+static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
@@ -62,11 +55,13 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
/* Display interrupts */
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
+ /* Display function name after trailing } */
+ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
- /* Don't display overruns and proc by default */
+ /* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
.opts = trace_opts
@@ -80,9 +75,9 @@ static struct trace_array *graph_array;
* to fill in space into DURATION column.
*/
enum {
- DURATION_FILL_FULL = -1,
- DURATION_FILL_START = -2,
- DURATION_FILL_END = -3,
+ FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT,
+ FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT,
+ FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
};
static enum print_line_t
@@ -112,16 +107,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
return -EBUSY;
}
+ /*
+ * The curr_ret_stack is an index to ftrace return stack of
+ * current task. Its value should be in [0, FTRACE_RETFUNC_
+ * DEPTH) when the function graph tracer is used. To support
+ * filtering out specific functions, it makes the index
+ * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
+ * so when it sees a negative index the ftrace will ignore
+ * the record. And the index gets recovered when returning
+ * from the filtered function by adding the FTRACE_NOTRACE_
+ * DEPTH and then it'll continue to record functions normally.
+ *
+ * The curr_ret_stack is initialized to -1 and get increased
+ * in this function. So it can be less than -1 only if it was
+ * filtered out via ftrace_graph_notrace_addr() which can be
+ * set from set_graph_notrace file in debugfs by user.
+ */
+ if (current->curr_ret_stack < -1)
+ return -EBUSY;
+
calltime = trace_clock_local();
index = ++current->curr_ret_stack;
+ if (ftrace_graph_notrace_addr(func))
+ current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
barrier();
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = calltime;
current->ret_stack[index].subtime = 0;
current->ret_stack[index].fp = frame_pointer;
- *depth = index;
+ *depth = current->curr_ret_stack;
return 0;
}
@@ -135,7 +151,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
index = current->curr_ret_stack;
- if (unlikely(index < 0)) {
+ /*
+ * A negative index here means that it's just returned from a
+ * notrace'd function. Recover index to get an original
+ * return address. See ftrace_push_return_trace().
+ *
+ * TODO: Need to check whether the stack gets corrupted.
+ */
+ if (index < 0)
+ index += FTRACE_NOTRACE_DEPTH;
+
+ if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
ftrace_graph_stop();
WARN_ON(1);
/* Might as well panic, otherwise we have no where to go */
@@ -143,7 +169,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
return;
}
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
/*
* The arch may choose to record the frame pointer used
* and check it here to make sure that it is what we expect it
@@ -154,6 +180,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
*
* Currently, x86_32 with optimize for size (-Os) makes the latest
* gcc do the above.
+ *
+ * Note, -mfentry does not use frame pointers, and this test
+ * is not needed if CC_USING_FENTRY is set.
*/
if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
ftrace_graph_stop();
@@ -186,9 +215,24 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
barrier();
current->curr_ret_stack--;
+ /*
+ * The curr_ret_stack can be less than -1 only if it was
+ * filtered out and it's about to return from the function.
+ * Recover the index and continue to trace normal functions.
+ */
+ if (current->curr_ret_stack < -1) {
+ current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
+ return ret;
+ }
+
+ /*
+ * The trace should run after decrementing the ret counter
+ * in case an interrupt were to come in. We don't want to
+ * lose the interrupt if max_depth is set.
+ */
+ ftrace_graph_return(&trace);
if (unlikely(!ret)) {
ftrace_graph_stop();
@@ -207,7 +251,7 @@ int __trace_graph_entry(struct trace_array *tr,
{
struct ftrace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -219,8 +263,8 @@ int __trace_graph_entry(struct trace_array *tr,
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
- if (!filter_current_check_discard(buffer, call, entry, event))
- ring_buffer_unlock_commit(buffer, event);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
return 1;
}
@@ -247,13 +291,24 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return 0;
/* trace it when it is-nested-in or is a function enabled. */
- if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs())
+ if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
+ ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
+ (max_depth && trace->depth >= max_depth))
return 0;
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
+ if (ftrace_graph_notrace_addr(trace->func))
+ return 1;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -311,7 +366,7 @@ void __trace_graph_return(struct trace_array *tr,
{
struct ftrace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -323,8 +378,8 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
- if (!filter_current_check_discard(buffer, call, entry, event))
- ring_buffer_unlock_commit(buffer, event);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
}
void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -338,7 +393,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -434,7 +489,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
/* First spaces to align center */
for (i = 0; i < spaces / 2; i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -445,7 +500,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
/* Last spaces to align center */
for (i = 0; i < spaces - (spaces / 2); i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -491,7 +546,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
------------------------------------------
*/
- ret = trace_seq_printf(s,
+ ret = trace_seq_puts(s,
" ------------------------------------------\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -504,7 +559,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, " => ");
+ ret = trace_seq_puts(s, " => ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -512,7 +567,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s,
+ ret = trace_seq_puts(s,
"\n ------------------------------------------\n\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -538,7 +593,7 @@ get_return_for_leaf(struct trace_iterator *iter,
next = &data->ret;
} else {
- ring_iter = iter->buffer_iter[iter->cpu];
+ ring_iter = trace_buffer_iter(iter, iter->cpu);
/* First peek to compare current entry and the next one */
if (ring_iter)
@@ -548,9 +603,9 @@ get_return_for_leaf(struct trace_iterator *iter,
* We need to consume the current entry to see
* the next one.
*/
- ring_buffer_consume(iter->tr->buffer, iter->cpu,
+ ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
NULL, NULL);
- event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+ event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
NULL, NULL);
}
@@ -633,30 +688,30 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, " | ");
+ ret = trace_seq_puts(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
}
/* No overhead */
- ret = print_graph_duration(DURATION_FILL_START, s, flags);
+ ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
if (ret != TRACE_TYPE_HANDLED)
return ret;
if (type == TRACE_GRAPH_ENT)
- ret = trace_seq_printf(s, "==========>");
+ ret = trace_seq_puts(s, "==========>");
else
- ret = trace_seq_printf(s, "<==========");
+ ret = trace_seq_puts(s, "<==========");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- ret = print_graph_duration(DURATION_FILL_END, s, flags);
+ ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
if (ret != TRACE_TYPE_HANDLED)
return ret;
- ret = trace_seq_printf(s, "\n");
+ ret = trace_seq_putc(s, '\n');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -693,13 +748,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
len += strlen(nsecs_str);
}
- ret = trace_seq_printf(s, " us ");
+ ret = trace_seq_puts(s, " us ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Print remaining spaces to fit the row's width */
for (i = len; i < 7; i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -717,15 +772,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
return TRACE_TYPE_HANDLED;
/* No real adata, just filling the column with spaces */
- switch (duration) {
- case DURATION_FILL_FULL:
- ret = trace_seq_printf(s, " | ");
+ switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
+ case FLAGS_FILL_FULL:
+ ret = trace_seq_puts(s, " | ");
return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
- case DURATION_FILL_START:
- ret = trace_seq_printf(s, " ");
+ case FLAGS_FILL_START:
+ ret = trace_seq_puts(s, " ");
return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
- case DURATION_FILL_END:
- ret = trace_seq_printf(s, " |");
+ case FLAGS_FILL_END:
+ ret = trace_seq_puts(s, " |");
return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
}
@@ -733,10 +788,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
/* Duration exceeded 100 msecs */
if (duration > 100000ULL)
- ret = trace_seq_printf(s, "! ");
+ ret = trace_seq_puts(s, "! ");
/* Duration exceeded 10 msecs */
else if (duration > 10000ULL)
- ret = trace_seq_printf(s, "+ ");
+ ret = trace_seq_puts(s, "+ ");
}
/*
@@ -745,7 +800,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
* to fill out the space.
*/
if (ret == -1)
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_puts(s, " ");
/* Catching here any failure happenned above */
if (!ret)
@@ -755,7 +810,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
if (ret != TRACE_TYPE_HANDLED)
return ret;
- ret = trace_seq_printf(s, "| ");
+ ret = trace_seq_puts(s, "| ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -805,7 +860,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -840,13 +895,13 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
+ ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
if (ret != TRACE_TYPE_HANDLED)
return ret;
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -905,7 +960,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, " | ");
+ ret = trace_seq_puts(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -1105,7 +1160,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -1114,10 +1169,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* If the return function does not have a matching entry,
* then the entry was lost. Instead of just printing
* the '}' and letting the user guess what function this
- * belongs to, write out the function name.
+ * belongs to, write out the function name. Always do
+ * that if the funcgraph-tail option is enabled.
*/
- if (func_match) {
- ret = trace_seq_printf(s, "}\n");
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
+ ret = trace_seq_puts(s, "}\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
} else {
@@ -1160,20 +1216,20 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
- ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
+ ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
if (ret != TRACE_TYPE_HANDLED)
return ret;
/* Indentation */
if (depth > 0)
for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_printf(s, " ");
+ ret = trace_seq_putc(s, ' ');
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* The comment */
- ret = trace_seq_printf(s, "/* ");
+ ret = trace_seq_puts(s, "/* ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -1204,7 +1260,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
s->len--;
}
- ret = trace_seq_printf(s, " */\n");
+ ret = trace_seq_puts(s, " */\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -1414,7 +1470,8 @@ void graph_trace_close(struct trace_iterator *iter)
}
}
-static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+static int
+func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
if (bit == TRACE_GRAPH_PRINT_IRQS)
ftrace_graph_skip_irqs = !set;
@@ -1436,13 +1493,12 @@ static struct trace_event graph_trace_ret_event = {
.funcs = &graph_functions
};
-static struct tracer graph_trace __read_mostly = {
+static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
.pipe_close = graph_trace_close,
- .wait_pipe = poll_wait_pipe,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
@@ -1454,6 +1510,59 @@ static struct tracer graph_trace __read_mostly = {
#endif
};
+
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ max_depth = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+ int n;
+
+ n = sprintf(buf, "%d\n", max_depth);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static const struct file_operations graph_depth_fops = {
+ .open = tracing_open_generic,
+ .write = graph_depth_write,
+ .read = graph_depth_read,
+ .llseek = generic_file_llseek,
+};
+
+static __init int init_graph_debugfs(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("max_graph_depth", 0644, d_tracer,
+ NULL, &graph_depth_fops);
+
+ return 0;
+}
+fs_initcall(init_graph_debugfs);
+
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
@@ -1471,4 +1580,4 @@ static __init int init_graph_trace(void)
return register_tracer(&graph_trace);
}
-device_initcall(init_graph_trace);
+core_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 99d20e92036..9bb104f748d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
* From code in the latency_tracer, that is:
*
* Copyright (C) 2004-2006 Ingo Molnar
- * Copyright (C) 2004 William Lee Irwin III
+ * Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/kallsyms.h>
#include <linux/debugfs.h>
@@ -32,7 +32,8 @@ enum {
static int trace_type __read_mostly;
-static int save_lat_flag;
+static int save_flags;
+static bool function_enabled;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
if (!irqs_disabled_flags(*flags))
return 0;
- *data = tr->data[cpu];
+ *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
@@ -136,7 +137,8 @@ static int func_prolog_dec(struct trace_array *tr,
* irqsoff uses its own tracer function to keep the overhead down:
*/
static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
@@ -149,16 +151,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
atomic_dec(&data->disabled);
}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = irqsoff_tracer_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
-};
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+static int
+irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
int cpu;
@@ -173,8 +170,8 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
for_each_possible_cpu(cpu)
per_cpu(tracing_cpu, cpu) = 0;
- tracing_max_latency = 0;
- tracing_reset_online_cpus(irqsoff_trace);
+ tr->max_latency = 0;
+ tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
return start_irqsoff_tracer(irqsoff_trace, set);
}
@@ -264,7 +261,8 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+static int
+irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
return -EINVAL;
}
@@ -299,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(cycle_t delta)
+static int report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
return 0;
} else {
- if (delta <= tracing_max_latency)
+ if (delta <= tr->max_latency)
return 0;
}
return 1;
@@ -329,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
pc = preempt_count();
- if (!report_latency(delta))
+ if (!report_latency(tr, delta))
goto out;
raw_spin_lock_irqsave(&max_trace_lock, flags);
/* check if we are still the max latency */
- if (!report_latency(delta))
+ if (!report_latency(tr, delta))
goto out_unlock;
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -348,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
data->critical_end = parent_ip;
if (likely(!is_tracing_stopped())) {
- tracing_max_latency = delta;
+ tr->max_latency = delta;
update_max_tr_single(tr, current, cpu);
}
@@ -371,7 +369,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
struct trace_array_cpu *data;
unsigned long flags;
- if (likely(!tracer_enabled))
+ if (!tracer_enabled || !tracing_is_enabled())
return;
cpu = raw_smp_processor_id();
@@ -379,7 +377,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
if (per_cpu(tracing_cpu, cpu))
return;
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) || atomic_read(&data->disabled))
return;
@@ -414,10 +412,10 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
else
return;
- if (!tracer_enabled)
+ if (!tracer_enabled || !tracing_is_enabled())
return;
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
@@ -496,14 +494,14 @@ void trace_hardirqs_off(void)
}
EXPORT_SYMBOL(trace_hardirqs_off);
-void trace_hardirqs_on_caller(unsigned long caller_addr)
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
-void trace_hardirqs_off_caller(unsigned long caller_addr)
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
@@ -527,15 +525,62 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
-static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
- int ret = 0;
+ int ret;
- if (!graph)
- ret = register_ftrace_function(&trace_ops);
- else
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
ret = register_ftrace_graph(&irqsoff_graph_return,
&irqsoff_graph_entry);
+ else
+ ret = register_ftrace_function(tr->ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
+}
+
+static void unregister_irqsoff_function(struct trace_array *tr, int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph();
+ else
+ unregister_ftrace_function(tr->ops);
+
+ function_enabled = false;
+}
+
+static void irqsoff_function_set(struct trace_array *tr, int set)
+{
+ if (set)
+ register_irqsoff_function(tr, is_graph(), 1);
+ else
+ unregister_irqsoff_function(tr, is_graph());
+}
+
+static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
+{
+ struct tracer *tracer = tr->current_trace;
+
+ if (mask & TRACE_ITER_FUNCTION)
+ irqsoff_function_set(tr, set);
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+ int ret;
+
+ ret = register_irqsoff_function(tr, graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -549,33 +594,51 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- if (!graph)
- unregister_ftrace_function(&trace_ops);
- else
- unregister_ftrace_graph();
+ unregister_irqsoff_function(tr, graph);
}
-static void __irqsoff_tracer_init(struct trace_array *tr)
+static bool irqsoff_busy;
+
+static int __irqsoff_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ if (irqsoff_busy)
+ return -EBUSY;
+
+ save_flags = trace_flags;
- tracing_max_latency = 0;
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+
+ tr->max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+ ftrace_init_array_ops(tr, irqsoff_tracer_call);
- if (start_irqsoff_tracer(tr, is_graph()))
+ /* Only toplevel instance supports graph tracing */
+ if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
+ is_graph())))
printk(KERN_ERR "failed to start irqsoff tracer\n");
+
+ irqsoff_busy = true;
+ return 0;
}
static void irqsoff_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_irqsoff_tracer(tr, is_graph());
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ ftrace_reset_array_ops(tr);
+
+ irqsoff_busy = false;
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -593,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF;
- __irqsoff_tracer_init(tr);
- return 0;
+ return __irqsoff_tracer_init(tr);
}
static struct tracer irqsoff_tracer __read_mostly =
{
@@ -603,17 +665,19 @@ static struct tracer irqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
- .print_max = 1,
+ .print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
- .use_max_tr = 1,
+ .allow_instances = true,
+ .use_max_tr = true,
};
# define register_irqsoff(trace) register_tracer(&trace)
#else
@@ -625,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_PREEMPT_OFF;
- __irqsoff_tracer_init(tr);
- return 0;
+ return __irqsoff_tracer_init(tr);
}
static struct tracer preemptoff_tracer __read_mostly =
@@ -636,17 +699,19 @@ static struct tracer preemptoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
- .print_max = 1,
+ .print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
- .use_max_tr = 1,
+ .allow_instances = true,
+ .use_max_tr = true,
};
# define register_preemptoff(trace) register_tracer(&trace)
#else
@@ -660,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
- __irqsoff_tracer_init(tr);
- return 0;
+ return __irqsoff_tracer_init(tr);
}
static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -671,17 +735,19 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
- .print_max = 1,
+ .print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
- .use_max_tr = 1,
+ .allow_instances = true,
+ .use_max_tr = true,
};
# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -697,4 +763,4 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
-device_initcall(init_irqsoff_tracer);
+core_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b..bd90e1b0608 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&iter.tr->data[cpu]->disabled);
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
iter.iter_flags |= TRACE_FILE_LAT_FMT;
iter.pos = -1;
- if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.tr->buffer, cpu);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
@@ -83,7 +83,7 @@ out:
trace_flags = old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
!cpu_online(cpu_file))
return KDB_BADINT;
} else {
- cpu_file = TRACE_PIPE_ALL_CPU;
+ cpu_file = RING_BUFFER_ALL_CPUS;
}
kdb_trap_printk++;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 00d527c945a..282f6e4e553 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,185 +19,134 @@
#include <linux/module.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-#include <linux/debugfs.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/ptrace.h>
-#include <linux/perf_event.h>
-#include <linux/stringify.h>
-#include <linux/limits.h>
-#include <asm/bitsperlong.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-#define MAX_TRACE_ARGS 128
-#define MAX_ARGSTR_LEN 63
-#define MAX_EVENT_NAME_LEN 64
-#define MAX_STRING_SIZE PATH_MAX
+
+#include "trace_probe.h"
+
#define KPROBE_EVENT_SYSTEM "kprobes"
-/* Reserved field names */
-#define FIELD_STRING_IP "__probe_ip"
-#define FIELD_STRING_RETIP "__probe_ret_ip"
-#define FIELD_STRING_FUNC "__probe_func"
-
-const char *reserved_field_names[] = {
- "common_type",
- "common_flags",
- "common_preempt_count",
- "common_pid",
- "common_tgid",
- FIELD_STRING_IP,
- FIELD_STRING_RETIP,
- FIELD_STRING_FUNC,
+/**
+ * Kprobe event core functions
+ */
+struct trace_kprobe {
+ struct list_head list;
+ struct kretprobe rp; /* Use rp.kp for kprobe use */
+ unsigned long nhit;
+ const char *symbol; /* symbol name */
+ struct trace_probe tp;
};
-/* Printing function type */
-typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
- void *);
-#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
-#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
-
-/* Printing in basic type function template */
-#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
-static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, \
- void *data, void *ent)\
-{ \
- return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
-} \
-static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
-
-DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
-DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
-
-/* data_rloc: data relative location, compatible with u32 */
-#define make_data_rloc(len, roffs) \
- (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
-#define get_rloc_len(dl) ((u32)(dl) >> 16)
-#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
-
-static inline void *get_rloc_data(u32 *dl)
+#define SIZEOF_TRACE_KPROBE(n) \
+ (offsetof(struct trace_kprobe, tp.args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+
+static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
- return (u8 *)dl + get_rloc_offs(*dl);
+ return tk->rp.handler != NULL;
}
-/* For data_loc conversion */
-static inline void *get_loc_data(u32 *dl, void *ent)
+static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
{
- return (u8 *)ent + get_rloc_offs(*dl);
+ return tk->symbol ? tk->symbol : "unknown";
}
-/*
- * Convert data_rloc to data_loc:
- * data_rloc stores the offset from data_rloc itself, but data_loc
- * stores the offset from event entry.
- */
-#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
+static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
+{
+ return tk->rp.kp.offset;
+}
-/* For defining macros, define string/string_size types */
-typedef u32 string;
-typedef u32 string_size;
+static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
+{
+ return !!(kprobe_gone(&tk->rp.kp));
+}
-/* Print type function for string type */
-static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
- const char *name,
- void *data, void *ent)
+static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
+ struct module *mod)
{
- int len = *(u32 *)data >> 16;
+ int len = strlen(mod->name);
+ const char *name = trace_kprobe_symbol(tk);
+ return strncmp(mod->name, name, len) == 0 && name[len] == ':';
+}
- if (!len)
- return trace_seq_printf(s, " %s=(fault)", name);
- else
- return trace_seq_printf(s, " %s=\"%s\"", name,
- (const char *)get_loc_data(data, ent));
+static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
+{
+ return !!strchr(trace_kprobe_symbol(tk), ':');
}
-static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
-/* Data fetch function type */
-typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+static int register_kprobe_event(struct trace_kprobe *tk);
+static int unregister_kprobe_event(struct trace_kprobe *tk);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
-struct fetch_param {
- fetch_func_t fn;
- void *data;
+/* Memory fetching by symbol */
+struct symbol_cache {
+ char *symbol;
+ long offset;
+ unsigned long addr;
};
-static __kprobes void call_fetch(struct fetch_param *fprm,
- struct pt_regs *regs, void *dest)
+unsigned long update_symbol_cache(struct symbol_cache *sc)
{
- return fprm->fn(regs, fprm->data, dest);
+ sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+
+ if (sc->addr)
+ sc->addr += sc->offset;
+
+ return sc->addr;
}
-#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
-/*
- * Define macro for basic types - we don't need to define s* types, because
- * we have to care only about bitwidth at recording time.
- */
-#define DEFINE_BASIC_FETCH_FUNCS(method) \
-DEFINE_FETCH_##method(u8) \
-DEFINE_FETCH_##method(u16) \
-DEFINE_FETCH_##method(u32) \
-DEFINE_FETCH_##method(u64)
-
-#define CHECK_FETCH_FUNCS(method, fn) \
- (((FETCH_FUNC_NAME(method, u8) == fn) || \
- (FETCH_FUNC_NAME(method, u16) == fn) || \
- (FETCH_FUNC_NAME(method, u32) == fn) || \
- (FETCH_FUNC_NAME(method, u64) == fn) || \
- (FETCH_FUNC_NAME(method, string) == fn) || \
- (FETCH_FUNC_NAME(method, string_size) == fn)) \
- && (fn != NULL))
-
-/* Data fetch function templates */
-#define DEFINE_FETCH_reg(type) \
-static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
-{ \
- *(type *)dest = (type)regs_get_register(regs, \
- (unsigned int)((unsigned long)offset)); \
+void free_symbol_cache(struct symbol_cache *sc)
+{
+ kfree(sc->symbol);
+ kfree(sc);
+}
+
+struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+ struct symbol_cache *sc;
+
+ if (!sym || strlen(sym) == 0)
+ return NULL;
+
+ sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+ if (!sc)
+ return NULL;
+
+ sc->symbol = kstrdup(sym, GFP_KERNEL);
+ if (!sc->symbol) {
+ kfree(sc);
+ return NULL;
+ }
+ sc->offset = offset;
+ update_symbol_cache(sc);
+
+ return sc;
}
-DEFINE_BASIC_FETCH_FUNCS(reg)
-/* No string on the register */
-#define fetch_reg_string NULL
-#define fetch_reg_string_size NULL
+/*
+ * Kprobes-specific fetch functions
+ */
#define DEFINE_FETCH_stack(type) \
-static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
(unsigned int)((unsigned long)offset)); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
+
DEFINE_BASIC_FETCH_FUNCS(stack)
/* No string on the stack entry */
-#define fetch_stack_string NULL
-#define fetch_stack_string_size NULL
-
-#define DEFINE_FETCH_retval(type) \
-static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
- void *dummy, void *dest) \
-{ \
- *(type *)dest = (type)regs_return_value(regs); \
-}
-DEFINE_BASIC_FETCH_FUNCS(retval)
-/* No string on the retval */
-#define fetch_retval_string NULL
-#define fetch_retval_string_size NULL
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
-static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
void *addr, void *dest) \
{ \
type retval; \
@@ -205,31 +154,37 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
*(type *)dest = 0; \
else \
*(type *)dest = retval; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
+
DEFINE_BASIC_FETCH_FUNCS(memory)
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
-static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
{
long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
u8 *dst = get_rloc_data(dest);
u8 *src = addr;
mm_segment_t old_fs = get_fs();
+
if (!maxlen)
return;
+
/*
* Try to get string again, since the string can be changed while
* probing.
*/
set_fs(KERNEL_DS);
pagefault_disable();
+
do
ret = __copy_from_user_inatomic(dst++, src++, 1);
while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+
dst[-1] = '\0';
pagefault_enable();
set_fs(old_fs);
@@ -237,24 +192,30 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
if (ret < 0) { /* Failed to fetch string */
((u8 *)get_rloc_data(dest))[0] = '\0';
*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
- } else
+ } else {
*(u32 *)dest = make_data_rloc(src - (u8 *)addr,
get_rloc_offs(*(u32 *)dest));
+ }
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
+
/* Return the length of string -- including null terminal byte */
-static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
{
+ mm_segment_t old_fs;
int ret, len = 0;
u8 c;
- mm_segment_t old_fs = get_fs();
+ old_fs = get_fs();
set_fs(KERNEL_DS);
pagefault_disable();
+
do {
ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
+
pagefault_enable();
set_fs(old_fs);
@@ -263,210 +224,33 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
else
*(u32 *)dest = len;
}
-
-/* Memory fetching by symbol */
-struct symbol_cache {
- char *symbol;
- long offset;
- unsigned long addr;
-};
-
-static unsigned long update_symbol_cache(struct symbol_cache *sc)
-{
- sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
- if (sc->addr)
- sc->addr += sc->offset;
- return sc->addr;
-}
-
-static void free_symbol_cache(struct symbol_cache *sc)
-{
- kfree(sc->symbol);
- kfree(sc);
-}
-
-static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
-{
- struct symbol_cache *sc;
-
- if (!sym || strlen(sym) == 0)
- return NULL;
- sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
- if (!sc)
- return NULL;
-
- sc->symbol = kstrdup(sym, GFP_KERNEL);
- if (!sc->symbol) {
- kfree(sc);
- return NULL;
- }
- sc->offset = offset;
-
- update_symbol_cache(sc);
- return sc;
-}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
#define DEFINE_FETCH_symbol(type) \
-static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
- void *data, void *dest) \
+void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
{ \
struct symbol_cache *sc = data; \
if (sc->addr) \
fetch_memory_##type(regs, (void *)sc->addr, dest); \
else \
*(type *)dest = 0; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
+
DEFINE_BASIC_FETCH_FUNCS(symbol)
DEFINE_FETCH_symbol(string)
DEFINE_FETCH_symbol(string_size)
-/* Dereference memory access function */
-struct deref_fetch_param {
- struct fetch_param orig;
- long offset;
-};
-
-#define DEFINE_FETCH_deref(type) \
-static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
- void *data, void *dest) \
-{ \
- struct deref_fetch_param *dprm = data; \
- unsigned long addr; \
- call_fetch(&dprm->orig, regs, &addr); \
- if (addr) { \
- addr += dprm->offset; \
- fetch_memory_##type(regs, (void *)addr, dest); \
- } else \
- *(type *)dest = 0; \
-}
-DEFINE_BASIC_FETCH_FUNCS(deref)
-DEFINE_FETCH_deref(string)
-DEFINE_FETCH_deref(string_size)
-
-static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
-{
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- update_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- update_symbol_cache(data->orig.data);
-}
-
-static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
-{
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- free_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- free_symbol_cache(data->orig.data);
- kfree(data);
-}
-
-/* Bitfield fetch function */
-struct bitfield_fetch_param {
- struct fetch_param orig;
- unsigned char hi_shift;
- unsigned char low_shift;
-};
-
-#define DEFINE_FETCH_bitfield(type) \
-static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
- void *data, void *dest) \
-{ \
- struct bitfield_fetch_param *bprm = data; \
- type buf = 0; \
- call_fetch(&bprm->orig, regs, &buf); \
- if (buf) { \
- buf <<= bprm->hi_shift; \
- buf >>= bprm->low_shift; \
- } \
- *(type *)dest = buf; \
-}
-DEFINE_BASIC_FETCH_FUNCS(bitfield)
-#define fetch_bitfield_string NULL
-#define fetch_bitfield_string_size NULL
-
-static __kprobes void
-update_bitfield_fetch_param(struct bitfield_fetch_param *data)
-{
- /*
- * Don't check the bitfield itself, because this must be the
- * last fetch function.
- */
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- update_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- update_symbol_cache(data->orig.data);
-}
-
-static __kprobes void
-free_bitfield_fetch_param(struct bitfield_fetch_param *data)
-{
- /*
- * Don't check the bitfield itself, because this must be the
- * last fetch function.
- */
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- free_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- free_symbol_cache(data->orig.data);
- kfree(data);
-}
-
-/* Default (unsigned long) fetch type */
-#define __DEFAULT_FETCH_TYPE(t) u##t
-#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
-#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
-#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
-
-/* Fetch types */
-enum {
- FETCH_MTD_reg = 0,
- FETCH_MTD_stack,
- FETCH_MTD_retval,
- FETCH_MTD_memory,
- FETCH_MTD_symbol,
- FETCH_MTD_deref,
- FETCH_MTD_bitfield,
- FETCH_MTD_END,
-};
-
-#define ASSIGN_FETCH_FUNC(method, type) \
- [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
-
-#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- {.name = _name, \
- .size = _size, \
- .is_signed = sign, \
- .print = PRINT_TYPE_FUNC_NAME(ptype), \
- .fmt = PRINT_TYPE_FMT_NAME(ptype), \
- .fmttype = _fmttype, \
- .fetch = { \
-ASSIGN_FETCH_FUNC(reg, ftype), \
-ASSIGN_FETCH_FUNC(stack, ftype), \
-ASSIGN_FETCH_FUNC(retval, ftype), \
-ASSIGN_FETCH_FUNC(memory, ftype), \
-ASSIGN_FETCH_FUNC(symbol, ftype), \
-ASSIGN_FETCH_FUNC(deref, ftype), \
-ASSIGN_FETCH_FUNC(bitfield, ftype), \
- } \
- }
-
-#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
- __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
-
-#define FETCH_TYPE_STRING 0
-#define FETCH_TYPE_STRSIZE 1
+/* kprobes don't support file_offset fetch methods */
+#define fetch_file_offset_u8 NULL
+#define fetch_file_offset_u16 NULL
+#define fetch_file_offset_u32 NULL
+#define fetch_file_offset_u64 NULL
+#define fetch_file_offset_string NULL
+#define fetch_file_offset_string_size NULL
/* Fetch type information table */
-static const struct fetch_type {
- const char *name; /* Name of type */
- size_t size; /* Byte size of type */
- int is_signed; /* Signed flag */
- print_type_func_t print; /* Print functions */
- const char *fmt; /* Fromat string */
- const char *fmttype; /* Name in format file */
- /* Fetch functions */
- fetch_func_t fetch[FETCH_MTD_END];
-} fetch_type_table[] = {
+const struct fetch_type kprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -481,207 +265,49 @@ static const struct fetch_type {
ASSIGN_FETCH_TYPE(s16, u16, 1),
ASSIGN_FETCH_TYPE(s32, u32, 1),
ASSIGN_FETCH_TYPE(s64, u64, 1),
-};
-
-static const struct fetch_type *find_fetch_type(const char *type)
-{
- int i;
- if (!type)
- type = DEFAULT_FETCH_TYPE_STR;
-
- /* Special case: bitfield */
- if (*type == 'b') {
- unsigned long bs;
- type = strchr(type, '/');
- if (!type)
- goto fail;
- type++;
- if (strict_strtoul(type, 0, &bs))
- goto fail;
- switch (bs) {
- case 8:
- return find_fetch_type("u8");
- case 16:
- return find_fetch_type("u16");
- case 32:
- return find_fetch_type("u32");
- case 64:
- return find_fetch_type("u64");
- default:
- goto fail;
- }
- }
-
- for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
- if (strcmp(type, fetch_type_table[i].name) == 0)
- return &fetch_type_table[i];
-fail:
- return NULL;
-}
-
-/* Special function : only accept unsigned long */
-static __kprobes void fetch_stack_address(struct pt_regs *regs,
- void *dummy, void *dest)
-{
- *(unsigned long *)dest = kernel_stack_pointer(regs);
-}
-
-static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
- fetch_func_t orig_fn)
-{
- int i;
-
- if (type != &fetch_type_table[FETCH_TYPE_STRING])
- return NULL; /* Only string type needs size function */
- for (i = 0; i < FETCH_MTD_END; i++)
- if (type->fetch[i] == orig_fn)
- return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
-
- WARN_ON(1); /* This should not happen */
- return NULL;
-}
-
-/**
- * Kprobe event core functions
- */
-
-struct probe_arg {
- struct fetch_param fetch;
- struct fetch_param fetch_size;
- unsigned int offset; /* Offset from argument entry */
- const char *name; /* Name of this argument */
- const char *comm; /* Command of this argument */
- const struct fetch_type *type; /* Type of this argument */
+ ASSIGN_FETCH_TYPE_END
};
-/* Flags for trace_probe */
-#define TP_FLAG_TRACE 1
-#define TP_FLAG_PROFILE 2
-#define TP_FLAG_REGISTERED 4
-
-struct trace_probe {
- struct list_head list;
- struct kretprobe rp; /* Use rp.kp for kprobe use */
- unsigned long nhit;
- unsigned int flags; /* For TP_FLAG_* */
- const char *symbol; /* symbol name */
- struct ftrace_event_class class;
- struct ftrace_event_call call;
- ssize_t size; /* trace entry size */
- unsigned int nr_args;
- struct probe_arg args[];
-};
-
-#define SIZEOF_TRACE_PROBE(n) \
- (offsetof(struct trace_probe, args) + \
- (sizeof(struct probe_arg) * (n)))
-
-
-static __kprobes int trace_probe_is_return(struct trace_probe *tp)
-{
- return tp->rp.handler != NULL;
-}
-
-static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
-{
- return tp->symbol ? tp->symbol : "unknown";
-}
-
-static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
-{
- return tp->rp.kp.offset;
-}
-
-static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
-{
- return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
-}
-
-static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
-{
- return !!(tp->flags & TP_FLAG_REGISTERED);
-}
-
-static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
-{
- return !!(kprobe_gone(&tp->rp.kp));
-}
-
-static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
- struct module *mod)
-{
- int len = strlen(mod->name);
- const char *name = trace_probe_symbol(tp);
- return strncmp(mod->name, name, len) == 0 && name[len] == ':';
-}
-
-static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
-{
- return !!strchr(trace_probe_symbol(tp), ':');
-}
-
-static int register_probe_event(struct trace_probe *tp);
-static void unregister_probe_event(struct trace_probe *tp);
-
-static DEFINE_MUTEX(probe_lock);
-static LIST_HEAD(probe_list);
-
-static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
-static int kretprobe_dispatcher(struct kretprobe_instance *ri,
- struct pt_regs *regs);
-
-/* Check the name is good for event/group/fields */
-static int is_good_name(const char *name)
-{
- if (!isalpha(*name) && *name != '_')
- return 0;
- while (*++name != '\0') {
- if (!isalpha(*name) && !isdigit(*name) && *name != '_')
- return 0;
- }
- return 1;
-}
-
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
-static struct trace_probe *alloc_trace_probe(const char *group,
+static struct trace_kprobe *alloc_trace_kprobe(const char *group,
const char *event,
void *addr,
const char *symbol,
unsigned long offs,
- int nargs, int is_return)
+ int nargs, bool is_return)
{
- struct trace_probe *tp;
+ struct trace_kprobe *tk;
int ret = -ENOMEM;
- tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
- if (!tp)
+ tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
+ if (!tk)
return ERR_PTR(ret);
if (symbol) {
- tp->symbol = kstrdup(symbol, GFP_KERNEL);
- if (!tp->symbol)
+ tk->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tk->symbol)
goto error;
- tp->rp.kp.symbol_name = tp->symbol;
- tp->rp.kp.offset = offs;
+ tk->rp.kp.symbol_name = tk->symbol;
+ tk->rp.kp.offset = offs;
} else
- tp->rp.kp.addr = addr;
+ tk->rp.kp.addr = addr;
if (is_return)
- tp->rp.handler = kretprobe_dispatcher;
+ tk->rp.handler = kretprobe_dispatcher;
else
- tp->rp.kp.pre_handler = kprobe_dispatcher;
+ tk->rp.kp.pre_handler = kprobe_dispatcher;
if (!event || !is_good_name(event)) {
ret = -EINVAL;
goto error;
}
- tp->call.class = &tp->class;
- tp->call.name = kstrdup(event, GFP_KERNEL);
- if (!tp->call.name)
+ tk->tp.call.class = &tk->tp.class;
+ tk->tp.call.name = kstrdup(event, GFP_KERNEL);
+ if (!tk->tp.call.name)
goto error;
if (!group || !is_good_name(group)) {
@@ -689,130 +315,166 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
- tp->class.system = kstrdup(group, GFP_KERNEL);
- if (!tp->class.system)
+ tk->tp.class.system = kstrdup(group, GFP_KERNEL);
+ if (!tk->tp.class.system)
goto error;
- INIT_LIST_HEAD(&tp->list);
- return tp;
+ INIT_LIST_HEAD(&tk->list);
+ INIT_LIST_HEAD(&tk->tp.files);
+ return tk;
error:
- kfree(tp->call.name);
- kfree(tp->symbol);
- kfree(tp);
+ kfree(tk->tp.call.name);
+ kfree(tk->symbol);
+ kfree(tk);
return ERR_PTR(ret);
}
-static void update_probe_arg(struct probe_arg *arg)
-{
- if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
- update_bitfield_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
- update_deref_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
- update_symbol_cache(arg->fetch.data);
-}
-
-static void free_probe_arg(struct probe_arg *arg)
-{
- if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
- free_bitfield_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
- free_deref_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
- free_symbol_cache(arg->fetch.data);
- kfree(arg->name);
- kfree(arg->comm);
-}
-
-static void free_trace_probe(struct trace_probe *tp)
+static void free_trace_kprobe(struct trace_kprobe *tk)
{
int i;
- for (i = 0; i < tp->nr_args; i++)
- free_probe_arg(&tp->args[i]);
+ for (i = 0; i < tk->tp.nr_args; i++)
+ traceprobe_free_probe_arg(&tk->tp.args[i]);
- kfree(tp->call.class->system);
- kfree(tp->call.name);
- kfree(tp->symbol);
- kfree(tp);
+ kfree(tk->tp.call.class->system);
+ kfree(tk->tp.call.name);
+ kfree(tk->symbol);
+ kfree(tk);
}
-static struct trace_probe *find_trace_probe(const char *event,
- const char *group)
+static struct trace_kprobe *find_trace_kprobe(const char *event,
+ const char *group)
{
- struct trace_probe *tp;
+ struct trace_kprobe *tk;
- list_for_each_entry(tp, &probe_list, list)
- if (strcmp(tp->call.name, event) == 0 &&
- strcmp(tp->call.class->system, group) == 0)
- return tp;
+ list_for_each_entry(tk, &probe_list, list)
+ if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
+ strcmp(tk->tp.call.class->system, group) == 0)
+ return tk;
return NULL;
}
-/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
-static int enable_trace_probe(struct trace_probe *tp, int flag)
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int
+enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
{
int ret = 0;
- tp->flags |= flag;
- if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
- !trace_probe_has_gone(tp)) {
- if (trace_probe_is_return(tp))
- ret = enable_kretprobe(&tp->rp);
+ if (file) {
+ struct event_file_link *link;
+
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ link->file = file;
+ list_add_tail_rcu(&link->list, &tk->tp.files);
+
+ tk->tp.flags |= TP_FLAG_TRACE;
+ } else
+ tk->tp.flags |= TP_FLAG_PROFILE;
+
+ if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
+ if (trace_kprobe_is_return(tk))
+ ret = enable_kretprobe(&tk->rp);
else
- ret = enable_kprobe(&tp->rp.kp);
+ ret = enable_kprobe(&tk->rp.kp);
}
-
+ out:
return ret;
}
-/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
-static void disable_trace_probe(struct trace_probe *tp, int flag)
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int
+disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
{
- tp->flags &= ~flag;
- if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
- if (trace_probe_is_return(tp))
- disable_kretprobe(&tp->rp);
+ struct event_file_link *link = NULL;
+ int wait = 0;
+ int ret = 0;
+
+ if (file) {
+ link = find_event_file_link(&tk->tp, file);
+ if (!link) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ list_del_rcu(&link->list);
+ wait = 1;
+ if (!list_empty(&tk->tp.files))
+ goto out;
+
+ tk->tp.flags &= ~TP_FLAG_TRACE;
+ } else
+ tk->tp.flags &= ~TP_FLAG_PROFILE;
+
+ if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) {
+ if (trace_kprobe_is_return(tk))
+ disable_kretprobe(&tk->rp);
else
- disable_kprobe(&tp->rp.kp);
+ disable_kprobe(&tk->rp.kp);
+ wait = 1;
}
+ out:
+ if (wait) {
+ /*
+ * Synchronize with kprobe_trace_func/kretprobe_trace_func
+ * to ensure disabled (all running handlers are finished).
+ * This is not only for kfree(), but also the caller,
+ * trace_remove_event_call() supposes it for releasing
+ * event_call related objects, which will be accessed in
+ * the kprobe_trace_func/kretprobe_trace_func.
+ */
+ synchronize_sched();
+ kfree(link); /* Ignored if link == NULL */
+ }
+
+ return ret;
}
/* Internal register function - just handle k*probes and flags */
-static int __register_trace_probe(struct trace_probe *tp)
+static int __register_trace_kprobe(struct trace_kprobe *tk)
{
int i, ret;
- if (trace_probe_is_registered(tp))
+ if (trace_probe_is_registered(&tk->tp))
return -EINVAL;
- for (i = 0; i < tp->nr_args; i++)
- update_probe_arg(&tp->args[i]);
+ for (i = 0; i < tk->tp.nr_args; i++)
+ traceprobe_update_arg(&tk->tp.args[i]);
/* Set/clear disabled flag according to tp->flag */
- if (trace_probe_is_enabled(tp))
- tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
+ if (trace_probe_is_enabled(&tk->tp))
+ tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
else
- tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+ tk->rp.kp.flags |= KPROBE_FLAG_DISABLED;
- if (trace_probe_is_return(tp))
- ret = register_kretprobe(&tp->rp);
+ if (trace_kprobe_is_return(tk))
+ ret = register_kretprobe(&tk->rp);
else
- ret = register_kprobe(&tp->rp.kp);
+ ret = register_kprobe(&tk->rp.kp);
if (ret == 0)
- tp->flags |= TP_FLAG_REGISTERED;
+ tk->tp.flags |= TP_FLAG_REGISTERED;
else {
pr_warning("Could not insert probe at %s+%lu: %d\n",
- trace_probe_symbol(tp), trace_probe_offset(tp), ret);
- if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
+ trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
+ if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
pr_warning("This probe might be able to register after"
"target module is loaded. Continue.\n");
ret = 0;
} else if (ret == -EILSEQ) {
pr_warning("Probing address(0x%p) is not an "
"instruction boundary.\n",
- tp->rp.kp.addr);
+ tk->rp.kp.addr);
ret = -EINVAL;
}
}
@@ -821,64 +483,68 @@ static int __register_trace_probe(struct trace_probe *tp)
}
/* Internal unregister function - just handle k*probes and flags */
-static void __unregister_trace_probe(struct trace_probe *tp)
+static void __unregister_trace_kprobe(struct trace_kprobe *tk)
{
- if (trace_probe_is_registered(tp)) {
- if (trace_probe_is_return(tp))
- unregister_kretprobe(&tp->rp);
+ if (trace_probe_is_registered(&tk->tp)) {
+ if (trace_kprobe_is_return(tk))
+ unregister_kretprobe(&tk->rp);
else
- unregister_kprobe(&tp->rp.kp);
- tp->flags &= ~TP_FLAG_REGISTERED;
+ unregister_kprobe(&tk->rp.kp);
+ tk->tp.flags &= ~TP_FLAG_REGISTERED;
/* Cleanup kprobe for reuse */
- if (tp->rp.kp.symbol_name)
- tp->rp.kp.addr = NULL;
+ if (tk->rp.kp.symbol_name)
+ tk->rp.kp.addr = NULL;
}
}
/* Unregister a trace_probe and probe_event: call with locking probe_lock */
-static int unregister_trace_probe(struct trace_probe *tp)
+static int unregister_trace_kprobe(struct trace_kprobe *tk)
{
/* Enabled event can not be unregistered */
- if (trace_probe_is_enabled(tp))
+ if (trace_probe_is_enabled(&tk->tp))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (unregister_kprobe_event(tk))
return -EBUSY;
- __unregister_trace_probe(tp);
- list_del(&tp->list);
- unregister_probe_event(tp);
+ __unregister_trace_kprobe(tk);
+ list_del(&tk->list);
return 0;
}
/* Register a trace_probe and probe_event */
-static int register_trace_probe(struct trace_probe *tp)
+static int register_trace_kprobe(struct trace_kprobe *tk)
{
- struct trace_probe *old_tp;
+ struct trace_kprobe *old_tk;
int ret;
mutex_lock(&probe_lock);
/* Delete old (same name) event if exist */
- old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
- if (old_tp) {
- ret = unregister_trace_probe(old_tp);
+ old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+ tk->tp.call.class->system);
+ if (old_tk) {
+ ret = unregister_trace_kprobe(old_tk);
if (ret < 0)
goto end;
- free_trace_probe(old_tp);
+ free_trace_kprobe(old_tk);
}
/* Register new event */
- ret = register_probe_event(tp);
+ ret = register_kprobe_event(tk);
if (ret) {
pr_warning("Failed to register probe event(%d)\n", ret);
goto end;
}
/* Register k*probe */
- ret = __register_trace_probe(tp);
+ ret = __register_trace_kprobe(tk);
if (ret < 0)
- unregister_probe_event(tp);
+ unregister_kprobe_event(tk);
else
- list_add_tail(&tp->list, &probe_list);
+ list_add_tail(&tk->list, &probe_list);
end:
mutex_unlock(&probe_lock);
@@ -886,11 +552,11 @@ end:
}
/* Module notifier call back, checking event on the module */
-static int trace_probe_module_callback(struct notifier_block *nb,
+static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
{
struct module *mod = data;
- struct trace_probe *tp;
+ struct trace_kprobe *tk;
int ret;
if (val != MODULE_STATE_COMING)
@@ -898,15 +564,16 @@ static int trace_probe_module_callback(struct notifier_block *nb,
/* Update probes on coming module */
mutex_lock(&probe_lock);
- list_for_each_entry(tp, &probe_list, list) {
- if (trace_probe_within_module(tp, mod)) {
+ list_for_each_entry(tk, &probe_list, list) {
+ if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
- __unregister_trace_probe(tp);
- ret = __register_trace_probe(tp);
+ __unregister_trace_kprobe(tk);
+ ret = __register_trace_kprobe(tk);
if (ret)
pr_warning("Failed to re-register probe %s on"
"%s: %d\n",
- tp->call.name, mod->name, ret);
+ ftrace_event_name(&tk->tp.call),
+ mod->name, ret);
}
}
mutex_unlock(&probe_lock);
@@ -914,233 +581,12 @@ static int trace_probe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
}
-static struct notifier_block trace_probe_module_nb = {
- .notifier_call = trace_probe_module_callback,
+static struct notifier_block trace_kprobe_module_nb = {
+ .notifier_call = trace_kprobe_module_callback,
.priority = 1 /* Invoked after kprobe module callback */
};
-/* Split symbol and offset. */
-static int split_symbol_offset(char *symbol, unsigned long *offset)
-{
- char *tmp;
- int ret;
-
- if (!offset)
- return -EINVAL;
-
- tmp = strchr(symbol, '+');
- if (tmp) {
- /* skip sign because strict_strtol doesn't accept '+' */
- ret = strict_strtoul(tmp + 1, 0, offset);
- if (ret)
- return ret;
- *tmp = '\0';
- } else
- *offset = 0;
- return 0;
-}
-
-#define PARAM_MAX_ARGS 16
-#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
-
-static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_param *f, int is_return)
-{
- int ret = 0;
- unsigned long param;
-
- if (strcmp(arg, "retval") == 0) {
- if (is_return)
- f->fn = t->fetch[FETCH_MTD_retval];
- else
- ret = -EINVAL;
- } else if (strncmp(arg, "stack", 5) == 0) {
- if (arg[5] == '\0') {
- if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
- f->fn = fetch_stack_address;
- else
- ret = -EINVAL;
- } else if (isdigit(arg[5])) {
- ret = strict_strtoul(arg + 5, 10, &param);
- if (ret || param > PARAM_MAX_STACK)
- ret = -EINVAL;
- else {
- f->fn = t->fetch[FETCH_MTD_stack];
- f->data = (void *)param;
- }
- } else
- ret = -EINVAL;
- } else
- ret = -EINVAL;
- return ret;
-}
-
-/* Recursive argument parser */
-static int __parse_probe_arg(char *arg, const struct fetch_type *t,
- struct fetch_param *f, int is_return)
-{
- int ret = 0;
- unsigned long param;
- long offset;
- char *tmp;
-
- switch (arg[0]) {
- case '$':
- ret = parse_probe_vars(arg + 1, t, f, is_return);
- break;
- case '%': /* named register */
- ret = regs_query_register_offset(arg + 1);
- if (ret >= 0) {
- f->fn = t->fetch[FETCH_MTD_reg];
- f->data = (void *)(unsigned long)ret;
- ret = 0;
- }
- break;
- case '@': /* memory or symbol */
- if (isdigit(arg[1])) {
- ret = strict_strtoul(arg + 1, 0, &param);
- if (ret)
- break;
- f->fn = t->fetch[FETCH_MTD_memory];
- f->data = (void *)param;
- } else {
- ret = split_symbol_offset(arg + 1, &offset);
- if (ret)
- break;
- f->data = alloc_symbol_cache(arg + 1, offset);
- if (f->data)
- f->fn = t->fetch[FETCH_MTD_symbol];
- }
- break;
- case '+': /* deref memory */
- arg++; /* Skip '+', because strict_strtol() rejects it. */
- case '-':
- tmp = strchr(arg, '(');
- if (!tmp)
- break;
- *tmp = '\0';
- ret = strict_strtol(arg, 0, &offset);
- if (ret)
- break;
- arg = tmp + 1;
- tmp = strrchr(arg, ')');
- if (tmp) {
- struct deref_fetch_param *dprm;
- const struct fetch_type *t2 = find_fetch_type(NULL);
- *tmp = '\0';
- dprm = kzalloc(sizeof(struct deref_fetch_param),
- GFP_KERNEL);
- if (!dprm)
- return -ENOMEM;
- dprm->offset = offset;
- ret = __parse_probe_arg(arg, t2, &dprm->orig,
- is_return);
- if (ret)
- kfree(dprm);
- else {
- f->fn = t->fetch[FETCH_MTD_deref];
- f->data = (void *)dprm;
- }
- }
- break;
- }
- if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
- pr_info("%s type has no corresponding fetch method.\n",
- t->name);
- ret = -EINVAL;
- }
- return ret;
-}
-
-#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
-
-/* Bitfield type needs to be parsed into a fetch function */
-static int __parse_bitfield_probe_arg(const char *bf,
- const struct fetch_type *t,
- struct fetch_param *f)
-{
- struct bitfield_fetch_param *bprm;
- unsigned long bw, bo;
- char *tail;
-
- if (*bf != 'b')
- return 0;
-
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- if (!bprm)
- return -ENOMEM;
- bprm->orig = *f;
- f->fn = t->fetch[FETCH_MTD_bitfield];
- f->data = (void *)bprm;
-
- bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
- if (bw == 0 || *tail != '@')
- return -EINVAL;
-
- bf = tail + 1;
- bo = simple_strtoul(bf, &tail, 0);
- if (tail == bf || *tail != '/')
- return -EINVAL;
-
- bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
- bprm->low_shift = bprm->hi_shift + bo;
- return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
-}
-
-/* String length checking wrapper */
-static int parse_probe_arg(char *arg, struct trace_probe *tp,
- struct probe_arg *parg, int is_return)
-{
- const char *t;
- int ret;
-
- if (strlen(arg) > MAX_ARGSTR_LEN) {
- pr_info("Argument is too long.: %s\n", arg);
- return -ENOSPC;
- }
- parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm) {
- pr_info("Failed to allocate memory for command '%s'.\n", arg);
- return -ENOMEM;
- }
- t = strchr(parg->comm, ':');
- if (t) {
- arg[t - parg->comm] = '\0';
- t++;
- }
- parg->type = find_fetch_type(t);
- if (!parg->type) {
- pr_info("Unsupported type: %s\n", t);
- return -EINVAL;
- }
- parg->offset = tp->size;
- tp->size += parg->type->size;
- ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
- if (ret >= 0 && t != NULL)
- ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
- if (ret >= 0) {
- parg->fetch_size.fn = get_fetch_size_function(parg->type,
- parg->fetch.fn);
- parg->fetch_size.data = parg->fetch.data;
- }
- return ret;
-}
-
-/* Return 1 if name is reserved or already used by another argument */
-static int conflict_field_name(const char *name,
- struct probe_arg *args, int narg)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
- if (strcmp(reserved_field_names[i], name) == 0)
- return 1;
- for (i = 0; i < narg; i++)
- if (strcmp(args[i].name, name) == 0)
- return 1;
- return 0;
-}
-
-static int create_trace_probe(int argc, char **argv)
+static int create_trace_kprobe(int argc, char **argv)
{
/*
* Argument syntax:
@@ -1160,9 +606,9 @@ static int create_trace_probe(int argc, char **argv)
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_probe *tp;
+ struct trace_kprobe *tk;
int i, ret = 0;
- int is_return = 0, is_delete = 0;
+ bool is_return = false, is_delete = false;
char *symbol = NULL, *event = NULL, *group = NULL;
char *arg;
unsigned long offset = 0;
@@ -1171,11 +617,11 @@ static int create_trace_probe(int argc, char **argv)
/* argc must be >= 1 */
if (argv[0][0] == 'p')
- is_return = 0;
+ is_return = false;
else if (argv[0][0] == 'r')
- is_return = 1;
+ is_return = true;
else if (argv[0][0] == '-')
- is_delete = 1;
+ is_delete = true;
else {
pr_info("Probe definition must be started with 'p', 'r' or"
" '-'.\n");
@@ -1207,16 +653,16 @@ static int create_trace_probe(int argc, char **argv)
return -EINVAL;
}
mutex_lock(&probe_lock);
- tp = find_trace_probe(event, group);
- if (!tp) {
+ tk = find_trace_kprobe(event, group);
+ if (!tk) {
mutex_unlock(&probe_lock);
pr_info("Event %s/%s doesn't exist.\n", group, event);
return -ENOENT;
}
/* delete an event */
- ret = unregister_trace_probe(tp);
+ ret = unregister_trace_kprobe(tk);
if (ret == 0)
- free_trace_probe(tp);
+ free_trace_kprobe(tk);
mutex_unlock(&probe_lock);
return ret;
}
@@ -1231,7 +677,7 @@ static int create_trace_probe(int argc, char **argv)
return -EINVAL;
}
/* an address specified */
- ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
+ ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
if (ret) {
pr_info("Failed to parse address.\n");
return ret;
@@ -1240,7 +686,7 @@ static int create_trace_probe(int argc, char **argv)
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
- ret = split_symbol_offset(symbol, &offset);
+ ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
pr_info("Failed to parse symbol.\n");
return ret;
@@ -1263,46 +709,49 @@ static int create_trace_probe(int argc, char **argv)
is_return ? 'r' : 'p', addr);
event = buf;
}
- tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
is_return);
- if (IS_ERR(tp)) {
+ if (IS_ERR(tk)) {
pr_info("Failed to allocate trace_probe.(%d)\n",
- (int)PTR_ERR(tp));
- return PTR_ERR(tp);
+ (int)PTR_ERR(tk));
+ return PTR_ERR(tk);
}
/* parse arguments */
ret = 0;
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ struct probe_arg *parg = &tk->tp.args[i];
+
/* Increment count for freeing args in error case */
- tp->nr_args++;
+ tk->tp.nr_args++;
/* Parse argument name */
arg = strchr(argv[i], '=');
if (arg) {
*arg++ = '\0';
- tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+ parg->name = kstrdup(argv[i], GFP_KERNEL);
} else {
arg = argv[i];
/* If argument name is omitted, set "argN" */
snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
- tp->args[i].name = kstrdup(buf, GFP_KERNEL);
+ parg->name = kstrdup(buf, GFP_KERNEL);
}
- if (!tp->args[i].name) {
+ if (!parg->name) {
pr_info("Failed to allocate argument[%d] name.\n", i);
ret = -ENOMEM;
goto error;
}
- if (!is_good_name(tp->args[i].name)) {
+ if (!is_good_name(parg->name)) {
pr_info("Invalid argument[%d] name: %s\n",
- i, tp->args[i].name);
+ i, parg->name);
ret = -EINVAL;
goto error;
}
- if (conflict_field_name(tp->args[i].name, tp->args, i)) {
+ if (traceprobe_conflict_field_name(parg->name,
+ tk->tp.args, i)) {
pr_info("Argument[%d] name '%s' conflicts with "
"another field.\n", i, argv[i]);
ret = -EINVAL;
@@ -1310,40 +759,43 @@ static int create_trace_probe(int argc, char **argv)
}
/* Parse fetch argument */
- ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
+ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
+ is_return, true);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
}
}
- ret = register_trace_probe(tp);
+ ret = register_trace_kprobe(tk);
if (ret)
goto error;
return 0;
error:
- free_trace_probe(tp);
+ free_trace_kprobe(tk);
return ret;
}
-static int release_all_trace_probes(void)
+static int release_all_trace_kprobes(void)
{
- struct trace_probe *tp;
+ struct trace_kprobe *tk;
int ret = 0;
mutex_lock(&probe_lock);
/* Ensure no probe is in use. */
- list_for_each_entry(tp, &probe_list, list)
- if (trace_probe_is_enabled(tp)) {
+ list_for_each_entry(tk, &probe_list, list)
+ if (trace_probe_is_enabled(&tk->tp)) {
ret = -EBUSY;
goto end;
}
/* TODO: Use batch unregistration */
while (!list_empty(&probe_list)) {
- tp = list_entry(probe_list.next, struct trace_probe, list);
- unregister_trace_probe(tp);
- free_trace_probe(tp);
+ tk = list_entry(probe_list.next, struct trace_kprobe, list);
+ ret = unregister_trace_kprobe(tk);
+ if (ret)
+ goto end;
+ free_trace_kprobe(tk);
}
end:
@@ -1371,22 +823,23 @@ static void probes_seq_stop(struct seq_file *m, void *v)
static int probes_seq_show(struct seq_file *m, void *v)
{
- struct trace_probe *tp = v;
+ struct trace_kprobe *tk = v;
int i;
- seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
+ seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
+ seq_printf(m, ":%s/%s", tk->tp.call.class->system,
+ ftrace_event_name(&tk->tp.call));
- if (!tp->symbol)
- seq_printf(m, " 0x%p", tp->rp.kp.addr);
- else if (tp->rp.kp.offset)
- seq_printf(m, " %s+%u", trace_probe_symbol(tp),
- tp->rp.kp.offset);
+ if (!tk->symbol)
+ seq_printf(m, " 0x%p", tk->rp.kp.addr);
+ else if (tk->rp.kp.offset)
+ seq_printf(m, " %s+%u", trace_kprobe_symbol(tk),
+ tk->rp.kp.offset);
else
- seq_printf(m, " %s", trace_probe_symbol(tp));
+ seq_printf(m, " %s", trace_kprobe_symbol(tk));
- for (i = 0; i < tp->nr_args; i++)
- seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
+ for (i = 0; i < tk->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
seq_printf(m, "\n");
return 0;
@@ -1404,7 +857,7 @@ static int probes_open(struct inode *inode, struct file *file)
int ret;
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = release_all_trace_probes();
+ ret = release_all_trace_kprobes();
if (ret < 0)
return ret;
}
@@ -1412,70 +865,11 @@ static int probes_open(struct inode *inode, struct file *file)
return seq_open(file, &probes_seq_op);
}
-static int command_trace_probe(const char *buf)
-{
- char **argv;
- int argc = 0, ret = 0;
-
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = create_trace_probe(argc, argv);
-
- argv_free(argv);
- return ret;
-}
-
-#define WRITE_BUFSIZE 4096
-
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char *kbuf, *tmp;
- int ret;
- size_t done;
- size_t size;
-
- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- ret = done = 0;
- while (done < count) {
- size = count - done;
- if (size >= WRITE_BUFSIZE)
- size = WRITE_BUFSIZE - 1;
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
- kbuf[size] = '\0';
- tmp = strchr(kbuf, '\n');
- if (tmp) {
- *tmp = '\0';
- size = tmp - kbuf + 1;
- } else if (done + size < count) {
- pr_warning("Line length is too long: "
- "Should be less than %d.", WRITE_BUFSIZE);
- ret = -EINVAL;
- goto out;
- }
- done += size;
- /* Remove comments */
- tmp = strchr(kbuf, '#');
- if (tmp)
- *tmp = '\0';
-
- ret = command_trace_probe(kbuf);
- if (ret)
- goto out;
- }
- ret = done;
-out:
- kfree(kbuf);
- return ret;
+ return traceprobe_probes_write(file, buffer, count, ppos,
+ create_trace_kprobe);
}
static const struct file_operations kprobe_events_ops = {
@@ -1490,10 +884,11 @@ static const struct file_operations kprobe_events_ops = {
/* Probes profiling interfaces */
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
- struct trace_probe *tp = v;
+ struct trace_kprobe *tk = v;
- seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
- tp->rp.kp.nmissed);
+ seq_printf(m, " %-44s %15lu %15lu\n",
+ ftrace_event_name(&tk->tp.call), tk->nhit,
+ tk->rp.kp.nmissed);
return 0;
}
@@ -1518,122 +913,105 @@ static const struct file_operations kprobe_profile_ops = {
.release = seq_release,
};
-/* Sum up total data length for dynamic arraies (strings) */
-static __kprobes int __get_data_size(struct trace_probe *tp,
- struct pt_regs *regs)
-{
- int i, ret = 0;
- u32 len;
-
- for (i = 0; i < tp->nr_args; i++)
- if (unlikely(tp->args[i].fetch_size.fn)) {
- call_fetch(&tp->args[i].fetch_size, regs, &len);
- ret += len;
- }
-
- return ret;
-}
-
-/* Store the value of each argument */
-static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
- struct pt_regs *regs,
- u8 *data, int maxlen)
-{
- int i;
- u32 end = tp->size;
- u32 *dl; /* Data (relative) location */
-
- for (i = 0; i < tp->nr_args; i++) {
- if (unlikely(tp->args[i].fetch_size.fn)) {
- /*
- * First, we set the relative location and
- * maximum data length to *dl
- */
- dl = (u32 *)(data + tp->args[i].offset);
- *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
- /* Then try to fetch string or dynamic array data */
- call_fetch(&tp->args[i].fetch, regs, dl);
- /* Reduce maximum length */
- end += get_rloc_len(*dl);
- maxlen -= get_rloc_len(*dl);
- /* Trick here, convert data_rloc to data_loc */
- *dl = convert_rloc_to_loc(*dl,
- ent_size + tp->args[i].offset);
- } else
- /* Just fetching data normally */
- call_fetch(&tp->args[i].fetch, regs,
- data + tp->args[i].offset);
- }
-}
-
/* Kprobe handler */
-static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static nokprobe_inline void
+__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
+ struct ftrace_event_file *ftrace_file)
{
- struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int size, dsize, pc;
unsigned long irq_flags;
- struct ftrace_event_call *call = &tp->call;
+ struct ftrace_event_call *call = &tk->tp.call;
- tp->nhit++;
+ WARN_ON(call != ftrace_file->event_call);
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
local_save_flags(irq_flags);
pc = preempt_count();
- dsize = __get_data_size(tp, regs);
- size = sizeof(*entry) + tp->size + dsize;
+ dsize = __get_data_size(&tk->tp, regs);
+ size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
- size, irq_flags, pc);
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
- entry->ip = (unsigned long)kp->addr;
- store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+ entry->ip = (unsigned long)tk->rp.kp.addr;
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+
+ event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ entry, irq_flags, pc, regs);
+}
+
+static void
+kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
+{
+ struct event_file_link *link;
- if (!filter_current_check_discard(buffer, call, entry, event))
- trace_nowake_buffer_unlock_commit_regs(buffer, event,
- irq_flags, pc, regs);
+ list_for_each_entry_rcu(link, &tk->tp.files, list)
+ __kprobe_trace_func(tk, regs, link->file);
}
+NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
-static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+static nokprobe_inline void
+__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
+ struct pt_regs *regs,
+ struct ftrace_event_file *ftrace_file)
{
- struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int size, pc, dsize;
unsigned long irq_flags;
- struct ftrace_event_call *call = &tp->call;
+ struct ftrace_event_call *call = &tk->tp.call;
+
+ WARN_ON(call != ftrace_file->event_call);
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
local_save_flags(irq_flags);
pc = preempt_count();
- dsize = __get_data_size(tp, regs);
- size = sizeof(*entry) + tp->size + dsize;
+ dsize = __get_data_size(&tk->tp, regs);
+ size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
- size, irq_flags, pc);
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
- entry->func = (unsigned long)tp->rp.kp.addr;
+ entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
- store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+
+ event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ entry, irq_flags, pc, regs);
+}
+
+static void
+kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct event_file_link *link;
- if (!filter_current_check_discard(buffer, call, entry, event))
- trace_nowake_buffer_unlock_commit_regs(buffer, event,
- irq_flags, pc, regs);
+ list_for_each_entry_rcu(link, &tk->tp.files, list)
+ __kretprobe_trace_func(tk, ri, regs, link->file);
}
+NOKPROBE_SYMBOL(kretprobe_trace_func);
/* Event entry printers */
-enum print_line_t
+static enum print_line_t
print_kprobe_event(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -1646,7 +1024,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
goto partial;
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1669,7 +1047,7 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
-enum print_line_t
+static enum print_line_t
print_kretprobe_event(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -1682,7 +1060,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
goto partial;
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1711,31 +1089,23 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
-#undef DEFINE_FIELD
-#define DEFINE_FIELD(type, item, name, is_signed) \
- do { \
- ret = trace_define_field(event_call, #type, name, \
- offsetof(typeof(field), item), \
- sizeof(field.item), is_signed, \
- FILTER_OTHER); \
- if (ret) \
- return ret; \
- } while (0)
static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
{
int ret, i;
struct kprobe_trace_entry_head field;
- struct trace_probe *tp = (struct trace_probe *)event_call->data;
+ struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
/* Set argument names as fields */
- for (i = 0; i < tp->nr_args; i++) {
- ret = trace_define_field(event_call, tp->args[i].type->fmttype,
- tp->args[i].name,
- sizeof(field) + tp->args[i].offset,
- tp->args[i].type->size,
- tp->args[i].type->is_signed,
+ for (i = 0; i < tk->tp.nr_args; i++) {
+ struct probe_arg *parg = &tk->tp.args[i];
+
+ ret = trace_define_field(event_call, parg->type->fmttype,
+ parg->name,
+ sizeof(field) + parg->offset,
+ parg->type->size,
+ parg->type->is_signed,
FILTER_OTHER);
if (ret)
return ret;
@@ -1747,17 +1117,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
{
int ret, i;
struct kretprobe_trace_entry_head field;
- struct trace_probe *tp = (struct trace_probe *)event_call->data;
+ struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
/* Set argument names as fields */
- for (i = 0; i < tp->nr_args; i++) {
- ret = trace_define_field(event_call, tp->args[i].type->fmttype,
- tp->args[i].name,
- sizeof(field) + tp->args[i].offset,
- tp->args[i].type->size,
- tp->args[i].type->is_signed,
+ for (i = 0; i < tk->tp.nr_args; i++) {
+ struct probe_arg *parg = &tk->tp.args[i];
+
+ ret = trace_define_field(event_call, parg->type->fmttype,
+ parg->name,
+ sizeof(field) + parg->offset,
+ parg->type->size,
+ parg->type->is_signed,
FILTER_OTHER);
if (ret)
return ret;
@@ -1765,182 +1137,135 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
return 0;
}
-static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
-{
- int i;
- int pos = 0;
-
- const char *fmt, *arg;
-
- if (!trace_probe_is_return(tp)) {
- fmt = "(%lx)";
- arg = "REC->" FIELD_STRING_IP;
- } else {
- fmt = "(%lx <- %lx)";
- arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
- }
-
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
-
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
-
- for (i = 0; i < tp->nr_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
- tp->args[i].name, tp->args[i].type->fmt);
- }
-
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
-
- for (i = 0; i < tp->nr_args; i++) {
- if (strcmp(tp->args[i].type->name, "string") == 0)
- pos += snprintf(buf + pos, LEN_OR_ZERO,
- ", __get_str(%s)",
- tp->args[i].name);
- else
- pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
- tp->args[i].name);
- }
-
-#undef LEN_OR_ZERO
-
- /* return the length of print_fmt */
- return pos;
-}
-
-static int set_print_fmt(struct trace_probe *tp)
-{
- int len;
- char *print_fmt;
-
- /* First: called with 0 length to calculate the needed length */
- len = __set_print_fmt(tp, NULL, 0);
- print_fmt = kmalloc(len + 1, GFP_KERNEL);
- if (!print_fmt)
- return -ENOMEM;
-
- /* Second: actually write the @print_fmt */
- __set_print_fmt(tp, print_fmt, len + 1);
- tp->call.print_fmt = print_fmt;
-
- return 0;
-}
-
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static __kprobes void kprobe_perf_func(struct kprobe *kp,
- struct pt_regs *regs)
+static void
+kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
- struct ftrace_event_call *call = &tp->call;
+ struct ftrace_event_call *call = &tk->tp.call;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- dsize = __get_data_size(tp, regs);
- __size = sizeof(*entry) + tp->size + dsize;
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ dsize = __get_data_size(&tk->tp, regs);
+ __size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "profile buffer not large enough"))
- return;
entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
return;
- entry->ip = (unsigned long)kp->addr;
+ entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
- store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
-
- head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
+NOKPROBE_SYMBOL(kprobe_perf_func);
/* Kretprobe profile handler */
-static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+static void
+kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
- struct ftrace_event_call *call = &tp->call;
+ struct ftrace_event_call *call = &tk->tp.call;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- dsize = __get_data_size(tp, regs);
- __size = sizeof(*entry) + tp->size + dsize;
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ dsize = __get_data_size(&tk->tp, regs);
+ __size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "profile buffer not large enough"))
- return;
entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
return;
- entry->func = (unsigned long)tp->rp.kp.addr;
+ entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
- store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
-
- head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
-static __kprobes
-int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
+/*
+ * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
+ *
+ * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
+ * lockless, but we can't race with this __init function.
+ */
+static int kprobe_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data)
{
- struct trace_probe *tp = (struct trace_probe *)event->data;
+ struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
+ struct ftrace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
- return enable_trace_probe(tp, TP_FLAG_TRACE);
+ return enable_trace_kprobe(tk, file);
case TRACE_REG_UNREGISTER:
- disable_trace_probe(tp, TP_FLAG_TRACE);
- return 0;
+ return disable_trace_kprobe(tk, file);
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return enable_trace_probe(tp, TP_FLAG_PROFILE);
+ return enable_trace_kprobe(tk, NULL);
case TRACE_REG_PERF_UNREGISTER:
- disable_trace_probe(tp, TP_FLAG_PROFILE);
+ return disable_trace_kprobe(tk, NULL);
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
return 0;
#endif
}
return 0;
}
-static __kprobes
-int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+ struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
- if (tp->flags & TP_FLAG_TRACE)
- kprobe_trace_func(kp, regs);
+ tk->nhit++;
+
+ if (tk->tp.flags & TP_FLAG_TRACE)
+ kprobe_trace_func(tk, regs);
#ifdef CONFIG_PERF_EVENTS
- if (tp->flags & TP_FLAG_PROFILE)
- kprobe_perf_func(kp, regs);
+ if (tk->tp.flags & TP_FLAG_PROFILE)
+ kprobe_perf_func(tk, regs);
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
+NOKPROBE_SYMBOL(kprobe_dispatcher);
-static __kprobes
-int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+static int
+kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
- struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+ struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
+
+ tk->nhit++;
- if (tp->flags & TP_FLAG_TRACE)
- kretprobe_trace_func(ri, regs);
+ if (tk->tp.flags & TP_FLAG_TRACE)
+ kretprobe_trace_func(tk, ri, regs);
#ifdef CONFIG_PERF_EVENTS
- if (tp->flags & TP_FLAG_PROFILE)
- kretprobe_perf_func(ri, regs);
+ if (tk->tp.flags & TP_FLAG_PROFILE)
+ kretprobe_perf_func(tk, ri, regs);
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
+NOKPROBE_SYMBOL(kretprobe_dispatcher);
static struct trace_event_functions kretprobe_funcs = {
.trace = print_kretprobe_event
@@ -1950,21 +1275,21 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};
-static int register_probe_event(struct trace_probe *tp)
+static int register_kprobe_event(struct trace_kprobe *tk)
{
- struct ftrace_event_call *call = &tp->call;
+ struct ftrace_event_call *call = &tk->tp.call;
int ret;
/* Initialize ftrace_event_call */
INIT_LIST_HEAD(&call->class->fields);
- if (trace_probe_is_return(tp)) {
+ if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
call->class->define_fields = kretprobe_event_define_fields;
} else {
call->event.funcs = &kprobe_funcs;
call->class->define_fields = kprobe_event_define_fields;
}
- if (set_print_fmt(tp) < 0)
+ if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
ret = register_ftrace_event(&call->event);
if (!ret) {
@@ -1973,21 +1298,26 @@ static int register_probe_event(struct trace_probe *tp)
}
call->flags = 0;
call->class->reg = kprobe_register;
- call->data = tp;
+ call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
- pr_info("Failed to register kprobe event: %s\n", call->name);
+ pr_info("Failed to register kprobe event: %s\n",
+ ftrace_event_name(call));
kfree(call->print_fmt);
unregister_ftrace_event(&call->event);
}
return ret;
}
-static void unregister_probe_event(struct trace_probe *tp)
+static int unregister_kprobe_event(struct trace_kprobe *tk)
{
+ int ret;
+
/* tp->event is unregistered in trace_remove_event_call() */
- trace_remove_event_call(&tp->call);
- kfree(tp->call.print_fmt);
+ ret = trace_remove_event_call(&tk->tp.call);
+ if (!ret)
+ kfree(tk->tp.call.print_fmt);
+ return ret;
}
/* Make a debugfs interface for controlling probe points */
@@ -1996,7 +1326,7 @@ static __init int init_kprobe_trace(void)
struct dentry *d_tracer;
struct dentry *entry;
- if (register_module_notifier(&trace_probe_module_nb))
+ if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
d_tracer = tracing_init_dentry();
@@ -2035,44 +1365,77 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
return a1 + a2 + a3 + a4 + a5 + a6;
}
+static struct ftrace_event_file *
+find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list)
+ if (file->event_call == &tk->tp.call)
+ return file;
+
+ return NULL;
+}
+
+/*
+ * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this
+ * stage, we can do this lockless.
+ */
static __init int kprobe_trace_self_tests_init(void)
{
int ret, warn = 0;
int (*target)(int, int, int, int, int, int);
- struct trace_probe *tp;
+ struct trace_kprobe *tk;
+ struct ftrace_event_file *file;
+
+ if (tracing_is_disabled())
+ return -ENODEV;
target = kprobe_trace_selftest_target;
pr_info("Testing kprobe tracing: ");
- ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
- "$stack $stack0 +0($stack)");
+ ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)",
+ create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on probing function entry.\n");
+ pr_warn("error on probing function entry.\n");
warn++;
} else {
/* Enable trace point */
- tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting new probe.\n");
+ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting new probe.\n");
warn++;
- } else
- enable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ enable_trace_kprobe(tk, file);
+ }
}
- ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
- "$retval");
+ ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on probing function return.\n");
+ pr_warn("error on probing function return.\n");
warn++;
} else {
/* Enable trace point */
- tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting new probe.\n");
+ tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting 2nd new probe.\n");
warn++;
- } else
- enable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ enable_trace_kprobe(tk, file);
+ }
}
if (warn)
@@ -2081,34 +1444,46 @@ static __init int kprobe_trace_self_tests_init(void)
ret = target(1, 2, 3, 4, 5, 6);
/* Disable trace points before removing it */
- tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting test probe.\n");
+ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting test probe.\n");
warn++;
- } else
- disable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ disable_trace_kprobe(tk, file);
+ }
- tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tp == NULL)) {
- pr_warning("error on getting 2nd test probe.\n");
+ tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting 2nd test probe.\n");
warn++;
- } else
- disable_trace_probe(tp, TP_FLAG_TRACE);
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ disable_trace_kprobe(tk, file);
+ }
- ret = command_trace_probe("-:testprobe");
+ ret = traceprobe_command("-:testprobe", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on deleting a probe.\n");
+ pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = command_trace_probe("-:testprobe2");
+ ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error on deleting a probe.\n");
+ pr_warn("error on deleting a probe.\n");
warn++;
}
end:
- release_all_trace_probes();
+ release_all_trace_kprobes();
if (warn)
pr_cont("NG: Some tests are failed. Please check them.\n");
else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e..0abd9b86347 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
overrun_detected = false;
prev_overruns = 0;
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
}
static int mmio_trace_init(struct trace_array *tr)
@@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
if (drv)
ret += trace_seq_printf(s, " %s\n", drv->name);
else
- ret += trace_seq_printf(s, " \n");
+ ret += trace_seq_puts(s, " \n");
return ret;
}
@@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)
struct header_iter *hiter;
struct trace_seq *s = &iter->seq;
- trace_seq_printf(s, "VERSION 20070824\n");
+ trace_seq_puts(s, "VERSION 20070824\n");
hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
if (!hiter)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
unsigned long cnt = atomic_xchg(&dropped_count, 0);
- unsigned long over = ring_buffer_overruns(iter->tr->buffer);
+ unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
if (over > prev_overruns)
cnt += over - prev_overruns;
@@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
(rw->value >> 0) & 0xff, rw->pc, 0);
break;
default:
- ret = trace_seq_printf(s, "rw what?\n");
+ ret = trace_seq_puts(s, "rw what?\n");
break;
}
if (ret)
@@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
- ret = trace_seq_printf(s, "map what?\n");
+ ret = trace_seq_puts(s, "map what?\n");
break;
}
if (ret)
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct mmiotrace_rw *rw)
{
struct ftrace_event_call *call = &event_mmiotrace_rw;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
int pc = preempt_count();
@@ -323,14 +323,14 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->rw = *rw;
- if (!filter_check_discard(call, entry, buffer, event))
+ if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, 0, pc);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data = tr->data[smp_processor_id()];
+ struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
__trace_mmiotrace_rw(tr, data, rw);
}
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct mmiotrace_map *map)
{
struct ftrace_event_call *call = &event_mmiotrace_map;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
int pc = preempt_count();
@@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->map = *map;
- if (!filter_check_discard(call, entry, buffer, event))
+ if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, 0, pc);
}
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
struct trace_array_cpu *data;
preempt_disable();
- data = tr->data[smp_processor_id()];
+ data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
__trace_mmiotrace_map(tr, data, map);
preempt_enable();
}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 394f94417e2..fcf0a9e4891 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr)
* If you don't implement it, then the flag setting will be
* automatically accepted.
*/
-static int nop_set_flag(u32 old_flags, u32 bit, int set)
+static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
/*
* Note that you don't need to update nop_flags.val yourself.
@@ -91,11 +91,11 @@ struct tracer nop_trace __read_mostly =
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
- .wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif
.flags = &nop_flags,
- .set_flag = nop_set_flag
+ .set_flag = nop_set_flag,
+ .allow_instances = true,
};
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0d6ff355594..f3dad80c20b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
/* must be a power of 2 */
#define EVENT_HASHSIZE 128
-DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
return ret;
}
+enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct bputs_entry *field;
+ int ret;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_puts(s, field->str);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -62,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
trace_assign_type(field, entry);
- ret = trace_seq_printf(s, "%s", field->buf);
+ ret = trace_seq_puts(s, field->buf);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -110,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
EXPORT_SYMBOL_GPL(trace_seq_printf);
/**
+ * trace_seq_bitmask - put a list of longs as a bitmask print output
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * It returns 0 if the trace oversizes the buffer's free
+ * space, 1 otherwise.
+ *
+ * Writes a ASCII representation of a bitmask string into @s.
+ */
+int
+trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask);
+
+/**
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
* @fmt: printf format string
@@ -264,7 +308,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
return ret;
}
-int trace_seq_path(struct trace_seq *s, struct path *path)
+int trace_seq_path(struct trace_seq *s, const struct path *path)
{
unsigned char *p;
@@ -300,7 +344,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long mask;
const char *str;
const char *ret = p->buffer + p->len;
- int i;
+ int i, first = 1;
for (i = 0; flag_array[i].name && flags; i++) {
@@ -310,14 +354,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
str = flag_array[i].name;
flags &= ~mask;
- if (p->len && delim)
+ if (!first && delim)
trace_seq_puts(p, delim);
+ else
+ first = 0;
trace_seq_puts(p, str);
}
/* check for left over flags */
if (flags) {
- if (p->len && delim)
+ if (!first && delim)
trace_seq_puts(p, delim);
trace_seq_printf(p, "0x%lx", flags);
}
@@ -344,7 +390,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
break;
}
- if (!p->len)
+ if (ret == (const char *)(p->buffer + p->len))
trace_seq_printf(p, "0x%lx", val);
trace_seq_putc(p, 0);
@@ -370,7 +416,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
break;
}
- if (!p->len)
+ if (ret == (const char *)(p->buffer + p->len))
trace_seq_printf(p, "0x%llx", val);
trace_seq_putc(p, 0);
@@ -381,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
#endif
const char *
+ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+ unsigned int bitmask_size)
+{
+ const char *ret = p->buffer + p->len;
+
+ trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+
+const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
@@ -395,6 +454,63 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
}
EXPORT_SYMBOL(ftrace_print_hex_seq);
+int ftrace_raw_output_prep(struct trace_iterator *iter,
+ struct trace_event *trace_event)
+{
+ struct ftrace_event_call *event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_seq *p = &iter->tmp_seq;
+ struct trace_entry *entry;
+ int ret;
+
+ event = container_of(trace_event, struct ftrace_event_call, event);
+ entry = iter->ent;
+
+ if (entry->type != event->event.type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
+ trace_seq_init(p);
+ ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+EXPORT_SYMBOL(ftrace_raw_output_prep);
+
+static int ftrace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
+{
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ ret = trace_seq_printf(s, "%s: ", name);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ret = trace_seq_vprintf(s, fmt, ap);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = ftrace_output_raw(iter, name, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ftrace_output_call);
+
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
{
@@ -514,14 +630,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
if (ret)
ret = trace_seq_puts(s, "??");
if (ret)
- ret = trace_seq_puts(s, "\n");
+ ret = trace_seq_putc(s, '\n');
continue;
}
if (!ret)
break;
if (ret)
ret = seq_print_user_ip(s, mm, ip, sym_flags);
- ret = trace_seq_puts(s, "\n");
+ ret = trace_seq_putc(s, '\n');
}
if (mm)
@@ -535,7 +651,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
int ret;
if (!ip)
- return trace_seq_printf(s, "0");
+ return trace_seq_putc(s, '0');
if (sym_flags & TRACE_ITER_SYM_OFFSET)
ret = seq_print_sym_offset(s, "%s", ip);
@@ -574,8 +690,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
- need_resched =
- (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+
+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
+ TRACE_FLAG_PREEMPT_RESCHED)) {
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'N';
+ break;
+ case TRACE_FLAG_NEED_RESCHED:
+ need_resched = 'n';
+ break;
+ case TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'p';
+ break;
+ default:
+ need_resched = '.';
+ break;
+ }
+
hardsoft_irq =
(hardirq && softirq) ? 'H' :
hardirq ? 'h' :
@@ -608,24 +739,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
return trace_print_lat_fmt(s, entry);
}
-static unsigned long preempt_mark_thresh = 100;
+static unsigned long preempt_mark_thresh_us = 100;
static int
-lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
- unsigned long rel_usecs)
+lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
- return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
- rel_usecs > preempt_mark_thresh ? '!' :
- rel_usecs > 1 ? '+' : ' ');
+ unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
+ unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
+ unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
+ unsigned long long rel_ts = next_ts - iter->ts;
+ struct trace_seq *s = &iter->seq;
+
+ if (in_ns) {
+ abs_ts = ns2usecs(abs_ts);
+ rel_ts = ns2usecs(rel_ts);
+ }
+
+ if (verbose && in_ns) {
+ unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
+ unsigned long abs_msec = (unsigned long)abs_ts;
+ unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
+ unsigned long rel_msec = (unsigned long)rel_ts;
+
+ return trace_seq_printf(
+ s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
+ ns2usecs(iter->ts),
+ abs_msec, abs_usec,
+ rel_msec, rel_usec);
+ } else if (verbose && !in_ns) {
+ return trace_seq_printf(
+ s, "[%016llx] %lld (+%lld): ",
+ iter->ts, abs_ts, rel_ts);
+ } else if (!verbose && in_ns) {
+ return trace_seq_printf(
+ s, " %4lldus%c: ",
+ abs_ts,
+ rel_ts > preempt_mark_thresh_us ? '!' :
+ rel_ts > 1 ? '+' : ' ');
+ } else { /* !verbose && !in_ns */
+ return trace_seq_printf(s, " %4lld: ", abs_ts);
+ }
}
int trace_print_context(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
- unsigned long long t = ns2usecs(iter->ts);
- unsigned long usec_rem = do_div(t, USEC_PER_SEC);
- unsigned long secs = (unsigned long)t;
+ unsigned long long t;
+ unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
int ret;
@@ -642,46 +803,49 @@ int trace_print_context(struct trace_iterator *iter)
return 0;
}
- return trace_seq_printf(s, " %5lu.%06lu: ",
- secs, usec_rem);
+ if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
+ t = ns2usecs(iter->ts);
+ usec_rem = do_div(t, USEC_PER_SEC);
+ secs = (unsigned long)t;
+ return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
+ } else
+ return trace_seq_printf(s, " %12llu: ", iter->ts);
}
int trace_print_lat_context(struct trace_iterator *iter)
{
u64 next_ts;
int ret;
+ /* trace_find_next_entry will reset ent_size */
+ int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent,
*next_entry = trace_find_next_entry(iter, NULL,
&next_ts);
unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
- unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
- unsigned long rel_usecs;
+
+ /* Restore the original ent_size */
+ iter->ent_size = ent_size;
if (!next_entry)
next_ts = iter->ts;
- rel_usecs = ns2usecs(next_ts - iter->ts);
if (verbose) {
char comm[TASK_COMM_LEN];
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
- " %ld.%03ldms (+%ld.%03ldms): ", comm,
- entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx,
- ns2usecs(iter->ts),
- abs_usecs / USEC_PER_MSEC,
- abs_usecs % USEC_PER_MSEC,
- rel_usecs / USEC_PER_MSEC,
- rel_usecs % USEC_PER_MSEC);
+ ret = trace_seq_printf(
+ s, "%16s %5d %3d %d %08x %08lx ",
+ comm, entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx);
} else {
ret = lat_print_generic(s, entry, iter->cpu);
- if (ret)
- ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
}
+ if (ret)
+ ret = lat_print_timestamp(iter, next_ts);
+
return ret;
}
@@ -704,12 +868,11 @@ static int task_state_char(unsigned long state)
struct trace_event *ftrace_find_event(int type)
{
struct trace_event *event;
- struct hlist_node *n;
unsigned key;
key = type & (EVENT_HASHSIZE - 1);
- hlist_for_each_entry(event, n, &event_hash[key], node) {
+ hlist_for_each_entry(event, &event_hash[key], node) {
if (event->type == type)
return event;
}
@@ -749,12 +912,12 @@ static int trace_search_list(struct list_head **list)
void trace_event_read_lock(void)
{
- down_read(&trace_event_mutex);
+ down_read(&trace_event_sem);
}
void trace_event_read_unlock(void)
{
- up_read(&trace_event_mutex);
+ up_read(&trace_event_sem);
}
/**
@@ -777,7 +940,7 @@ int register_ftrace_event(struct trace_event *event)
unsigned key;
int ret = 0;
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
if (WARN_ON(!event))
goto out;
@@ -832,14 +995,14 @@ int register_ftrace_event(struct trace_event *event)
ret = event->type;
out:
- up_write(&trace_event_mutex);
+ up_write(&trace_event_sem);
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_event);
/*
- * Used by module code with the trace_event_mutex held for write.
+ * Used by module code with the trace_event_sem held for write.
*/
int __unregister_ftrace_event(struct trace_event *event)
{
@@ -854,9 +1017,9 @@ int __unregister_ftrace_event(struct trace_event *event)
*/
int unregister_ftrace_event(struct trace_event *event)
{
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
__unregister_ftrace_event(event);
- up_write(&trace_event_mutex);
+ up_write(&trace_event_sem);
return 0;
}
@@ -888,14 +1051,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
goto partial;
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- if (!trace_seq_printf(s, " <-"))
+ if (!trace_seq_puts(s, " <-"))
goto partial;
if (!seq_print_ip_sym(s,
field->parent_ip,
flags))
goto partial;
}
- if (!trace_seq_printf(s, "\n"))
+ if (!trace_seq_putc(s, '\n'))
goto partial;
return TRACE_TYPE_HANDLED;
@@ -1134,7 +1297,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
if (!seq_print_ip_sym(s, *p, flags))
goto partial;
- if (!trace_seq_puts(s, "\n"))
+ if (!trace_seq_putc(s, '\n'))
goto partial;
}
@@ -1183,6 +1346,64 @@ static struct trace_event trace_user_stack_event = {
.funcs = &trace_user_stack_funcs,
};
+/* TRACE_BPUTS */
+static enum print_line_t
+trace_bputs_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct bputs_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if (!trace_seq_puts(s, ": "))
+ goto partial;
+
+ if (!trace_seq_puts(s, field->str))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+
+static enum print_line_t
+trace_bputs_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct bputs_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(s, ": %lx : ", field->ip))
+ goto partial;
+
+ if (!trace_seq_puts(s, field->str))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event_functions trace_bputs_funcs = {
+ .trace = trace_bputs_print,
+ .raw = trace_bputs_raw,
+};
+
+static struct trace_event trace_bputs_event = {
+ .type = TRACE_BPUTS,
+ .funcs = &trace_bputs_funcs,
+};
+
/* TRACE_BPRINT */
static enum print_line_t
trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1295,6 +1516,7 @@ static struct trace_event *events[] __initdata = {
&trace_wake_event,
&trace_stack_event,
&trace_user_stack_event,
+ &trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
NULL
@@ -1318,4 +1540,4 @@ __init static int init_events(void)
return 0;
}
-device_initcall(init_events);
+early_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492..127a9d8c835 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
#include "trace.h"
extern enum print_line_t
+trace_print_bputs_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
trace_print_bprintk_msg_only(struct trace_iterator *iter);
extern enum print_line_t
trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
/* used by module unregistering */
extern int __unregister_ftrace_event(struct trace_event *event);
-extern struct rw_semaphore trace_event_mutex;
+extern struct rw_semaphore trace_event_sem;
#define MAX_MEMHEX_BYTES 8
#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 6fd4ffd042f..2900817ba65 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
const char **iter;
char *fmt;
+ /* allocate the trace_printk per cpu buffers */
+ if (start != end)
+ trace_printk_init_buffers();
+
mutex_lock(&btrace_mutex);
for (iter = start; iter < end; iter++) {
struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
@@ -240,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
{
const char **fmt = v;
int start_index;
+ int last_index;
start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
if (*pos < start_index)
return __start___trace_bprintk_fmt + *pos;
+ /*
+ * The __tracepoint_str section is treated the same as the
+ * __trace_printk_fmt section. The difference is that the
+ * __trace_printk_fmt section should only be used by trace_printk()
+ * in a debugging environment, as if anything exists in that section
+ * the trace_prink() helper buffers are allocated, which would just
+ * waste space in a production environment.
+ *
+ * The __tracepoint_str sections on the other hand are used by
+ * tracepoints which need to map pointers to their strings to
+ * the ASCII text for userspace.
+ */
+ last_index = start_index;
+ start_index = __stop___tracepoint_str - __start___tracepoint_str;
+
+ if (*pos < last_index + start_index)
+ return __start___tracepoint_str + (*pos - last_index);
+
return find_next_mod_format(start_index, v, fmt, pos);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
new file mode 100644
index 00000000000..d4b9fc22cd2
--- /dev/null
+++ b/kernel/trace/trace_probe.c
@@ -0,0 +1,726 @@
+/*
+ * Common code for probe-based Dynamic events.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Updates to make this generic:
+ * Copyright (C) IBM Corporation, 2010-2011
+ * Author: Srikar Dronamraju
+ */
+
+#include "trace_probe.h"
+
+const char *reserved_field_names[] = {
+ "common_type",
+ "common_flags",
+ "common_preempt_count",
+ "common_pid",
+ "common_tgid",
+ FIELD_STRING_IP,
+ FIELD_STRING_RETIP,
+ FIELD_STRING_FUNC,
+};
+
+/* Printing in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent) \
+{ \
+ return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+} \
+const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
+
+/* Print type function for string type */
+int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
+ void *data, void *ent)
+{
+ int len = *(u32 *)data >> 16;
+
+ if (!len)
+ return trace_seq_printf(s, " %s=(fault)", name);
+ else
+ return trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+}
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
+
+const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
+#define CHECK_FETCH_FUNCS(method, fn) \
+ (((FETCH_FUNC_NAME(method, u8) == fn) || \
+ (FETCH_FUNC_NAME(method, u16) == fn) || \
+ (FETCH_FUNC_NAME(method, u32) == fn) || \
+ (FETCH_FUNC_NAME(method, u64) == fn) || \
+ (FETCH_FUNC_NAME(method, string) == fn) || \
+ (FETCH_FUNC_NAME(method, string_size) == fn)) \
+ && (fn != NULL))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type) \
+void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_register(regs, \
+ (unsigned int)((unsigned long)offset)); \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
+DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+
+#define DEFINE_FETCH_retval(type) \
+void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
+ void *dummy, void *dest) \
+{ \
+ *(type *)dest = (type)regs_return_value(regs); \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
+DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+
+/* Dereference memory access function */
+struct deref_fetch_param {
+ struct fetch_param orig;
+ long offset;
+ fetch_func_t fetch;
+ fetch_func_t fetch_size;
+};
+
+#define DEFINE_FETCH_deref(type) \
+void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
+{ \
+ struct deref_fetch_param *dprm = data; \
+ unsigned long addr; \
+ call_fetch(&dprm->orig, regs, &addr); \
+ if (addr) { \
+ addr += dprm->offset; \
+ dprm->fetch(regs, (void *)addr, dest); \
+ } else \
+ *(type *)dest = 0; \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
+DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+
+void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ struct deref_fetch_param *dprm = data;
+ unsigned long addr;
+
+ call_fetch(&dprm->orig, regs, &addr);
+ if (addr && dprm->fetch_size) {
+ addr += dprm->offset;
+ dprm->fetch_size(regs, (void *)addr, dest);
+ } else
+ *(string_size *)dest = 0;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
+
+static void update_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ update_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ update_symbol_cache(data->orig.data);
+}
+NOKPROBE_SYMBOL(update_deref_fetch_param);
+
+static void free_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
+NOKPROBE_SYMBOL(free_deref_fetch_param);
+
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+ struct fetch_param orig;
+ unsigned char hi_shift;
+ unsigned char low_shift;
+};
+
+#define DEFINE_FETCH_bitfield(type) \
+void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
+{ \
+ struct bitfield_fetch_param *bprm = data; \
+ type buf = 0; \
+ call_fetch(&bprm->orig, regs, &buf); \
+ if (buf) { \
+ buf <<= bprm->hi_shift; \
+ buf >>= bprm->low_shift; \
+ } \
+ *(type *)dest = buf; \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static void
+update_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ update_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ update_symbol_cache(data->orig.data);
+}
+
+static void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+
+ kfree(data);
+}
+
+static const struct fetch_type *find_fetch_type(const char *type,
+ const struct fetch_type *ftbl)
+{
+ int i;
+
+ if (!type)
+ type = DEFAULT_FETCH_TYPE_STR;
+
+ /* Special case: bitfield */
+ if (*type == 'b') {
+ unsigned long bs;
+
+ type = strchr(type, '/');
+ if (!type)
+ goto fail;
+
+ type++;
+ if (kstrtoul(type, 0, &bs))
+ goto fail;
+
+ switch (bs) {
+ case 8:
+ return find_fetch_type("u8", ftbl);
+ case 16:
+ return find_fetch_type("u16", ftbl);
+ case 32:
+ return find_fetch_type("u32", ftbl);
+ case 64:
+ return find_fetch_type("u64", ftbl);
+ default:
+ goto fail;
+ }
+ }
+
+ for (i = 0; ftbl[i].name; i++) {
+ if (strcmp(type, ftbl[i].name) == 0)
+ return &ftbl[i];
+ }
+
+fail:
+ return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
+{
+ *(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+NOKPROBE_SYMBOL(fetch_kernel_stack_address);
+
+static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
+{
+ *(unsigned long *)dest = user_stack_pointer(regs);
+}
+NOKPROBE_SYMBOL(fetch_user_stack_address);
+
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+ fetch_func_t orig_fn,
+ const struct fetch_type *ftbl)
+{
+ int i;
+
+ if (type != &ftbl[FETCH_TYPE_STRING])
+ return NULL; /* Only string type needs size function */
+
+ for (i = 0; i < FETCH_MTD_END; i++)
+ if (type->fetch[i] == orig_fn)
+ return ftbl[FETCH_TYPE_STRSIZE].fetch[i];
+
+ WARN_ON(1); /* This should not happen */
+
+ return NULL;
+}
+
+/* Split symbol and offset. */
+int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (tmp) {
+ /* skip sign because kstrtoul doesn't accept '+' */
+ ret = kstrtoul(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+
+ *tmp = '\0';
+ } else
+ *offset = 0;
+
+ return 0;
+}
+
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return,
+ bool is_kprobe)
+{
+ int ret = 0;
+ unsigned long param;
+
+ if (strcmp(arg, "retval") == 0) {
+ if (is_return)
+ f->fn = t->fetch[FETCH_MTD_retval];
+ else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "stack", 5) == 0) {
+ if (arg[5] == '\0') {
+ if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
+ return -EINVAL;
+
+ if (is_kprobe)
+ f->fn = fetch_kernel_stack_address;
+ else
+ f->fn = fetch_user_stack_address;
+ } else if (isdigit(arg[5])) {
+ ret = kstrtoul(arg + 5, 10, &param);
+ if (ret || (is_kprobe && param > PARAM_MAX_STACK))
+ ret = -EINVAL;
+ else {
+ f->fn = t->fetch[FETCH_MTD_stack];
+ f->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ } else
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/* Recursive argument parser */
+static int parse_probe_arg(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return, bool is_kprobe)
+{
+ const struct fetch_type *ftbl;
+ unsigned long param;
+ long offset;
+ char *tmp;
+ int ret = 0;
+
+ ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
+ BUG_ON(ftbl == NULL);
+
+ switch (arg[0]) {
+ case '$':
+ ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
+ break;
+
+ case '%': /* named register */
+ ret = regs_query_register_offset(arg + 1);
+ if (ret >= 0) {
+ f->fn = t->fetch[FETCH_MTD_reg];
+ f->data = (void *)(unsigned long)ret;
+ ret = 0;
+ }
+ break;
+
+ case '@': /* memory, file-offset or symbol */
+ if (isdigit(arg[1])) {
+ ret = kstrtoul(arg + 1, 0, &param);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_memory];
+ f->data = (void *)param;
+ } else if (arg[1] == '+') {
+ /* kprobes don't support file offsets */
+ if (is_kprobe)
+ return -EINVAL;
+
+ ret = kstrtol(arg + 2, 0, &offset);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_file_offset];
+ f->data = (void *)offset;
+ } else {
+ /* uprobes don't support symbols */
+ if (!is_kprobe)
+ return -EINVAL;
+
+ ret = traceprobe_split_symbol_offset(arg + 1, &offset);
+ if (ret)
+ break;
+
+ f->data = alloc_symbol_cache(arg + 1, offset);
+ if (f->data)
+ f->fn = t->fetch[FETCH_MTD_symbol];
+ }
+ break;
+
+ case '+': /* deref memory */
+ arg++; /* Skip '+', because kstrtol() rejects it. */
+ case '-':
+ tmp = strchr(arg, '(');
+ if (!tmp)
+ break;
+
+ *tmp = '\0';
+ ret = kstrtol(arg, 0, &offset);
+
+ if (ret)
+ break;
+
+ arg = tmp + 1;
+ tmp = strrchr(arg, ')');
+
+ if (tmp) {
+ struct deref_fetch_param *dprm;
+ const struct fetch_type *t2;
+
+ t2 = find_fetch_type(NULL, ftbl);
+ *tmp = '\0';
+ dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
+
+ if (!dprm)
+ return -ENOMEM;
+
+ dprm->offset = offset;
+ dprm->fetch = t->fetch[FETCH_MTD_memory];
+ dprm->fetch_size = get_fetch_size_function(t,
+ dprm->fetch, ftbl);
+ ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
+ is_kprobe);
+ if (ret)
+ kfree(dprm);
+ else {
+ f->fn = t->fetch[FETCH_MTD_deref];
+ f->data = (void *)dprm;
+ }
+ }
+ break;
+ }
+ if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
+ pr_info("%s type has no corresponding fetch method.\n", t->name);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/* Bitfield type needs to be parsed into a fetch function */
+static int __parse_bitfield_probe_arg(const char *bf,
+ const struct fetch_type *t,
+ struct fetch_param *f)
+{
+ struct bitfield_fetch_param *bprm;
+ unsigned long bw, bo;
+ char *tail;
+
+ if (*bf != 'b')
+ return 0;
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm)
+ return -ENOMEM;
+
+ bprm->orig = *f;
+ f->fn = t->fetch[FETCH_MTD_bitfield];
+ f->data = (void *)bprm;
+ bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
+
+ if (bw == 0 || *tail != '@')
+ return -EINVAL;
+
+ bf = tail + 1;
+ bo = simple_strtoul(bf, &tail, 0);
+
+ if (tail == bf || *tail != '/')
+ return -EINVAL;
+
+ bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+ bprm->low_shift = bprm->hi_shift + bo;
+
+ return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
+}
+
+/* String length checking wrapper */
+int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+ struct probe_arg *parg, bool is_return, bool is_kprobe)
+{
+ const struct fetch_type *ftbl;
+ const char *t;
+ int ret;
+
+ ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
+ BUG_ON(ftbl == NULL);
+
+ if (strlen(arg) > MAX_ARGSTR_LEN) {
+ pr_info("Argument is too long.: %s\n", arg);
+ return -ENOSPC;
+ }
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ pr_info("Failed to allocate memory for command '%s'.\n", arg);
+ return -ENOMEM;
+ }
+ t = strchr(parg->comm, ':');
+ if (t) {
+ arg[t - parg->comm] = '\0';
+ t++;
+ }
+ parg->type = find_fetch_type(t, ftbl);
+ if (!parg->type) {
+ pr_info("Unsupported type: %s\n", t);
+ return -EINVAL;
+ }
+ parg->offset = *size;
+ *size += parg->type->size;
+ ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
+
+ if (ret >= 0 && t != NULL)
+ ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
+
+ if (ret >= 0) {
+ parg->fetch_size.fn = get_fetch_size_function(parg->type,
+ parg->fetch.fn,
+ ftbl);
+ parg->fetch_size.data = parg->fetch.data;
+ }
+
+ return ret;
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+ if (strcmp(reserved_field_names[i], name) == 0)
+ return 1;
+
+ for (i = 0; i < narg; i++)
+ if (strcmp(args[i].name, name) == 0)
+ return 1;
+
+ return 0;
+}
+
+void traceprobe_update_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ update_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ update_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ update_symbol_cache(arg->fetch.data);
+}
+
+void traceprobe_free_probe_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ free_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ free_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ free_symbol_cache(arg->fetch.data);
+
+ kfree(arg->name);
+ kfree(arg->comm);
+}
+
+int traceprobe_command(const char *buf, int (*createfn)(int, char **))
+{
+ char **argv;
+ int argc, ret;
+
+ argc = 0;
+ ret = 0;
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos,
+ int (*createfn)(int, char **))
+{
+ char *kbuf, *tmp;
+ int ret = 0;
+ size_t done = 0;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ while (done < count) {
+ size = count - done;
+
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ tmp = strchr(kbuf, '\n');
+
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - kbuf + 1;
+ } else if (done + size < count) {
+ pr_warning("Line length is too long: "
+ "Should be less than %d.", WRITE_BUFSIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+ done += size;
+ /* Remove comments */
+ tmp = strchr(kbuf, '#');
+
+ if (tmp)
+ *tmp = '\0';
+
+ ret = traceprobe_command(kbuf, createfn);
+ if (ret)
+ goto out;
+ }
+ ret = done;
+
+out:
+ kfree(kbuf);
+
+ return ret;
+}
+
+static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
+ bool is_return)
+{
+ int i;
+ int pos = 0;
+
+ const char *fmt, *arg;
+
+ if (!is_return) {
+ fmt = "(%lx)";
+ arg = "REC->" FIELD_STRING_IP;
+ } else {
+ fmt = "(%lx <- %lx)";
+ arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ }
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
+
+ for (i = 0; i < tp->nr_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+ tp->args[i].name, tp->args[i].type->fmt);
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (strcmp(tp->args[i].type->name, "string") == 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_str(%s)",
+ tp->args[i].name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+ tp->args[i].name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+int set_print_fmt(struct trace_probe *tp, bool is_return)
+{
+ int len;
+ char *print_fmt;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_print_fmt(tp, NULL, 0, is_return);
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_print_fmt(tp, print_fmt, len + 1, is_return);
+ tp->call.print_fmt = print_fmt;
+
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
new file mode 100644
index 00000000000..4f815fbce16
--- /dev/null
+++ b/kernel/trace/trace_probe.h
@@ -0,0 +1,400 @@
+/*
+ * Common header file for probe-based Dynamic events.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This code was copied from kernel/trace/trace_kprobe.h written by
+ * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Updates to make this generic:
+ * Copyright (C) IBM Corporation, 2010-2011
+ * Author: Srikar Dronamraju
+ */
+
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/stringify.h>
+#include <linux/limits.h>
+#include <linux/uaccess.h>
+#include <asm/bitsperlong.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+ do { \
+ ret = trace_define_field(event_call, #type, name, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed, \
+ FILTER_OTHER); \
+ if (ret) \
+ return ret; \
+ } while (0)
+
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE 1
+#define TP_FLAG_PROFILE 2
+#define TP_FLAG_REGISTERED 4
+
+
+/* data_rloc: data relative location, compatible with u32 */
+#define make_data_rloc(len, roffs) \
+ (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl) ((u32)(dl) >> 16)
+#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
+
+/*
+ * Convert data_rloc to data_loc:
+ * data_rloc stores the offset from data_rloc itself, but data_loc
+ * stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
+
+static nokprobe_inline void *get_rloc_data(u32 *dl)
+{
+ return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
+{
+ return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/* Data fetch function type */
+typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
+
+/* Fetch types */
+enum {
+ FETCH_MTD_reg = 0,
+ FETCH_MTD_stack,
+ FETCH_MTD_retval,
+ FETCH_MTD_memory,
+ FETCH_MTD_symbol,
+ FETCH_MTD_deref,
+ FETCH_MTD_bitfield,
+ FETCH_MTD_file_offset,
+ FETCH_MTD_END,
+};
+
+/* Fetch type information table */
+struct fetch_type {
+ const char *name; /* Name of type */
+ size_t size; /* Byte size of type */
+ int is_signed; /* Signed flag */
+ print_type_func_t print; /* Print functions */
+ const char *fmt; /* Fromat string */
+ const char *fmttype; /* Name in format file */
+ /* Fetch functions */
+ fetch_func_t fetch[FETCH_MTD_END];
+};
+
+struct fetch_param {
+ fetch_func_t fn;
+ void *data;
+};
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
+#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
+
+/* Printing in basic type function template */
+#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent); \
+extern const char PRINT_TYPE_FMT_NAME(type)[]
+
+DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(u16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(u32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(u64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(string);
+
+#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
+
+/* Declare macro for basic types */
+#define DECLARE_FETCH_FUNC(method, type) \
+extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \
+ void *data, void *dest)
+
+#define DECLARE_BASIC_FETCH_FUNCS(method) \
+DECLARE_FETCH_FUNC(method, u8); \
+DECLARE_FETCH_FUNC(method, u16); \
+DECLARE_FETCH_FUNC(method, u32); \
+DECLARE_FETCH_FUNC(method, u64)
+
+DECLARE_BASIC_FETCH_FUNCS(reg);
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+
+DECLARE_BASIC_FETCH_FUNCS(retval);
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+
+DECLARE_BASIC_FETCH_FUNCS(symbol);
+DECLARE_FETCH_FUNC(symbol, string);
+DECLARE_FETCH_FUNC(symbol, string_size);
+
+DECLARE_BASIC_FETCH_FUNCS(deref);
+DECLARE_FETCH_FUNC(deref, string);
+DECLARE_FETCH_FUNC(deref, string_size);
+
+DECLARE_BASIC_FETCH_FUNCS(bitfield);
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8) \
+DEFINE_FETCH_##method(u16) \
+DEFINE_FETCH_##method(u32) \
+DEFINE_FETCH_##method(u64)
+
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+#define ASSIGN_FETCH_FUNC(method, type) \
+ [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
+ {.name = _name, \
+ .size = _size, \
+ .is_signed = sign, \
+ .print = PRINT_TYPE_FUNC_NAME(ptype), \
+ .fmt = PRINT_TYPE_FMT_NAME(ptype), \
+ .fmttype = _fmttype, \
+ .fetch = { \
+ASSIGN_FETCH_FUNC(reg, ftype), \
+ASSIGN_FETCH_FUNC(stack, ftype), \
+ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(memory, ftype), \
+ASSIGN_FETCH_FUNC(symbol, ftype), \
+ASSIGN_FETCH_FUNC(deref, ftype), \
+ASSIGN_FETCH_FUNC(bitfield, ftype), \
+ASSIGN_FETCH_FUNC(file_offset, ftype), \
+ } \
+ }
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+
+#define ASSIGN_FETCH_TYPE_END {}
+
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
+/*
+ * Fetch type information table.
+ * It's declared as a weak symbol due to conditional compilation.
+ */
+extern __weak const struct fetch_type kprobes_fetch_type_table[];
+extern __weak const struct fetch_type uprobes_fetch_type_table[];
+
+#ifdef CONFIG_KPROBE_EVENT
+struct symbol_cache;
+unsigned long update_symbol_cache(struct symbol_cache *sc);
+void free_symbol_cache(struct symbol_cache *sc);
+struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+#else
+/* uprobes do not support symbol fetch methods */
+#define fetch_symbol_u8 NULL
+#define fetch_symbol_u16 NULL
+#define fetch_symbol_u32 NULL
+#define fetch_symbol_u64 NULL
+#define fetch_symbol_string NULL
+#define fetch_symbol_string_size NULL
+
+struct symbol_cache {
+};
+static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc)
+{
+ return 0;
+}
+
+static inline void __used free_symbol_cache(struct symbol_cache *sc)
+{
+}
+
+static inline struct symbol_cache * __used
+alloc_symbol_cache(const char *sym, long offset)
+{
+ return NULL;
+}
+#endif /* CONFIG_KPROBE_EVENT */
+
+struct probe_arg {
+ struct fetch_param fetch;
+ struct fetch_param fetch_size;
+ unsigned int offset; /* Offset from argument entry */
+ const char *name; /* Name of this argument */
+ const char *comm; /* Command of this argument */
+ const struct fetch_type *type; /* Type of this argument */
+};
+
+struct trace_probe {
+ unsigned int flags; /* For TP_FLAG_* */
+ struct ftrace_event_class class;
+ struct ftrace_event_call call;
+ struct list_head files;
+ ssize_t size; /* trace entry size */
+ unsigned int nr_args;
+ struct probe_arg args[];
+};
+
+struct event_file_link {
+ struct ftrace_event_file *file;
+ struct list_head list;
+};
+
+static inline bool trace_probe_is_enabled(struct trace_probe *tp)
+{
+ return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
+}
+
+static inline bool trace_probe_is_registered(struct trace_probe *tp)
+{
+ return !!(tp->flags & TP_FLAG_REGISTERED);
+}
+
+static nokprobe_inline void call_fetch(struct fetch_param *fprm,
+ struct pt_regs *regs, void *dest)
+{
+ return fprm->fn(regs, fprm->data, dest);
+}
+
+/* Check the name is good for event/group/fields */
+static inline int is_good_name(const char *name)
+{
+ if (!isalpha(*name) && *name != '_')
+ return 0;
+ while (*++name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ return 0;
+ }
+ return 1;
+}
+
+static inline struct event_file_link *
+find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+{
+ struct event_file_link *link;
+
+ list_for_each_entry(link, &tp->files, list)
+ if (link->file == file)
+ return link;
+
+ return NULL;
+}
+
+extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+ struct probe_arg *parg, bool is_return, bool is_kprobe);
+
+extern int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg);
+
+extern void traceprobe_update_arg(struct probe_arg *arg);
+extern void traceprobe_free_probe_arg(struct probe_arg *arg);
+
+extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
+
+extern ssize_t traceprobe_probes_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos,
+ int (*createfn)(int, char**));
+
+extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
+
+/* Sum up total data length for dynamic arraies (strings) */
+static nokprobe_inline int
+__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
+{
+ int i, ret = 0;
+ u32 len;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ call_fetch(&tp->args[i].fetch_size, regs, &len);
+ ret += len;
+ }
+
+ return ret;
+}
+
+/* Store the value of each argument */
+static nokprobe_inline void
+store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
+ u8 *data, int maxlen)
+{
+ int i;
+ u32 end = tp->size;
+ u32 *dl; /* Data (relative) location */
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ /*
+ * First, we set the relative location and
+ * maximum data length to *dl
+ */
+ dl = (u32 *)(data + tp->args[i].offset);
+ *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
+ /* Then try to fetch string or dynamic array data */
+ call_fetch(&tp->args[i].fetch, regs, dl);
+ /* Reduce maximum length */
+ end += get_rloc_len(*dl);
+ maxlen -= get_rloc_len(*dl);
+ /* Trick here, convert data_rloc to data_loc */
+ *dl = convert_rloc_to_loc(*dl,
+ ent_size + tp->args[i].offset);
+ } else
+ /* Just fetching data normally */
+ call_fetch(&tp->args[i].fetch, regs,
+ data + tp->args[i].offset);
+ }
+}
+
+extern int set_print_fmt(struct trace_probe *tp, bool is_return);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7e62c0a1845..3f34dc9b40f 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
unsigned long flags, int pc)
{
struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_state = next->state;
entry->next_cpu = task_cpu(next);
- if (!filter_check_discard(call, entry, buffer, event))
+ if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, flags, pc);
}
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = ctx_trace->data[cpu];
+ data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
if (likely(!atomic_read(&data->disabled)))
tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct ftrace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
sizeof(*entry), flags, pc);
@@ -101,10 +101,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_state = wakee->state;
entry->next_cpu = task_cpu(wakee);
- if (!filter_check_discard(call, entry, buffer, event))
- ring_buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr->buffer, flags, 6, pc);
- ftrace_trace_userstack(tr->buffer, flags, pc);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
}
static void
@@ -125,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = ctx_trace->data[cpu];
+ data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
if (likely(!atomic_read(&data->disabled)))
tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ff791ea48b5..19bd8928ce9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
* Based on code from the latency_tracer, that is:
*
* Copyright (C) 2004-2006 Ingo Molnar
- * Copyright (C) 2004 William Lee Irwin III
+ * Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/module.h>
#include <linux/fs.h>
@@ -15,8 +15,9 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
#include <trace/events/sched.h>
-
#include "trace.h"
static struct trace_array *wakeup_trace;
@@ -27,6 +28,8 @@ static int wakeup_cpu;
static int wakeup_current_cpu;
static unsigned wakeup_prio = -1;
static int wakeup_rt;
+static int wakeup_dl;
+static int tracing_dl = 0;
static arch_spinlock_t wakeup_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -36,7 +39,8 @@ static void __wakeup_reset(struct trace_array *tr);
static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
static void wakeup_graph_return(struct ftrace_graph_ret *trace);
-static int save_lat_flag;
+static int save_flags;
+static bool function_enabled;
#define TRACE_DISPLAY_GRAPH 1
@@ -89,7 +93,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (cpu != wakeup_current_cpu)
goto out_enable;
- *data = tr->data[cpu];
+ *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -108,7 +112,8 @@ out_enable:
* wakeup uses its own tracer function to keep the overhead down:
*/
static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
@@ -125,23 +130,64 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
atomic_dec(&data->disabled);
preempt_enable_notrace();
}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = wakeup_tracer_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
-};
#endif /* CONFIG_FUNCTION_TRACER */
-static int start_func_tracer(int graph)
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
- if (!graph)
- ret = register_ftrace_function(&trace_ops);
- else
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
ret = register_ftrace_graph(&wakeup_graph_return,
&wakeup_graph_entry);
+ else
+ ret = register_ftrace_function(tr->ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
+}
+
+static void unregister_wakeup_function(struct trace_array *tr, int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph();
+ else
+ unregister_ftrace_function(tr->ops);
+
+ function_enabled = false;
+}
+
+static void wakeup_function_set(struct trace_array *tr, int set)
+{
+ if (set)
+ register_wakeup_function(tr, is_graph(), 1);
+ else
+ unregister_wakeup_function(tr, is_graph());
+}
+
+static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
+{
+ struct tracer *tracer = tr->current_trace;
+
+ if (mask & TRACE_ITER_FUNCTION)
+ wakeup_function_set(tr, set);
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_func_tracer(struct trace_array *tr, int graph)
+{
+ int ret;
+
+ ret = register_wakeup_function(tr, graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -151,18 +197,16 @@ static int start_func_tracer(int graph)
return ret;
}
-static void stop_func_tracer(int graph)
+static void stop_func_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- if (!graph)
- unregister_ftrace_function(&trace_ops);
- else
- unregister_ftrace_graph();
+ unregister_wakeup_function(tr, graph);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+static int
+wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
if (!(bit & TRACE_DISPLAY_GRAPH))
@@ -171,12 +215,12 @@ static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
if (!(is_graph() ^ set))
return 0;
- stop_func_tracer(!set);
+ stop_func_tracer(tr, !set);
wakeup_reset(wakeup_trace);
- tracing_max_latency = 0;
+ tr->max_latency = 0;
- return start_func_tracer(set);
+ return start_func_tracer(tr, set);
}
static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -264,7 +308,8 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+static int
+wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
return -EINVAL;
}
@@ -299,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(cycle_t delta)
+static int report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
return 0;
} else {
- if (delta <= tracing_max_latency)
+ if (delta <= tr->max_latency)
return 0;
}
return 1;
@@ -352,7 +397,7 @@ probe_wakeup_sched_switch(void *ignore,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -364,7 +409,7 @@ probe_wakeup_sched_switch(void *ignore,
goto out_unlock;
/* The task we are waiting for is waking up */
- data = wakeup_trace->data[wakeup_cpu];
+ data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -373,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
T1 = ftrace_now(cpu);
delta = T1-T0;
- if (!report_latency(delta))
+ if (!report_latency(wakeup_trace, delta))
goto out_unlock;
if (likely(!is_tracing_stopped())) {
- tracing_max_latency = delta;
+ wakeup_trace->max_latency = delta;
update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
}
@@ -386,13 +431,14 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&wakeup_trace->data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
{
wakeup_cpu = -1;
wakeup_prio = -1;
+ tracing_dl = 0;
if (wakeup_task)
put_task_struct(wakeup_task);
@@ -404,7 +450,7 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
local_irq_save(flags);
arch_spin_lock(&wakeup_lock);
@@ -428,13 +474,21 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
tracing_record_cmdline(p);
tracing_record_cmdline(current);
- if ((wakeup_rt && !rt_task(p)) ||
- p->prio >= wakeup_prio ||
- p->prio >= current->prio)
+ /*
+ * Semantic is like this:
+ * - wakeup tracer handles all tasks in the system, independently
+ * from their scheduling class;
+ * - wakeup_rt tracer handles tasks belonging to sched_dl and
+ * sched_rt class;
+ * - wakeup_dl handles tasks belonging to sched_dl class only.
+ */
+ if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
+ (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
pc = preempt_count();
- disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -442,7 +496,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
arch_spin_lock(&wakeup_lock);
/* check for races. */
- if (!tracer_enabled || p->prio >= wakeup_prio)
+ if (!tracer_enabled || tracing_dl ||
+ (!dl_task(p) && p->prio >= wakeup_prio))
goto out_locked;
/* reset the trace */
@@ -452,12 +507,21 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
wakeup_current_cpu = wakeup_cpu;
wakeup_prio = p->prio;
+ /*
+ * Once you start tracing a -deadline task, don't bother tracing
+ * another task until the first one wakes up.
+ */
+ if (dl_task(p))
+ tracing_dl = 1;
+ else
+ tracing_dl = 0;
+
wakeup_task = p;
get_task_struct(wakeup_task);
local_save_flags(flags);
- data = wakeup_trace->data[wakeup_cpu];
+ data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
@@ -471,7 +535,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&wakeup_trace->data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
@@ -517,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
- if (start_func_tracer(is_graph()))
+ if (start_func_tracer(tr, is_graph()))
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
@@ -530,44 +594,75 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- stop_func_tracer(is_graph());
+ stop_func_tracer(tr, is_graph());
unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
unregister_trace_sched_wakeup(probe_wakeup, NULL);
unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
+static bool wakeup_busy;
+
static int __wakeup_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
- tracing_max_latency = 0;
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+
+ tr->max_latency = 0;
wakeup_trace = tr;
+ ftrace_init_array_ops(tr, wakeup_tracer_call);
start_wakeup_tracer(tr);
+
+ wakeup_busy = true;
return 0;
}
static int wakeup_tracer_init(struct trace_array *tr)
{
+ if (wakeup_busy)
+ return -EBUSY;
+
+ wakeup_dl = 0;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
}
static int wakeup_rt_tracer_init(struct trace_array *tr)
{
+ if (wakeup_busy)
+ return -EBUSY;
+
+ wakeup_dl = 0;
wakeup_rt = 1;
return __wakeup_tracer_init(tr);
}
+static int wakeup_dl_tracer_init(struct trace_array *tr)
+{
+ if (wakeup_busy)
+ return -EBUSY;
+
+ wakeup_dl = 1;
+ wakeup_rt = 0;
+ return __wakeup_tracer_init(tr);
+}
+
static void wakeup_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ ftrace_reset_array_ops(tr);
+ wakeup_busy = false;
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -588,17 +683,19 @@ static struct tracer wakeup_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
- .print_max = 1,
+ .print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
- .use_max_tr = 1,
+ .allow_instances = true,
+ .use_max_tr = true,
};
static struct tracer wakeup_rt_tracer __read_mostly =
@@ -608,18 +705,40 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
- .wait_pipe = poll_wait_pipe,
- .print_max = 1,
+ .print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
- .use_max_tr = 1,
+ .allow_instances = true,
+ .use_max_tr = true,
+};
+
+static struct tracer wakeup_dl_tracer __read_mostly =
+{
+ .name = "wakeup_dl",
+ .init = wakeup_dl_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
+ .print_max = true,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
+ .use_max_tr = true,
};
__init static int init_wakeup_tracer(void)
@@ -634,6 +753,10 @@ __init static int init_wakeup_tracer(void)
if (ret)
return ret;
+ ret = register_tracer(&wakeup_dl_tracer);
+ if (ret)
+ return ret;
+
return 0;
}
-device_initcall(init_wakeup_tracer);
+core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 288541f977f..5ef60499dc8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
return 0;
}
-static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
+static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
{
struct ring_buffer_event *event;
struct trace_entry *entry;
unsigned int loops = 0;
- while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
+ while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
entry = ring_buffer_event_data(event);
/*
@@ -58,16 +58,16 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
-static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
/* Don't allow flipping of max traces now */
local_irq_save(flags);
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&buf->tr->max_lock);
- cnt = ring_buffer_entries(tr->buffer);
+ cnt = ring_buffer_entries(buf->buffer);
/*
* The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,12 +78,12 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
*/
tracing_off();
for_each_possible_cpu(cpu) {
- ret = trace_test_buffer_cpu(tr, cpu);
+ ret = trace_test_buffer_cpu(buf, cpu);
if (ret)
break;
}
tracing_on();
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&buf->tr->max_lock);
local_irq_restore(flags);
if (count)
@@ -103,54 +103,62 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
static int trace_selftest_test_probe1_cnt;
static void trace_selftest_test_probe1_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_probe1_cnt++;
}
static int trace_selftest_test_probe2_cnt;
static void trace_selftest_test_probe2_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_probe2_cnt++;
}
static int trace_selftest_test_probe3_cnt;
static void trace_selftest_test_probe3_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_probe3_cnt++;
}
static int trace_selftest_test_global_cnt;
static void trace_selftest_test_global_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_global_cnt++;
}
static int trace_selftest_test_dyn_cnt;
static void trace_selftest_test_dyn_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_dyn_cnt++;
}
static struct ftrace_ops test_probe1 = {
.func = trace_selftest_test_probe1_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe2 = {
.func = trace_selftest_test_probe2_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe3 = {
.func = trace_selftest_test_probe3_func,
-};
-
-static struct ftrace_ops test_global = {
- .func = trace_selftest_test_global_func,
- .flags = FTRACE_OPS_FL_GLOBAL,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static void print_counts(void)
@@ -172,7 +180,7 @@ static void reset_counts(void)
trace_selftest_test_dyn_cnt = 0;
}
-static int trace_selftest_ops(int cnt)
+static int trace_selftest_ops(struct trace_array *tr, int cnt)
{
int save_ftrace_enabled = ftrace_enabled;
struct ftrace_ops *dyn_ops;
@@ -207,7 +215,11 @@ static int trace_selftest_ops(int cnt)
register_ftrace_function(&test_probe1);
register_ftrace_function(&test_probe2);
register_ftrace_function(&test_probe3);
- register_ftrace_function(&test_global);
+ /* First time we are running with main function */
+ if (cnt > 1) {
+ ftrace_init_array_ops(tr, trace_selftest_test_global_func);
+ register_ftrace_function(tr->ops);
+ }
DYN_FTRACE_TEST_NAME();
@@ -219,8 +231,10 @@ static int trace_selftest_ops(int cnt)
goto out;
if (trace_selftest_test_probe3_cnt != 1)
goto out;
- if (trace_selftest_test_global_cnt == 0)
- goto out;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ }
DYN_FTRACE_TEST_NAME2();
@@ -256,8 +270,10 @@ static int trace_selftest_ops(int cnt)
goto out_free;
if (trace_selftest_test_probe3_cnt != 3)
goto out_free;
- if (trace_selftest_test_global_cnt == 0)
- goto out;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ }
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
@@ -282,7 +298,9 @@ static int trace_selftest_ops(int cnt)
unregister_ftrace_function(&test_probe1);
unregister_ftrace_function(&test_probe2);
unregister_ftrace_function(&test_probe3);
- unregister_ftrace_function(&test_global);
+ if (cnt > 1)
+ unregister_ftrace_function(tr->ops);
+ ftrace_reset_array_ops(tr);
/* Make sure everything is off */
reset_counts();
@@ -302,12 +320,11 @@ static int trace_selftest_ops(int cnt)
}
/* Test dynamic code modification and ftrace filters */
-int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
- struct trace_array *tr,
- int (*func)(void))
+static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+ struct trace_array *tr,
+ int (*func)(void))
{
int save_ftrace_enabled = ftrace_enabled;
- int save_tracer_enabled = tracer_enabled;
unsigned long count;
char *func_name;
int ret;
@@ -318,7 +335,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* enable tracing, and record the filter function */
ftrace_enabled = 1;
- tracer_enabled = 1;
/* passed in by parameter to fool gcc from optimizing */
func();
@@ -344,7 +360,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
msleep(100);
/* we should have nothing in the buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
if (ret)
goto out;
@@ -365,7 +381,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
ftrace_enabled = 0;
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
tracing_start();
/* we should only have one item */
@@ -377,45 +393,277 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
}
/* Test the ops with global tracing running */
- ret = trace_selftest_ops(1);
+ ret = trace_selftest_ops(tr, 1);
trace->reset(tr);
out:
ftrace_enabled = save_ftrace_enabled;
- tracer_enabled = save_tracer_enabled;
/* Enable tracing on all functions again */
ftrace_set_global_filter(NULL, 0, 1);
/* Test the ops with global tracing off */
if (!ret)
- ret = trace_selftest_ops(2);
+ ret = trace_selftest_ops(tr, 2);
+
+ return ret;
+}
+
+static int trace_selftest_recursion_cnt;
+static void trace_selftest_test_recursion_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ /*
+ * This function is registered without the recursion safe flag.
+ * The ftrace infrastructure should provide the recursion
+ * protection. If not, this will crash the kernel!
+ */
+ if (trace_selftest_recursion_cnt++ > 10)
+ return;
+ DYN_FTRACE_TEST_NAME();
+}
+
+static void trace_selftest_test_recursion_safe_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ /*
+ * We said we would provide our own recursion. By calling
+ * this function again, we should recurse back into this function
+ * and count again. But this only happens if the arch supports
+ * all of ftrace features and nothing else is using the function
+ * tracing utility.
+ */
+ if (trace_selftest_recursion_cnt++)
+ return;
+ DYN_FTRACE_TEST_NAME();
+}
+
+static struct ftrace_ops test_rec_probe = {
+ .func = trace_selftest_test_recursion_func,
+};
+
+static struct ftrace_ops test_recsafe_probe = {
+ .func = trace_selftest_test_recursion_safe_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static int
+trace_selftest_function_recursion(void)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ char *func_name;
+ int len;
+ int ret;
+
+ /* The previous test PASSED */
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace recursion: ");
+
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+
+ /* Handle PPC64 '.' name */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
+ if (ret) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_rec_probe);
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_rec_probe);
+
+ ret = -1;
+ if (trace_selftest_recursion_cnt != 1) {
+ pr_cont("*callback not called once (%d)* ",
+ trace_selftest_recursion_cnt);
+ goto out;
+ }
+
+ trace_selftest_recursion_cnt = 1;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace recursion safe: ");
+
+ ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
+ if (ret) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_recsafe_probe);
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_recsafe_probe);
+
+ ret = -1;
+ if (trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called expected 2 times (%d)* ",
+ trace_selftest_recursion_cnt);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ftrace_enabled = save_ftrace_enabled;
return ret;
}
#else
# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+# define trace_selftest_function_recursion() ({ 0; })
#endif /* CONFIG_DYNAMIC_FTRACE */
+static enum {
+ TRACE_SELFTEST_REGS_START,
+ TRACE_SELFTEST_REGS_FOUND,
+ TRACE_SELFTEST_REGS_NOT_FOUND,
+} trace_selftest_regs_stat;
+
+static void trace_selftest_test_regs_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ if (pt_regs)
+ trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
+ else
+ trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
+}
+
+static struct ftrace_ops test_regs_probe = {
+ .func = trace_selftest_test_regs_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static int
+trace_selftest_function_regs(void)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ char *func_name;
+ int len;
+ int ret;
+ int supported = 0;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ supported = 1;
+#endif
+
+ /* The previous test PASSED */
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace regs%s: ",
+ !supported ? "(no arch support)" : "");
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+
+ /* Handle PPC64 '.' name */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
+ /*
+ * If DYNAMIC_FTRACE is not set, then we just trace all functions.
+ * This test really doesn't care.
+ */
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_regs_probe);
+ /*
+ * Now if the arch does not support passing regs, then this should
+ * have failed.
+ */
+ if (!supported) {
+ if (!ret) {
+ pr_cont("*registered save-regs without arch support* ");
+ goto out;
+ }
+ test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
+ ret = register_ftrace_function(&test_regs_probe);
+ }
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_regs_probe);
+
+ ret = -1;
+
+ switch (trace_selftest_regs_stat) {
+ case TRACE_SELFTEST_REGS_START:
+ pr_cont("*callback never called* ");
+ goto out;
+
+ case TRACE_SELFTEST_REGS_FOUND:
+ if (supported)
+ break;
+ pr_cont("*callback received regs without arch support* ");
+ goto out;
+
+ case TRACE_SELFTEST_REGS_NOT_FOUND:
+ if (!supported)
+ break;
+ pr_cont("*callback received NULL regs* ");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ftrace_enabled = save_ftrace_enabled;
+
+ return ret;
+}
+
/*
* Simple verification test of ftrace function tracer.
* Enable ftrace, sleep 1/10 second, and then read the trace
* buffer to see if all is in order.
*/
-int
+__init int
trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
{
int save_ftrace_enabled = ftrace_enabled;
- int save_tracer_enabled = tracer_enabled;
unsigned long count;
int ret;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (ftrace_filter_param) {
+ printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
+ return 0;
+ }
+#endif
+
/* make sure msleep has been recorded */
msleep(1);
/* start the tracing */
ftrace_enabled = 1;
- tracer_enabled = 1;
ret = tracer_init(trace, tr);
if (ret) {
@@ -430,7 +678,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ftrace_enabled = 0;
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -442,10 +690,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ret = trace_selftest_startup_dynamic_tracing(trace, tr,
DYN_FTRACE_TEST_NAME);
+ if (ret)
+ goto out;
+
+ ret = trace_selftest_function_recursion();
+ if (ret)
+ goto out;
+ ret = trace_selftest_function_regs();
out:
ftrace_enabled = save_ftrace_enabled;
- tracer_enabled = save_tracer_enabled;
/* kill ftrace totally if we failed */
if (ret)
@@ -461,8 +715,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -472,8 +724,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops)
- __ftrace_dump(false, DUMP_ALL);
+ if (ftrace_dump_on_oops) {
+ ftrace_dump(DUMP_ALL);
+ /* ftrace_dump() disables tracing */
+ tracing_on();
+ }
return 0;
}
@@ -484,18 +739,25 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
* Pretty much the same than for the function tracer from which the selftest
* has been borrowed.
*/
-int
+__init int
trace_selftest_startup_function_graph(struct tracer *trace,
struct trace_array *tr)
{
int ret;
unsigned long count;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (ftrace_filter_param) {
+ printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
+ return 0;
+ }
+#endif
+
/*
* Simulate the init() callback but we attach a watchdog callback
* to detect and recover from possible hangs
*/
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
set_graph_array(tr);
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry_watchdog);
@@ -518,7 +780,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -545,7 +807,7 @@ out:
int
trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -557,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
/* disable interrupts for a bit */
local_irq_disable();
udelay(100);
@@ -573,9 +835,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -584,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
ret = -1;
}
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
return ret;
}
@@ -594,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
int
trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -619,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
/* disable preemption for a bit */
preempt_disable();
udelay(100);
@@ -635,9 +897,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -646,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
ret = -1;
}
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
return ret;
}
@@ -656,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
int
trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -681,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
/* disable preemption and interrupts for a bit */
preempt_disable();
@@ -701,11 +963,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (ret)
goto out;
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
if (ret)
goto out;
@@ -716,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* do the test by disabling interrupts first this time */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
tracing_start();
trace->start(tr);
@@ -731,11 +993,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (ret)
goto out;
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -747,7 +1009,7 @@ out:
tracing_start();
out_no_start:
trace->reset(tr);
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
return ret;
}
@@ -765,11 +1027,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_SCHED_TRACER
static int trace_wakeup_test_thread(void *data)
{
- /* Make this a RT thread, doesn't need to be too high */
- static const struct sched_param param = { .sched_priority = 5 };
+ /* Make this a -deadline thread */
+ static const struct sched_attr attr = {
+ .sched_policy = SCHED_DEADLINE,
+ .sched_runtime = 100000ULL,
+ .sched_deadline = 10000000ULL,
+ .sched_period = 10000000ULL
+ };
struct completion *x = data;
- sched_setscheduler(current, SCHED_FIFO, &param);
+ sched_setattr(current, &attr);
/* Make it know we have a new prio */
complete(x);
@@ -778,11 +1045,13 @@ static int trace_wakeup_test_thread(void *data)
set_current_state(TASK_INTERRUPTIBLE);
schedule();
+ complete(x);
+
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
/*
- * This is an RT task, do short sleeps to let
- * others run.
+ * This will likely be the system top priority
+ * task, do short sleeps to let others run.
*/
msleep(100);
}
@@ -793,23 +1062,23 @@ static int trace_wakeup_test_thread(void *data)
int
trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
struct task_struct *p;
- struct completion isrt;
+ struct completion is_ready;
unsigned long count;
int ret;
- init_completion(&isrt);
+ init_completion(&is_ready);
- /* create a high prio thread */
- p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+ /* create a -deadline thread */
+ p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
if (IS_ERR(p)) {
printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
return -1;
}
- /* make sure the thread is running at an RT prio */
- wait_for_completion(&isrt);
+ /* make sure the thread is running at -deadline policy */
+ wait_for_completion(&is_ready);
/* start the tracing */
ret = tracer_init(trace, tr);
@@ -819,39 +1088,37 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
- /* sleep to let the RT thread sleep too */
- msleep(100);
+ while (p->on_rq) {
+ /*
+ * Sleep to make sure the -deadline thread is asleep too.
+ * On virtual machines we can't rely on timings,
+ * but we want to make sure this test still works.
+ */
+ msleep(100);
+ }
- /*
- * Yes this is slightly racy. It is possible that for some
- * strange reason that the RT thread we created, did not
- * call schedule for 100ms after doing the completion,
- * and we do a wakeup on a task that already is awake.
- * But that is extremely unlikely, and the worst thing that
- * happens in such a case, is that we disable tracing.
- * Honestly, if this race does happen something is horrible
- * wrong with the system.
- */
+ init_completion(&is_ready);
wake_up_process(p);
- /* give a little time to let the thread wake up */
- msleep(100);
+ /* Wait for the task to wake up */
+ wait_for_completion(&is_ready);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ printk("ret = %d\n", ret);
if (!ret)
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
/* kill the thread */
kthread_stop(p);
@@ -884,7 +1151,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -916,7 +1183,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d4545f49242..8a4e5cb66a4 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,7 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/magic.h>
#include <asm/setup.h>
@@ -20,61 +21,114 @@
#define STACK_TRACE_ENTRIES 500
+#ifdef CC_USING_FENTRY
+# define fentry 1
+#else
+# define fentry 0
+#endif
+
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+/*
+ * Reserve one entry for the passed in ip. This will allow
+ * us to remove most or all of the stack size overhead
+ * added by the stack tracer itself.
+ */
static struct stack_trace max_stack_trace = {
- .max_entries = STACK_TRACE_ENTRIES,
- .entries = stack_dump_trace,
+ .max_entries = STACK_TRACE_ENTRIES - 1,
+ .entries = &stack_dump_trace[1],
};
static unsigned long max_stack_size;
static arch_spinlock_t max_stack_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-static int stack_trace_disabled __read_mostly;
static DEFINE_PER_CPU(int, trace_active);
static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
-static inline void check_stack(void)
+static inline void print_max_stack(void)
+{
+ long i;
+ int size;
+
+ pr_emerg(" Depth Size Location (%d entries)\n"
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries - 1);
+
+ for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ if (stack_dump_trace[i] == ULONG_MAX)
+ break;
+ if (i+1 == max_stack_trace.nr_entries ||
+ stack_dump_trace[i+1] == ULONG_MAX)
+ size = stack_dump_index[i];
+ else
+ size = stack_dump_index[i] - stack_dump_index[i+1];
+
+ pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
+ size, (void *)stack_dump_trace[i]);
+ }
+}
+
+static inline void
+check_stack(unsigned long ip, unsigned long *stack)
{
- unsigned long this_size, flags;
- unsigned long *p, *top, *start;
+ unsigned long this_size, flags; unsigned long *p, *top, *start;
+ static int tracer_frame;
+ int frame_size = ACCESS_ONCE(tracer_frame);
int i;
- this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+ this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
+ /* Remove the frame of the tracer */
+ this_size -= frame_size;
if (this_size <= max_stack_size)
return;
/* we do not handle interrupt stacks yet */
- if (!object_is_on_stack(&this_size))
+ if (!object_is_on_stack(stack))
return;
local_irq_save(flags);
arch_spin_lock(&max_stack_lock);
+ /* In case another CPU set the tracer_frame on us */
+ if (unlikely(!frame_size))
+ this_size -= tracer_frame;
+
/* a race could have already updated it */
if (this_size <= max_stack_size)
goto out;
max_stack_size = this_size;
- max_stack_trace.nr_entries = 0;
- max_stack_trace.skip = 3;
+ max_stack_trace.nr_entries = 0;
+
+ if (using_ftrace_ops_list_func())
+ max_stack_trace.skip = 4;
+ else
+ max_stack_trace.skip = 3;
save_stack_trace(&max_stack_trace);
/*
+ * Add the passed in ip from the function tracer.
+ * Searching for this on the stack will skip over
+ * most of the overhead from the stack tracer itself.
+ */
+ stack_dump_trace[0] = ip;
+ max_stack_trace.nr_entries++;
+
+ /*
* Now find where in the stack these are.
*/
i = 0;
- start = &this_size;
+ start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -98,6 +152,18 @@ static inline void check_stack(void)
found = 1;
/* Start the search from here */
start = p + 1;
+ /*
+ * We do not want to show the overhead
+ * of the stack tracer stack in the
+ * max stack. If we haven't figured
+ * out what that is, then figure it out
+ * now.
+ */
+ if (unlikely(!tracer_frame) && i == 1) {
+ tracer_frame = (p - stack) *
+ sizeof(unsigned long);
+ max_stack_size -= tracer_frame;
+ }
}
}
@@ -105,19 +171,24 @@ static inline void check_stack(void)
i++;
}
+ if ((current != &init_task &&
+ *(end_of_stack(current)) != STACK_END_MAGIC)) {
+ print_max_stack();
+ BUG();
+ }
+
out:
arch_spin_unlock(&max_stack_lock);
local_irq_restore(flags);
}
static void
-stack_trace_call(unsigned long ip, unsigned long parent_ip)
+stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
+ unsigned long stack;
int cpu;
- if (unlikely(!ftrace_enabled || stack_trace_disabled))
- return;
-
preempt_disable_notrace();
cpu = raw_smp_processor_id();
@@ -125,7 +196,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- check_stack();
+ /*
+ * When fentry is used, the traced function does not get
+ * its stack frame set up, and we lose the parent.
+ * The ip is pretty useless because the function tracer
+ * was called before that function set up its stack frame.
+ * In this case, we use the parent ip.
+ *
+ * By adding the return address of either the parent ip
+ * or the current ip we can disregard most of the stack usage
+ * caused by the stack tracer itself.
+ *
+ * The function tracer always reports the address of where the
+ * mcount call was, but the stack will hold the return address.
+ */
+ if (fentry)
+ ip = parent_ip;
+ else
+ ip += MCOUNT_INSN_SIZE;
+
+ check_stack(ip, &stack);
out:
per_cpu(trace_active, cpu)--;
@@ -136,6 +226,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static ssize_t
@@ -324,7 +415,7 @@ static const struct file_operations stack_trace_filter_fops = {
.open = stack_trace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = tracing_lseek,
.release = ftrace_regex_release,
};
@@ -373,6 +464,8 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
&max_stack_size, &stack_max_size_fops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e7..7af67360b33 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
/* The root directory for all stat files */
static struct dentry *stat_dir;
-/*
- * Iterate through the rbtree using a post order traversal path
- * to release the next node.
- * It won't necessary release one at each iteration
- * but it will at least advance closer to the next one
- * to be released.
- */
-static struct rb_node *release_next(struct tracer_stat *ts,
- struct rb_node *node)
+static void __reset_stat_session(struct stat_session *session)
{
- struct stat_node *snode;
- struct rb_node *parent = rb_parent(node);
-
- if (node->rb_left)
- return node->rb_left;
- else if (node->rb_right)
- return node->rb_right;
- else {
- if (!parent)
- ;
- else if (parent->rb_left == node)
- parent->rb_left = NULL;
- else
- parent->rb_right = NULL;
+ struct stat_node *snode, *n;
- snode = container_of(node, struct stat_node, node);
- if (ts->stat_release)
- ts->stat_release(snode->stat);
+ rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) {
+ if (session->ts->stat_release)
+ session->ts->stat_release(snode->stat);
kfree(snode);
-
- return parent;
}
-}
-
-static void __reset_stat_session(struct stat_session *session)
-{
- struct rb_node *node = session->stat_root.rb_node;
-
- while (node)
- node = release_next(session->ts, node);
session->stat_root = RB_ROOT;
}
@@ -307,6 +276,8 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
+ if (!d_tracing)
+ return 0;
stat_dir = debugfs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cb654542c1a..759d5e00451 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
+#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
@@ -11,18 +12,11 @@
#include "trace.h"
static DEFINE_MUTEX(syscall_trace_lock);
-static int sys_refcount_enter;
-static int sys_refcount_exit;
-static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
static int syscall_enter_register(struct ftrace_event_call *event,
- enum trace_reg type);
+ enum trace_reg type, void *data);
static int syscall_exit_register(struct ftrace_event_call *event,
- enum trace_reg type);
-
-static int syscall_enter_define_fields(struct ftrace_event_call *call);
-static int syscall_exit_define_fields(struct ftrace_event_call *call);
+ enum trace_reg type, void *data);
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
@@ -32,30 +26,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
return &entry->enter_fields;
}
-struct trace_event_functions enter_syscall_print_funcs = {
- .trace = print_syscall_enter,
-};
-
-struct trace_event_functions exit_syscall_print_funcs = {
- .trace = print_syscall_exit,
-};
-
-struct ftrace_event_class event_class_syscall_enter = {
- .system = "syscalls",
- .reg = syscall_enter_register,
- .define_fields = syscall_enter_define_fields,
- .get_fields = syscall_get_enter_fields,
- .raw_init = init_syscall_trace,
-};
-
-struct ftrace_event_class event_class_syscall_exit = {
- .system = "syscalls",
- .reg = syscall_exit_register,
- .define_fields = syscall_exit_define_fields,
- .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
- .raw_init = init_syscall_trace,
-};
-
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
@@ -67,13 +37,45 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
/*
* Only compare after the "sys" prefix. Archs that use
* syscall wrappers may have syscalls symbols aliases prefixed
- * with "SyS" instead of "sys", leading to an unwanted
+ * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
* mismatch.
*/
return !strcmp(sym + 3, name + 3);
}
#endif
+#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
+/*
+ * Some architectures that allow for 32bit applications
+ * to run on a 64bit kernel, do not map the syscalls for
+ * the 32bit tasks the same as they do for 64bit tasks.
+ *
+ * *cough*x86*cough*
+ *
+ * In such a case, instead of reporting the wrong syscalls,
+ * simply ignore them.
+ *
+ * For an arch to ignore the compat syscalls it needs to
+ * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
+ * define the function arch_trace_is_compat_syscall() to let
+ * the tracing system know that it should ignore it.
+ */
+static int
+trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ if (unlikely(arch_trace_is_compat_syscall(regs)))
+ return -1;
+
+ return syscall_get_nr(task, regs);
+}
+#else
+static inline int
+trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ return syscall_get_nr(task, regs);
+}
+#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
+
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
@@ -104,7 +106,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
return syscalls_metadata[nr];
}
-enum print_line_t
+static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -157,7 +159,7 @@ end:
return TRACE_TYPE_HANDLED;
}
-enum print_line_t
+static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -173,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
entry = syscall_nr_to_meta(syscall);
if (!entry) {
- trace_seq_printf(s, "\n");
+ trace_seq_putc(s, '\n');
return TRACE_TYPE_HANDLED;
}
@@ -198,8 +200,8 @@ extern char *__bad_type_size(void);
#type, #name, offsetof(typeof(trace), name), \
sizeof(trace.name), is_signed_type(type)
-static
-int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+static int __init
+__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
int i;
int pos = 0;
@@ -226,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
return pos;
}
-static int set_syscall_print_fmt(struct ftrace_event_call *call)
+static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
{
char *print_fmt;
int len;
@@ -251,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call)
return 0;
}
-static void free_syscall_print_fmt(struct ftrace_event_call *call)
+static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
{
struct syscall_metadata *entry = call->data;
@@ -259,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-static int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -282,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-static int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -297,19 +299,29 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
+ struct trace_array *tr = data;
+ struct ftrace_event_file *ftrace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- int size;
+ unsigned long irq_flags;
+ int pc;
int syscall_nr;
+ int size;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
- if (!test_bit(syscall_nr, enabled_enter_syscalls))
+
+ /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
+ ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+ if (!ftrace_file)
+ return;
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -318,8 +330,12 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->enter_event->event.type, size, 0, 0);
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer,
+ sys_data->enter_event->event.type, size, irq_flags, pc);
if (!event)
return;
@@ -327,31 +343,45 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
- if (!filter_current_check_discard(buffer, sys_data->enter_event,
- entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ irq_flags, pc);
}
-void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
+ struct trace_array *tr = data;
+ struct ftrace_event_file *ftrace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
+ unsigned long irq_flags;
+ int pc;
int syscall_nr;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
- if (!test_bit(syscall_nr, enabled_exit_syscalls))
+
+ /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
+ ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+ if (!ftrace_file)
+ return;
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
- event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer,
+ sys_data->exit_event->event.type, sizeof(*entry),
+ irq_flags, pc);
if (!event)
return;
@@ -359,13 +389,14 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- if (!filter_current_check_discard(buffer, sys_data->exit_event,
- entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ irq_flags, pc);
}
-int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int ret = 0;
int num;
@@ -373,33 +404,37 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
- if (!sys_refcount_enter)
- ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
+ if (!tr->sys_refcount_enter)
+ ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
if (!ret) {
- set_bit(num, enabled_enter_syscalls);
- sys_refcount_enter++;
+ rcu_assign_pointer(tr->enter_syscall_files[num], file);
+ tr->sys_refcount_enter++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
-void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
- sys_refcount_enter--;
- clear_bit(num, enabled_enter_syscalls);
- if (!sys_refcount_enter)
- unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
+ tr->sys_refcount_enter--;
+ rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
+ if (!tr->sys_refcount_enter)
+ unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
}
-int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int ret = 0;
int num;
@@ -407,32 +442,34 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
- if (!sys_refcount_exit)
- ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
+ if (!tr->sys_refcount_exit)
+ ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
- set_bit(num, enabled_exit_syscalls);
- sys_refcount_exit++;
+ rcu_assign_pointer(tr->exit_syscall_files[num], file);
+ tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
-void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
- sys_refcount_exit--;
- clear_bit(num, enabled_exit_syscalls);
- if (!sys_refcount_exit)
- unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
+ tr->sys_refcount_exit--;
+ rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
+ if (!tr->sys_refcount_exit)
+ unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
}
-int init_syscall_trace(struct ftrace_event_call *call)
+static int __init init_syscall_trace(struct ftrace_event_call *call)
{
int id;
int num;
@@ -457,19 +494,43 @@ int init_syscall_trace(struct ftrace_event_call *call)
return id;
}
+struct trace_event_functions enter_syscall_print_funcs = {
+ .trace = print_syscall_enter,
+};
+
+struct trace_event_functions exit_syscall_print_funcs = {
+ .trace = print_syscall_exit,
+};
+
+struct ftrace_event_class __refdata event_class_syscall_enter = {
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
+};
+
+struct ftrace_event_class __refdata event_class_syscall_exit = {
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
+ .raw_init = init_syscall_trace,
+};
+
unsigned long __init __weak arch_syscall_addr(int nr)
{
return (unsigned long)sys_call_table[nr];
}
-int __init init_ftrace_syscalls(void)
+static int __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
int i;
- syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
- NR_syscalls, GFP_KERNEL);
+ syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
+ GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
return -ENOMEM;
@@ -487,7 +548,7 @@ int __init init_ftrace_syscalls(void)
return 0;
}
-core_initcall(init_ftrace_syscalls);
+early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
@@ -505,7 +566,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int rctx;
int size;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
@@ -513,15 +576,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
/* get the size after alignment with the u32 buffer size field */
size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
- return;
-
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
sys_data->enter_event->event.type, regs, &rctx);
if (!rec)
@@ -530,12 +593,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
-
- head = this_cpu_ptr(sys_data->enter_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
-int perf_sysenter_enable(struct ftrace_event_call *call)
+static int perf_sysenter_enable(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -556,7 +617,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
return ret;
}
-void perf_sysenter_disable(struct ftrace_event_call *call)
+static void perf_sysenter_disable(struct ftrace_event_call *call)
{
int num;
@@ -579,7 +640,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int rctx;
int size;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;
@@ -587,18 +650,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- /*
- * Impossible, but be paranoid with the future
- * How to put this check outside runtime?
- */
- if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "exit event has grown above perf buffer size"))
- return;
-
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
sys_data->exit_event->event.type, regs, &rctx);
if (!rec)
@@ -606,12 +665,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
-
- head = this_cpu_ptr(sys_data->exit_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
-int perf_sysexit_enable(struct ftrace_event_call *call)
+static int perf_sysexit_enable(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -632,7 +689,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
return ret;
}
-void perf_sysexit_disable(struct ftrace_event_call *call)
+static void perf_sysexit_disable(struct ftrace_event_call *call)
{
int num;
@@ -649,13 +706,15 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
#endif /* CONFIG_PERF_EVENTS */
static int syscall_enter_register(struct ftrace_event_call *event,
- enum trace_reg type)
+ enum trace_reg type, void *data)
{
+ struct ftrace_event_file *file = data;
+
switch (type) {
case TRACE_REG_REGISTER:
- return reg_event_syscall_enter(event);
+ return reg_event_syscall_enter(file, event);
case TRACE_REG_UNREGISTER:
- unreg_event_syscall_enter(event);
+ unreg_event_syscall_enter(file, event);
return 0;
#ifdef CONFIG_PERF_EVENTS
@@ -664,19 +723,26 @@ static int syscall_enter_register(struct ftrace_event_call *event,
case TRACE_REG_PERF_UNREGISTER:
perf_sysenter_disable(event);
return 0;
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
#endif
}
return 0;
}
static int syscall_exit_register(struct ftrace_event_call *event,
- enum trace_reg type)
+ enum trace_reg type, void *data)
{
+ struct ftrace_event_file *file = data;
+
switch (type) {
case TRACE_REG_REGISTER:
- return reg_event_syscall_exit(event);
+ return reg_event_syscall_exit(file, event);
case TRACE_REG_UNREGISTER:
- unreg_event_syscall_exit(event);
+ unreg_event_syscall_exit(file, event);
return 0;
#ifdef CONFIG_PERF_EVENTS
@@ -685,6 +751,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,
case TRACE_REG_PERF_UNREGISTER:
perf_sysexit_disable(event);
return 0;
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
#endif
}
return 0;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
new file mode 100644
index 00000000000..3c9b97e6b1f
--- /dev/null
+++ b/kernel/trace/trace_uprobe.c
@@ -0,0 +1,1340 @@
+/*
+ * uprobes-based tracing events
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corporation, 2010-2012
+ * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+
+#include "trace_probe.h"
+
+#define UPROBE_EVENT_SYSTEM "uprobes"
+
+struct uprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long vaddr[];
+};
+
+#define SIZEOF_TRACE_ENTRY(is_return) \
+ (sizeof(struct uprobe_trace_entry_head) + \
+ sizeof(unsigned long) * (is_return ? 2 : 1))
+
+#define DATAOF_TRACE_ENTRY(entry, is_return) \
+ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
+
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
+/*
+ * uprobe event core functions
+ */
+struct trace_uprobe {
+ struct list_head list;
+ struct trace_uprobe_filter filter;
+ struct uprobe_consumer consumer;
+ struct inode *inode;
+ char *filename;
+ unsigned long offset;
+ unsigned long nhit;
+ struct trace_probe tp;
+};
+
+#define SIZEOF_TRACE_UPROBE(n) \
+ (offsetof(struct trace_uprobe, tp.args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+static int register_uprobe_event(struct trace_uprobe *tu);
+static int unregister_uprobe_event(struct trace_uprobe *tu);
+
+static DEFINE_MUTEX(uprobe_lock);
+static LIST_HEAD(uprobe_list);
+
+struct uprobe_dispatch_data {
+ struct trace_uprobe *tu;
+ unsigned long bp_addr;
+};
+
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+ unsigned long func, struct pt_regs *regs);
+
+#ifdef CONFIG_STACK_GROWSUP
+static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
+{
+ return addr - (n * sizeof(long));
+}
+#else
+static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
+{
+ return addr + (n * sizeof(long));
+}
+#endif
+
+static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long ret;
+ unsigned long addr = user_stack_pointer(regs);
+
+ addr = adjust_stack_addr(addr, n);
+
+ if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
+ return 0;
+
+ return ret;
+}
+
+/*
+ * Uprobes-specific fetch functions
+ */
+#define DEFINE_FETCH_stack(type) \
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)get_user_stack_nth(regs, \
+ ((unsigned long)offset)); \
+}
+DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
+
+#define DEFINE_FETCH_memory(type) \
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
+ void *addr, void *dest) \
+{ \
+ type retval; \
+ void __user *vaddr = (void __force __user *) addr; \
+ \
+ if (copy_from_user(&retval, vaddr, sizeof(type))) \
+ *(type *)dest = 0; \
+ else \
+ *(type *) dest = retval; \
+}
+DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ long ret;
+ u32 rloc = *(u32 *)dest;
+ int maxlen = get_rloc_len(rloc);
+ u8 *dst = get_rloc_data(dest);
+ void __user *src = (void __force __user *) addr;
+
+ if (!maxlen)
+ return;
+
+ ret = strncpy_from_user(dst, src, maxlen);
+
+ if (ret < 0) { /* Failed to fetch string */
+ ((u8 *)get_rloc_data(dest))[0] = '\0';
+ *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
+ } else {
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
+ }
+}
+
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ int len;
+ void __user *vaddr = (void __force __user *) addr;
+
+ len = strnlen_user(vaddr, MAX_STRING_SIZE);
+
+ if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */
+ *(u32 *)dest = 0;
+ else
+ *(u32 *)dest = len;
+}
+
+static unsigned long translate_user_vaddr(void *file_offset)
+{
+ unsigned long base_addr;
+ struct uprobe_dispatch_data *udd;
+
+ udd = (void *) current->utask->vaddr;
+
+ base_addr = udd->bp_addr - udd->tu->offset;
+ return base_addr + (unsigned long)file_offset;
+}
+
+#define DEFINE_FETCH_file_offset(type) \
+static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
+ void *offset, void *dest)\
+{ \
+ void *vaddr = (void *)translate_user_vaddr(offset); \
+ \
+ FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \
+}
+DEFINE_BASIC_FETCH_FUNCS(file_offset)
+DEFINE_FETCH_file_offset(string)
+DEFINE_FETCH_file_offset(string_size)
+
+/* Fetch type information table */
+const struct fetch_type uprobes_fetch_type_table[] = {
+ /* Special types */
+ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+ sizeof(u32), 1, "__data_loc char[]"),
+ [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+ string_size, sizeof(u32), 0, "u32"),
+ /* Basic types */
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+
+ ASSIGN_FETCH_TYPE_END
+};
+
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+ rwlock_init(&filter->rwlock);
+ filter->nr_systemwide = 0;
+ INIT_LIST_HEAD(&filter->perf_events);
+}
+
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+ return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
+
+static inline bool is_ret_probe(struct trace_uprobe *tu)
+{
+ return tu->consumer.ret_handler != NULL;
+}
+
+/*
+ * Allocate new trace_uprobe and initialize it (including uprobes).
+ */
+static struct trace_uprobe *
+alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
+{
+ struct trace_uprobe *tu;
+
+ if (!event || !is_good_name(event))
+ return ERR_PTR(-EINVAL);
+
+ if (!group || !is_good_name(group))
+ return ERR_PTR(-EINVAL);
+
+ tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
+ if (!tu)
+ return ERR_PTR(-ENOMEM);
+
+ tu->tp.call.class = &tu->tp.class;
+ tu->tp.call.name = kstrdup(event, GFP_KERNEL);
+ if (!tu->tp.call.name)
+ goto error;
+
+ tu->tp.class.system = kstrdup(group, GFP_KERNEL);
+ if (!tu->tp.class.system)
+ goto error;
+
+ INIT_LIST_HEAD(&tu->list);
+ INIT_LIST_HEAD(&tu->tp.files);
+ tu->consumer.handler = uprobe_dispatcher;
+ if (is_ret)
+ tu->consumer.ret_handler = uretprobe_dispatcher;
+ init_trace_uprobe_filter(&tu->filter);
+ tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
+ return tu;
+
+error:
+ kfree(tu->tp.call.name);
+ kfree(tu);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_trace_uprobe(struct trace_uprobe *tu)
+{
+ int i;
+
+ for (i = 0; i < tu->tp.nr_args; i++)
+ traceprobe_free_probe_arg(&tu->tp.args[i]);
+
+ iput(tu->inode);
+ kfree(tu->tp.call.class->system);
+ kfree(tu->tp.call.name);
+ kfree(tu->filename);
+ kfree(tu);
+}
+
+static struct trace_uprobe *find_probe_event(const char *event, const char *group)
+{
+ struct trace_uprobe *tu;
+
+ list_for_each_entry(tu, &uprobe_list, list)
+ if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
+ strcmp(tu->tp.call.class->system, group) == 0)
+ return tu;
+
+ return NULL;
+}
+
+/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
+static int unregister_trace_uprobe(struct trace_uprobe *tu)
+{
+ int ret;
+
+ ret = unregister_uprobe_event(tu);
+ if (ret)
+ return ret;
+
+ list_del(&tu->list);
+ free_trace_uprobe(tu);
+ return 0;
+}
+
+/* Register a trace_uprobe and probe_event */
+static int register_trace_uprobe(struct trace_uprobe *tu)
+{
+ struct trace_uprobe *old_tu;
+ int ret;
+
+ mutex_lock(&uprobe_lock);
+
+ /* register as an event */
+ old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+ tu->tp.call.class->system);
+ if (old_tu) {
+ /* delete old event */
+ ret = unregister_trace_uprobe(old_tu);
+ if (ret)
+ goto end;
+ }
+
+ ret = register_uprobe_event(tu);
+ if (ret) {
+ pr_warning("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ list_add_tail(&tu->list, &uprobe_list);
+
+end:
+ mutex_unlock(&uprobe_lock);
+
+ return ret;
+}
+
+/*
+ * Argument syntax:
+ * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
+ *
+ * - Remove uprobe: -:[GRP/]EVENT
+ */
+static int create_trace_uprobe(int argc, char **argv)
+{
+ struct trace_uprobe *tu;
+ struct inode *inode;
+ char *arg, *event, *group, *filename;
+ char buf[MAX_EVENT_NAME_LEN];
+ struct path path;
+ unsigned long offset;
+ bool is_delete, is_return;
+ int i, ret;
+
+ inode = NULL;
+ ret = 0;
+ is_delete = false;
+ is_return = false;
+ event = NULL;
+ group = NULL;
+
+ /* argc must be >= 1 */
+ if (argv[0][0] == '-')
+ is_delete = true;
+ else if (argv[0][0] == 'r')
+ is_return = true;
+ else if (argv[0][0] != 'p') {
+ pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][1] == ':') {
+ event = &argv[0][2];
+ arg = strchr(event, '/');
+
+ if (arg) {
+ group = event;
+ event = arg + 1;
+ event[-1] = '\0';
+
+ if (strlen(group) == 0) {
+ pr_info("Group name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (!group)
+ group = UPROBE_EVENT_SYSTEM;
+
+ if (is_delete) {
+ int ret;
+
+ if (!event) {
+ pr_info("Delete command needs an event name.\n");
+ return -EINVAL;
+ }
+ mutex_lock(&uprobe_lock);
+ tu = find_probe_event(event, group);
+
+ if (!tu) {
+ mutex_unlock(&uprobe_lock);
+ pr_info("Event %s/%s doesn't exist.\n", group, event);
+ return -ENOENT;
+ }
+ /* delete an event */
+ ret = unregister_trace_uprobe(tu);
+ mutex_unlock(&uprobe_lock);
+ return ret;
+ }
+
+ if (argc < 2) {
+ pr_info("Probe point is not specified.\n");
+ return -EINVAL;
+ }
+ if (isdigit(argv[1][0])) {
+ pr_info("probe point must be have a filename.\n");
+ return -EINVAL;
+ }
+ arg = strchr(argv[1], ':');
+ if (!arg) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
+ *arg++ = '\0';
+ filename = argv[1];
+ ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto fail_address_parse;
+
+ inode = igrab(path.dentry->d_inode);
+ path_put(&path);
+
+ if (!inode || !S_ISREG(inode->i_mode)) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
+ ret = kstrtoul(arg, 0, &offset);
+ if (ret)
+ goto fail_address_parse;
+
+ argc -= 2;
+ argv += 2;
+
+ /* setup a probe */
+ if (!event) {
+ char *tail;
+ char *ptr;
+
+ tail = kstrdup(kbasename(filename), GFP_KERNEL);
+ if (!tail) {
+ ret = -ENOMEM;
+ goto fail_address_parse;
+ }
+
+ ptr = strpbrk(tail, ".-_");
+ if (ptr)
+ *ptr = '\0';
+
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
+ event = buf;
+ kfree(tail);
+ }
+
+ tu = alloc_trace_uprobe(group, event, argc, is_return);
+ if (IS_ERR(tu)) {
+ pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
+ ret = PTR_ERR(tu);
+ goto fail_address_parse;
+ }
+ tu->offset = offset;
+ tu->inode = inode;
+ tu->filename = kstrdup(filename, GFP_KERNEL);
+
+ if (!tu->filename) {
+ pr_info("Failed to allocate filename.\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* parse arguments */
+ ret = 0;
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ struct probe_arg *parg = &tu->tp.args[i];
+
+ /* Increment count for freeing args in error case */
+ tu->tp.nr_args++;
+
+ /* Parse argument name */
+ arg = strchr(argv[i], '=');
+ if (arg) {
+ *arg++ = '\0';
+ parg->name = kstrdup(argv[i], GFP_KERNEL);
+ } else {
+ arg = argv[i];
+ /* If argument name is omitted, set "argN" */
+ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+ parg->name = kstrdup(buf, GFP_KERNEL);
+ }
+
+ if (!parg->name) {
+ pr_info("Failed to allocate argument[%d] name.\n", i);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (!is_good_name(parg->name)) {
+ pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
+ pr_info("Argument[%d] name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Parse fetch argument */
+ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
+ is_return, false);
+ if (ret) {
+ pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ goto error;
+ }
+ }
+
+ ret = register_trace_uprobe(tu);
+ if (ret)
+ goto error;
+ return 0;
+
+error:
+ free_trace_uprobe(tu);
+ return ret;
+
+fail_address_parse:
+ if (inode)
+ iput(inode);
+
+ pr_info("Failed to parse address or file.\n");
+
+ return ret;
+}
+
+static int cleanup_all_probes(void)
+{
+ struct trace_uprobe *tu;
+ int ret = 0;
+
+ mutex_lock(&uprobe_lock);
+ while (!list_empty(&uprobe_list)) {
+ tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
+ ret = unregister_trace_uprobe(tu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&uprobe_lock);
+ return ret;
+}
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&uprobe_lock);
+ return seq_list_start(&uprobe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &uprobe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&uprobe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_uprobe *tu = v;
+ char c = is_ret_probe(tu) ? 'r' : 'p';
+ int i;
+
+ seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
+ ftrace_event_name(&tu->tp.call));
+ seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
+
+ for (i = 0; i < tu->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
+
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = cleanup_all_probes();
+ if (ret)
+ return ret;
+ }
+
+ return seq_open(file, &probes_seq_op);
+}
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+}
+
+static const struct file_operations uprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_uprobe *tu = v;
+
+ seq_printf(m, " %s %-44s %15lu\n", tu->filename,
+ ftrace_event_name(&tu->tp.call), tu->nhit);
+ return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations uprobe_profile_ops = {
+ .owner = THIS_MODULE,
+ .open = profile_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+struct uprobe_cpu_buffer {
+ struct mutex mutex;
+ void *buf;
+};
+static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
+static int uprobe_buffer_refcnt;
+
+static int uprobe_buffer_init(void)
+{
+ int cpu, err_cpu;
+
+ uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
+ if (uprobe_cpu_buffer == NULL)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct page *p = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL, 0);
+ if (p == NULL) {
+ err_cpu = cpu;
+ goto err;
+ }
+ per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
+ mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
+ }
+
+ return 0;
+
+err:
+ for_each_possible_cpu(cpu) {
+ if (cpu == err_cpu)
+ break;
+ free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
+ }
+
+ free_percpu(uprobe_cpu_buffer);
+ return -ENOMEM;
+}
+
+static int uprobe_buffer_enable(void)
+{
+ int ret = 0;
+
+ BUG_ON(!mutex_is_locked(&event_mutex));
+
+ if (uprobe_buffer_refcnt++ == 0) {
+ ret = uprobe_buffer_init();
+ if (ret < 0)
+ uprobe_buffer_refcnt--;
+ }
+
+ return ret;
+}
+
+static void uprobe_buffer_disable(void)
+{
+ int cpu;
+
+ BUG_ON(!mutex_is_locked(&event_mutex));
+
+ if (--uprobe_buffer_refcnt == 0) {
+ for_each_possible_cpu(cpu)
+ free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
+ cpu)->buf);
+
+ free_percpu(uprobe_cpu_buffer);
+ uprobe_cpu_buffer = NULL;
+ }
+}
+
+static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
+{
+ struct uprobe_cpu_buffer *ucb;
+ int cpu;
+
+ cpu = raw_smp_processor_id();
+ ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
+
+ /*
+ * Use per-cpu buffers for fastest access, but we might migrate
+ * so the mutex makes sure we have sole access to it.
+ */
+ mutex_lock(&ucb->mutex);
+
+ return ucb;
+}
+
+static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
+{
+ mutex_unlock(&ucb->mutex);
+}
+
+static void __uprobe_trace_func(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize,
+ struct ftrace_event_file *ftrace_file)
+{
+ struct uprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ void *data;
+ int size, esize;
+ struct ftrace_event_call *call = &tu->tp.call;
+
+ WARN_ON(call != ftrace_file->event_call);
+
+ if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
+ return;
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ size = esize + tu->tp.size + dsize;
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type, size, 0, 0);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ if (is_ret_probe(tu)) {
+ entry->vaddr[0] = func;
+ entry->vaddr[1] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ entry->vaddr[0] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
+ memcpy(data, ucb->buf, tu->tp.size + dsize);
+
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
+}
+
+/* uprobe handler */
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ struct event_file_link *link;
+
+ if (is_ret_probe(tu))
+ return 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(link, &tu->tp.files, list)
+ __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ struct event_file_link *link;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(link, &tu->tp.files, list)
+ __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ rcu_read_unlock();
+}
+
+/* Event entry printers */
+static enum print_line_t
+print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
+{
+ struct uprobe_trace_entry_head *entry;
+ struct trace_seq *s = &iter->seq;
+ struct trace_uprobe *tu;
+ u8 *data;
+ int i;
+
+ entry = (struct uprobe_trace_entry_head *)iter->ent;
+ tu = container_of(event, struct trace_uprobe, tp.call.event);
+
+ if (is_ret_probe(tu)) {
+ if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[1], entry->vaddr[0]))
+ goto partial;
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ if (!trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[0]))
+ goto partial;
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
+ for (i = 0; i < tu->tp.nr_args; i++) {
+ struct probe_arg *parg = &tu->tp.args[i];
+
+ if (!parg->type->print(s, parg->name, data + parg->offset, entry))
+ goto partial;
+ }
+
+ if (trace_seq_puts(s, "\n"))
+ return TRACE_TYPE_HANDLED;
+
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
+ enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm);
+
+static int
+probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+ filter_func_t filter)
+{
+ bool enabled = trace_probe_is_enabled(&tu->tp);
+ struct event_file_link *link = NULL;
+ int ret;
+
+ if (file) {
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ return -EINTR;
+
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return -ENOMEM;
+
+ link->file = file;
+ list_add_tail_rcu(&link->list, &tu->tp.files);
+
+ tu->tp.flags |= TP_FLAG_TRACE;
+ } else {
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ return -EINTR;
+
+ tu->tp.flags |= TP_FLAG_PROFILE;
+ }
+
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ if (enabled)
+ return 0;
+
+ ret = uprobe_buffer_enable();
+ if (ret)
+ goto err_flags;
+
+ tu->consumer.filter = filter;
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ if (ret)
+ goto err_buffer;
+
+ return 0;
+
+ err_buffer:
+ uprobe_buffer_disable();
+
+ err_flags:
+ if (file) {
+ list_del(&link->list);
+ kfree(link);
+ tu->tp.flags &= ~TP_FLAG_TRACE;
+ } else {
+ tu->tp.flags &= ~TP_FLAG_PROFILE;
+ }
+ return ret;
+}
+
+static void
+probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
+{
+ if (!trace_probe_is_enabled(&tu->tp))
+ return;
+
+ if (file) {
+ struct event_file_link *link;
+
+ link = find_event_file_link(&tu->tp, file);
+ if (!link)
+ return;
+
+ list_del_rcu(&link->list);
+ /* synchronize with u{,ret}probe_trace_func */
+ synchronize_sched();
+ kfree(link);
+
+ if (!list_empty(&tu->tp.files))
+ return;
+ }
+
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
+
+ uprobe_buffer_disable();
+}
+
+static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i, size;
+ struct uprobe_trace_entry_head field;
+ struct trace_uprobe *tu = event_call->data;
+
+ if (is_ret_probe(tu)) {
+ DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
+ size = SIZEOF_TRACE_ENTRY(true);
+ } else {
+ DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
+ size = SIZEOF_TRACE_ENTRY(false);
+ }
+ /* Set argument names as fields */
+ for (i = 0; i < tu->tp.nr_args; i++) {
+ struct probe_arg *parg = &tu->tp.args[i];
+
+ ret = trace_define_field(event_call, parg->type->fmttype,
+ parg->name, size + parg->offset,
+ parg->type->size, parg->type->is_signed,
+ FILTER_OTHER);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+ struct perf_event *event;
+
+ if (filter->nr_systemwide)
+ return true;
+
+ list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+ if (event->hw.tp_target->mm == mm)
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+ return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+}
+
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ list_del(&event->hw.tp_list);
+ done = tu->filter.nr_systemwide ||
+ (event->hw.tp_target->flags & PF_EXITING) ||
+ uprobe_filter_event(tu, event);
+ } else {
+ tu->filter.nr_systemwide--;
+ done = tu->filter.nr_systemwide;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+
+ return 0;
+}
+
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+ int err;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
+ done = tu->filter.nr_systemwide ||
+ event->parent || event->attr.enable_on_exec ||
+ uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ } else {
+ done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ err = 0;
+ if (!done) {
+ err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ if (err)
+ uprobe_perf_close(tu, event);
+ }
+ return err;
+}
+
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct trace_uprobe *tu;
+ int ret;
+
+ tu = container_of(uc, struct trace_uprobe, consumer);
+ read_lock(&tu->filter.rwlock);
+ ret = __uprobe_perf_filter(&tu->filter, mm);
+ read_unlock(&tu->filter.rwlock);
+
+ return ret;
+}
+
+static void __uprobe_perf_func(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ struct ftrace_event_call *call = &tu->tp.call;
+ struct uprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ void *data;
+ int size, esize;
+ int rctx;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ size = esize + tu->tp.size + dsize;
+ size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
+ return;
+
+ preempt_disable();
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ goto out;
+
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ if (!entry)
+ goto out;
+
+ if (is_ret_probe(tu)) {
+ entry->vaddr[0] = func;
+ entry->vaddr[1] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ entry->vaddr[0] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
+ memcpy(data, ucb->buf, tu->tp.size + dsize);
+
+ if (size - esize > tu->tp.size + dsize) {
+ int len = tu->tp.size + dsize;
+
+ memset(data + len, 0, size - esize - len);
+ }
+
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+ out:
+ preempt_enable();
+}
+
+/* uprobe profile handler */
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ return UPROBE_HANDLER_REMOVE;
+
+ if (!is_ret_probe(tu))
+ __uprobe_perf_func(tu, 0, regs, ucb, dsize);
+ return 0;
+}
+
+static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ __uprobe_perf_func(tu, func, regs, ucb, dsize);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static int
+trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+ void *data)
+{
+ struct trace_uprobe *tu = event->data;
+ struct ftrace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(tu, file, NULL);
+
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(tu, file);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_event_enable(tu, NULL, uprobe_perf_filter);
+
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_event_disable(tu, NULL);
+ return 0;
+
+ case TRACE_REG_PERF_OPEN:
+ return uprobe_perf_open(tu, data);
+
+ case TRACE_REG_PERF_CLOSE:
+ return uprobe_perf_close(tu, data);
+
+#endif
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+{
+ struct trace_uprobe *tu;
+ struct uprobe_dispatch_data udd;
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+ int ret = 0;
+
+
+ tu = container_of(con, struct trace_uprobe, consumer);
+ tu->nhit++;
+
+ udd.tu = tu;
+ udd.bp_addr = instruction_pointer(regs);
+
+ current->utask->vaddr = (unsigned long) &udd;
+
+ if (WARN_ON_ONCE(!uprobe_cpu_buffer))
+ return 0;
+
+ dsize = __get_data_size(&tu->tp, regs);
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ ucb = uprobe_buffer_get();
+ store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ ret |= uprobe_trace_func(tu, regs, ucb, dsize);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ ret |= uprobe_perf_func(tu, regs, ucb, dsize);
+#endif
+ uprobe_buffer_put(ucb);
+ return ret;
+}
+
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+ unsigned long func, struct pt_regs *regs)
+{
+ struct trace_uprobe *tu;
+ struct uprobe_dispatch_data udd;
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+
+ tu = container_of(con, struct trace_uprobe, consumer);
+
+ udd.tu = tu;
+ udd.bp_addr = func;
+
+ current->utask->vaddr = (unsigned long) &udd;
+
+ if (WARN_ON_ONCE(!uprobe_cpu_buffer))
+ return 0;
+
+ dsize = __get_data_size(&tu->tp, regs);
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ ucb = uprobe_buffer_get();
+ store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ uretprobe_trace_func(tu, func, regs, ucb, dsize);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ uretprobe_perf_func(tu, func, regs, ucb, dsize);
+#endif
+ uprobe_buffer_put(ucb);
+ return 0;
+}
+
+static struct trace_event_functions uprobe_funcs = {
+ .trace = print_uprobe_event
+};
+
+static int register_uprobe_event(struct trace_uprobe *tu)
+{
+ struct ftrace_event_call *call = &tu->tp.call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &uprobe_funcs;
+ call->class->define_fields = uprobe_event_define_fields;
+
+ if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
+ return -ENOMEM;
+
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
+ kfree(call->print_fmt);
+ return -ENODEV;
+ }
+ call->flags = 0;
+ call->class->reg = trace_uprobe_register;
+ call->data = tu;
+ ret = trace_add_event_call(call);
+
+ if (ret) {
+ pr_info("Failed to register uprobe event: %s\n",
+ ftrace_event_name(call));
+ kfree(call->print_fmt);
+ unregister_ftrace_event(&call->event);
+ }
+
+ return ret;
+}
+
+static int unregister_uprobe_event(struct trace_uprobe *tu)
+{
+ int ret;
+
+ /* tu->event is unregistered in trace_remove_event_call() */
+ ret = trace_remove_event_call(&tu->tp.call);
+ if (ret)
+ return ret;
+ kfree(tu->tp.call.print_fmt);
+ tu->tp.call.print_fmt = NULL;
+ return 0;
+}
+
+/* Make a trace interface for controling probe points */
+static __init int init_uprobe_trace(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("uprobe_events", 0644, d_tracer,
+ NULL, &uprobe_events_ops);
+ /* Profile interface */
+ trace_create_file("uprobe_profile", 0444, d_tracer,
+ NULL, &uprobe_profile_ops);
+ return 0;
+}
+
+fs_initcall(init_uprobe_trace);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a472..00000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Workqueue statistical tracer.
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-
-#include <trace/events/workqueue.h>
-#include <linux/list.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/kref.h>
-#include "trace_stat.h"
-#include "trace.h"
-
-
-/* A cpu workqueue thread */
-struct cpu_workqueue_stats {
- struct list_head list;
- struct kref kref;
- int cpu;
- pid_t pid;
-/* Can be inserted from interrupt or user context, need to be atomic */
- atomic_t inserted;
-/*
- * Don't need to be atomic, works are serialized in a single workqueue thread
- * on a single CPU.
- */
- unsigned int executed;
-};
-
-/* List of workqueue threads on one cpu */
-struct workqueue_global_stats {
- struct list_head list;
- spinlock_t lock;
-};
-
-/* Don't need a global lock because allocated before the workqueues, and
- * never freed.
- */
-static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
-#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
-
-static void cpu_workqueue_stat_free(struct kref *kref)
-{
- kfree(container_of(kref, struct cpu_workqueue_stats, kref));
-}
-
-/* Insertion of a work */
-static void
-probe_workqueue_insertion(void *ignore,
- struct task_struct *wq_thread,
- struct work_struct *work)
-{
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
- if (node->pid == wq_thread->pid) {
- atomic_inc(&node->inserted);
- goto found;
- }
- }
- pr_debug("trace_workqueue: entry not found\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Execution of a work */
-static void
-probe_workqueue_execution(void *ignore,
- struct task_struct *wq_thread,
- struct work_struct *work)
-{
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
- if (node->pid == wq_thread->pid) {
- node->executed++;
- goto found;
- }
- }
- pr_debug("trace_workqueue: entry not found\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(void *ignore,
- struct task_struct *wq_thread, int cpu)
-{
- struct cpu_workqueue_stats *cws;
- unsigned long flags;
-
- WARN_ON(cpu < 0);
-
- /* Workqueues are sometimes created in atomic context */
- cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
- if (!cws) {
- pr_warning("trace_workqueue: not enough memory\n");
- return;
- }
- INIT_LIST_HEAD(&cws->list);
- kref_init(&cws->kref);
- cws->cpu = cpu;
- cws->pid = wq_thread->pid;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Destruction of a cpu workqueue thread */
-static void
-probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
-{
- /* Workqueue only execute on one cpu */
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node, *next;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
- list) {
- if (node->pid == wq_thread->pid) {
- list_del(&node->list);
- kref_put(&node->kref, cpu_workqueue_stat_free);
- goto found;
- }
- }
-
- pr_debug("trace_workqueue: don't find workqueue to destroy\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
-}
-
-static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
-{
- unsigned long flags;
- struct cpu_workqueue_stats *ret = NULL;
-
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-
- if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
- ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
- struct cpu_workqueue_stats, list);
- kref_get(&ret->kref);
- }
-
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
- return ret;
-}
-
-static void *workqueue_stat_start(struct tracer_stat *trace)
-{
- int cpu;
- void *ret = NULL;
-
- for_each_possible_cpu(cpu) {
- ret = workqueue_stat_start_cpu(cpu);
- if (ret)
- return ret;
- }
- return NULL;
-}
-
-static void *workqueue_stat_next(void *prev, int idx)
-{
- struct cpu_workqueue_stats *prev_cws = prev;
- struct cpu_workqueue_stats *ret;
- int cpu = prev_cws->cpu;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
- do {
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- return NULL;
- } while (!(ret = workqueue_stat_start_cpu(cpu)));
- return ret;
- } else {
- ret = list_entry(prev_cws->list.next,
- struct cpu_workqueue_stats, list);
- kref_get(&ret->kref);
- }
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
- return ret;
-}
-
-static int workqueue_stat_show(struct seq_file *s, void *p)
-{
- struct cpu_workqueue_stats *cws = p;
- struct pid *pid;
- struct task_struct *tsk;
-
- pid = find_get_pid(cws->pid);
- if (pid) {
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (tsk) {
- seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
- atomic_read(&cws->inserted), cws->executed,
- tsk->comm);
- put_task_struct(tsk);
- }
- put_pid(pid);
- }
-
- return 0;
-}
-
-static void workqueue_stat_release(void *stat)
-{
- struct cpu_workqueue_stats *node = stat;
-
- kref_put(&node->kref, cpu_workqueue_stat_free);
-}
-
-static int workqueue_stat_headers(struct seq_file *s)
-{
- seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
- seq_printf(s, "# | | | |\n");
- return 0;
-}
-
-struct tracer_stat workqueue_stats __read_mostly = {
- .name = "workqueues",
- .stat_start = workqueue_stat_start,
- .stat_next = workqueue_stat_next,
- .stat_show = workqueue_stat_show,
- .stat_release = workqueue_stat_release,
- .stat_headers = workqueue_stat_headers
-};
-
-
-int __init stat_workqueue_init(void)
-{
- if (register_stat_tracer(&workqueue_stats)) {
- pr_warning("Unable to register workqueue stat tracer\n");
- return 1;
- }
-
- return 0;
-}
-fs_initcall(stat_workqueue_init);
-
-/*
- * Workqueues are created very early, just after pre-smp initcalls.
- * So we must register our tracepoints at this stage.
- */
-int __init trace_workqueue_early_init(void)
-{
- int ret, cpu;
-
- for_each_possible_cpu(cpu) {
- spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
- INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
- }
-
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
- if (ret)
- goto out;
-
- ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
- if (ret)
- goto no_insertion;
-
- ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
- if (ret)
- goto no_execution;
-
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
- if (ret)
- goto no_creation;
-
- return 0;
-
-no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
-no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
-no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
-out:
- pr_warning("trace_workqueue: unable to trace workqueues\n");
-
- return 1;
-}
-early_initcall(trace_workqueue_early_init);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f1539decd99..3490407dc7b 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 Mathieu Desnoyers
+ * Copyright (C) 2008-2014 Mathieu Desnoyers
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -25,7 +25,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
extern struct tracepoint * const __start___tracepoints_ptrs[];
extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -33,43 +33,29 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
+#ifdef CONFIG_MODULES
/*
- * Tracepoints mutex protects the builtin and module tracepoints and the hash
- * table, as well as the local module list.
+ * Tracepoint module list mutex protects the local module list.
*/
-static DEFINE_MUTEX(tracepoints_mutex);
+static DEFINE_MUTEX(tracepoint_module_list_mutex);
-#ifdef CONFIG_MODULES
-/* Local list of struct module */
+/* Local list of struct tp_module */
static LIST_HEAD(tracepoint_module_list);
#endif /* CONFIG_MODULES */
/*
- * Tracepoint hash table, containing the active tracepoints.
- * Protected by tracepoints_mutex.
+ * tracepoints_mutex protects the builtin and module tracepoints.
+ * tracepoints_mutex nests inside tracepoint_module_list_mutex.
*/
-#define TRACEPOINT_HASH_BITS 6
-#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
-static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+static DEFINE_MUTEX(tracepoints_mutex);
/*
* Note about RCU :
* It is used to delay the free of multiple probes array until a quiescent
* state is reached.
- * Tracepoint entries modifications are protected by the tracepoints_mutex.
*/
-struct tracepoint_entry {
- struct hlist_node hlist;
- struct tracepoint_func *funcs;
- int refcount; /* Number of times armed. 0 if disarmed. */
- char name[0];
-};
-
struct tp_probes {
- union {
- struct rcu_head rcu;
- struct list_head list;
- } u;
+ struct rcu_head rcu;
struct tracepoint_func probes[0];
};
@@ -82,7 +68,7 @@ static inline void *allocate_probes(int count)
static void rcu_free_old_probes(struct rcu_head *head)
{
- kfree(container_of(head, struct tp_probes, u.rcu));
+ kfree(container_of(head, struct tp_probes, rcu));
}
static inline void release_probes(struct tracepoint_func *old)
@@ -90,37 +76,37 @@ static inline void release_probes(struct tracepoint_func *old)
if (old) {
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
- call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
+ call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
}
}
-static void debug_print_probes(struct tracepoint_entry *entry)
+static void debug_print_probes(struct tracepoint_func *funcs)
{
int i;
- if (!tracepoint_debug || !entry->funcs)
+ if (!tracepoint_debug || !funcs)
return;
- for (i = 0; entry->funcs[i].func; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
+ for (i = 0; funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
}
-static struct tracepoint_func *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry,
- void *probe, void *data)
+static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
{
int nr_probes = 0;
struct tracepoint_func *old, *new;
- WARN_ON(!probe);
+ if (WARN_ON(!tp_func->func))
+ return ERR_PTR(-EINVAL);
- debug_print_probes(entry);
- old = entry->funcs;
+ debug_print_probes(*funcs);
+ old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
for (nr_probes = 0; old[nr_probes].func; nr_probes++)
- if (old[nr_probes].func == probe &&
- old[nr_probes].data == data)
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
@@ -129,41 +115,42 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
return ERR_PTR(-ENOMEM);
if (old)
memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- new[nr_probes].func = probe;
- new[nr_probes].data = data;
+ new[nr_probes] = *tp_func;
new[nr_probes + 1].func = NULL;
- entry->refcount = nr_probes + 1;
- entry->funcs = new;
- debug_print_probes(entry);
+ *funcs = new;
+ debug_print_probes(*funcs);
return old;
}
-static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
- void *probe, void *data)
+static void *func_remove(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
{
int nr_probes = 0, nr_del = 0, i;
struct tracepoint_func *old, *new;
- old = entry->funcs;
+ old = *funcs;
if (!old)
return ERR_PTR(-ENOENT);
- debug_print_probes(entry);
+ debug_print_probes(*funcs);
/* (N -> M), (N > 1, M >= 0) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (!probe ||
- (old[nr_probes].func == probe &&
- old[nr_probes].data == data))
- nr_del++;
+ if (tp_func->func) {
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
+ nr_del++;
+ }
}
+ /*
+ * If probe is NULL, then nr_probes = nr_del = 0, and then the
+ * entire entry will be removed.
+ */
if (nr_probes - nr_del == 0) {
/* N -> 0, (N > 1) */
- entry->funcs = NULL;
- entry->refcount = 0;
- debug_print_probes(entry);
+ *funcs = NULL;
+ debug_print_probes(*funcs);
return old;
} else {
int j = 0;
@@ -173,93 +160,34 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i].func; i++)
- if (probe &&
- (old[i].func != probe || old[i].data != data))
+ if (old[i].func != tp_func->func
+ || old[i].data != tp_func->data)
new[j++] = old[i];
new[nr_probes - nr_del].func = NULL;
- entry->refcount = nr_probes - nr_del;
- entry->funcs = new;
+ *funcs = new;
}
- debug_print_probes(entry);
+ debug_print_probes(*funcs);
return old;
}
/*
- * Get tracepoint if the tracepoint is present in the tracepoint hash table.
- * Must be called with tracepoints_mutex held.
- * Returns NULL if not present.
- */
-static struct tracepoint_entry *get_tracepoint(const char *name)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct tracepoint_entry *e;
- u32 hash = jhash(name, strlen(name), 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name))
- return e;
- }
- return NULL;
-}
-
-/*
- * Add the tracepoint to the tracepoint hash table. Must be called with
- * tracepoints_mutex held.
- */
-static struct tracepoint_entry *add_tracepoint(const char *name)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct tracepoint_entry *e;
- size_t name_len = strlen(name) + 1;
- u32 hash = jhash(name, name_len-1, 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name)) {
- printk(KERN_NOTICE
- "tracepoint %s busy\n", name);
- return ERR_PTR(-EEXIST); /* Already there */
- }
- }
- /*
- * Using kmalloc here to allocate a variable length element. Could
- * cause some memory fragmentation if overused.
- */
- e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
- if (!e)
- return ERR_PTR(-ENOMEM);
- memcpy(&e->name[0], name, name_len);
- e->funcs = NULL;
- e->refcount = 0;
- hlist_add_head(&e->hlist, head);
- return e;
-}
-
-/*
- * Remove the tracepoint from the tracepoint hash table. Must be called with
- * mutex_lock held.
+ * Add the probe function to a tracepoint.
*/
-static inline void remove_tracepoint(struct tracepoint_entry *e)
+static int tracepoint_add_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
{
- hlist_del(&e->hlist);
- kfree(e);
-}
+ struct tracepoint_func *old, *tp_funcs;
-/*
- * Sets the probe callback corresponding to one tracepoint.
- */
-static void set_tracepoint(struct tracepoint_entry **entry,
- struct tracepoint *elem, int active)
-{
- WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+ if (tp->regfunc && !static_key_enabled(&tp->key))
+ tp->regfunc();
- if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
- elem->regfunc();
- else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
- elem->unregfunc();
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_add(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
+ }
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -268,421 +196,218 @@ static void set_tracepoint(struct tracepoint_entry **entry,
* include/linux/tracepoints.h. A matching smp_read_barrier_depends()
* is used.
*/
- rcu_assign_pointer(elem->funcs, (*entry)->funcs);
- if (active && !jump_label_enabled(&elem->key))
- jump_label_inc(&elem->key);
- else if (!active && jump_label_enabled(&elem->key))
- jump_label_dec(&elem->key);
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ if (!static_key_enabled(&tp->key))
+ static_key_slow_inc(&tp->key);
+ release_probes(old);
+ return 0;
}
/*
- * Disable a tracepoint and its probe callback.
+ * Remove a probe function from a tracepoint.
* Note: only waiting an RCU period after setting elem->call to the empty
* function insures that the original callback is not used anymore. This insured
* by preempt_disable around the call site.
*/
-static void disable_tracepoint(struct tracepoint *elem)
+static int tracepoint_remove_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
{
- if (elem->unregfunc && jump_label_enabled(&elem->key))
- elem->unregfunc();
-
- if (jump_label_enabled(&elem->key))
- jump_label_dec(&elem->key);
- rcu_assign_pointer(elem->funcs, NULL);
-}
+ struct tracepoint_func *old, *tp_funcs;
-/**
- * tracepoint_update_probe_range - Update a probe range
- * @begin: beginning of the range
- * @end: end of the range
- *
- * Updates the probe callback corresponding to a range of tracepoints.
- * Called with tracepoints_mutex held.
- */
-static void tracepoint_update_probe_range(struct tracepoint * const *begin,
- struct tracepoint * const *end)
-{
- struct tracepoint * const *iter;
- struct tracepoint_entry *mark_entry;
-
- if (!begin)
- return;
-
- for (iter = begin; iter < end; iter++) {
- mark_entry = get_tracepoint((*iter)->name);
- if (mark_entry) {
- set_tracepoint(&mark_entry, *iter,
- !!mark_entry->refcount);
- } else {
- disable_tracepoint(*iter);
- }
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_remove(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
}
-}
-
-#ifdef CONFIG_MODULES
-void module_update_tracepoints(void)
-{
- struct tp_module *tp_mod;
-
- list_for_each_entry(tp_mod, &tracepoint_module_list, list)
- tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
- tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
-}
-#else /* CONFIG_MODULES */
-void module_update_tracepoints(void)
-{
-}
-#endif /* CONFIG_MODULES */
+ if (!tp_funcs) {
+ /* Removed last function */
+ if (tp->unregfunc && static_key_enabled(&tp->key))
+ tp->unregfunc();
-/*
- * Update probes, removing the faulty probes.
- * Called with tracepoints_mutex held.
- */
-static void tracepoint_update_probes(void)
-{
- /* Core kernel tracepoints */
- tracepoint_update_probe_range(__start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- /* tracepoints in modules. */
- module_update_tracepoints();
-}
-
-static struct tracepoint_func *
-tracepoint_add_probe(const char *name, void *probe, void *data)
-{
- struct tracepoint_entry *entry;
- struct tracepoint_func *old;
-
- entry = get_tracepoint(name);
- if (!entry) {
- entry = add_tracepoint(name);
- if (IS_ERR(entry))
- return (struct tracepoint_func *)entry;
+ if (static_key_enabled(&tp->key))
+ static_key_slow_dec(&tp->key);
}
- old = tracepoint_entry_add_probe(entry, probe, data);
- if (IS_ERR(old) && !entry->refcount)
- remove_tracepoint(entry);
- return old;
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ release_probes(old);
+ return 0;
}
/**
* tracepoint_probe_register - Connect a probe to a tracepoint
- * @name: tracepoint name
+ * @tp: tracepoint
* @probe: probe handler
+ * @data: tracepoint data
*
* Returns 0 if ok, error value on error.
- * The probe address must at least be aligned on the architecture pointer size.
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
*/
-int tracepoint_probe_register(const char *name, void *probe, void *data)
+int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
- struct tracepoint_func *old;
+ struct tracepoint_func tp_func;
+ int ret;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_update_probes(); /* may update entry */
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_add_func(tp, &tp_func);
mutex_unlock(&tracepoints_mutex);
- release_probes(old);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
-static struct tracepoint_func *
-tracepoint_remove_probe(const char *name, void *probe, void *data)
-{
- struct tracepoint_entry *entry;
- struct tracepoint_func *old;
-
- entry = get_tracepoint(name);
- if (!entry)
- return ERR_PTR(-ENOENT);
- old = tracepoint_entry_remove_probe(entry, probe, data);
- if (IS_ERR(old))
- return old;
- if (!entry->refcount)
- remove_tracepoint(entry);
- return old;
-}
-
/**
* tracepoint_probe_unregister - Disconnect a probe from a tracepoint
- * @name: tracepoint name
+ * @tp: tracepoint
* @probe: probe function pointer
+ * @data: tracepoint data
*
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
+ * Returns 0 if ok, error value on error.
*/
-int tracepoint_probe_unregister(const char *name, void *probe, void *data)
+int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
{
- struct tracepoint_func *old;
+ struct tracepoint_func tp_func;
+ int ret;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_update_probes(); /* may update entry */
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_remove_func(tp, &tp_func);
mutex_unlock(&tracepoints_mutex);
- release_probes(old);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
-static LIST_HEAD(old_probes);
-static int need_update;
-
-static void tracepoint_add_old_probes(void *old)
+#ifdef CONFIG_MODULES
+bool trace_module_has_bad_taint(struct module *mod)
{
- need_update = 1;
- if (old) {
- struct tp_probes *tp_probes = container_of(old,
- struct tp_probes, probes[0]);
- list_add(&tp_probes->u.list, &old_probes);
- }
+ return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
+ (1 << TAINT_UNSIGNED_MODULE));
}
-/**
- * tracepoint_probe_register_noupdate - register a probe but not connect
- * @name: tracepoint name
- * @probe: probe handler
- *
- * caller must call tracepoint_probe_update_all()
- */
-int tracepoint_probe_register_noupdate(const char *name, void *probe,
- void *data)
-{
- struct tracepoint_func *old;
-
- mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_add_old_probes(old);
- mutex_unlock(&tracepoints_mutex);
- return 0;
-}
-EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
/**
- * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
- * @name: tracepoint name
- * @probe: probe function pointer
+ * register_tracepoint_notifier - register tracepoint coming/going notifier
+ * @nb: notifier block
*
- * caller must call tracepoint_probe_update_all()
+ * Notifiers registered with this function are called on module
+ * coming/going with the tracepoint_module_list_mutex held.
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
*/
-int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
- void *data)
+int register_tracepoint_module_notifier(struct notifier_block *nb)
{
- struct tracepoint_func *old;
-
- mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_add_old_probes(old);
- mutex_unlock(&tracepoints_mutex);
- return 0;
-}
-EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
-
-/**
- * tracepoint_probe_update_all - update tracepoints
- */
-void tracepoint_probe_update_all(void)
-{
- LIST_HEAD(release_probes);
- struct tp_probes *pos, *next;
+ struct tp_module *tp_mod;
+ int ret;
- mutex_lock(&tracepoints_mutex);
- if (!need_update) {
- mutex_unlock(&tracepoints_mutex);
- return;
- }
- if (!list_empty(&old_probes))
- list_replace_init(&old_probes, &release_probes);
- need_update = 0;
- tracepoint_update_probes();
- mutex_unlock(&tracepoints_mutex);
- list_for_each_entry_safe(pos, next, &release_probes, u.list) {
- list_del(&pos->u.list);
- call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
- }
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
}
-EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
+EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
/**
- * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
- * @tracepoint: current tracepoints (in), next tracepoint (out)
- * @begin: beginning of the range
- * @end: end of the range
+ * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
+ * @nb: notifier block
*
- * Returns whether a next tracepoint has been found (1) or not (0).
- * Will return the first tracepoint in the range if the input tracepoint is
- * NULL.
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
*/
-static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
- struct tracepoint * const *begin, struct tracepoint * const *end)
+int unregister_tracepoint_module_notifier(struct notifier_block *nb)
{
- if (!*tracepoint && begin != end) {
- *tracepoint = begin;
- return 1;
- }
- if (*tracepoint >= begin && *tracepoint < end)
- return 1;
- return 0;
-}
+ struct tp_module *tp_mod;
+ int ret;
-#ifdef CONFIG_MODULES
-static void tracepoint_get_iter(struct tracepoint_iter *iter)
-{
- int found = 0;
- struct tp_module *iter_mod;
-
- /* Core kernel tracepoints */
- if (!iter->module) {
- found = tracepoint_get_iter_range(&iter->tracepoint,
- __start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- if (found)
- goto end;
- }
- /* Tracepoints in modules */
- mutex_lock(&tracepoints_mutex);
- list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
- /*
- * Sorted module list
- */
- if (iter_mod < iter->module)
- continue;
- else if (iter_mod > iter->module)
- iter->tracepoint = NULL;
- found = tracepoint_get_iter_range(&iter->tracepoint,
- iter_mod->tracepoints_ptrs,
- iter_mod->tracepoints_ptrs
- + iter_mod->num_tracepoints);
- if (found) {
- iter->module = iter_mod;
- break;
- }
- }
- mutex_unlock(&tracepoints_mutex);
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod);
end:
- if (!found)
- tracepoint_iter_reset(iter);
-}
-#else /* CONFIG_MODULES */
-static void tracepoint_get_iter(struct tracepoint_iter *iter)
-{
- int found = 0;
-
- /* Core kernel tracepoints */
- found = tracepoint_get_iter_range(&iter->tracepoint,
- __start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- if (!found)
- tracepoint_iter_reset(iter);
-}
-#endif /* CONFIG_MODULES */
-
-void tracepoint_iter_start(struct tracepoint_iter *iter)
-{
- tracepoint_get_iter(iter);
-}
-EXPORT_SYMBOL_GPL(tracepoint_iter_start);
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
-void tracepoint_iter_next(struct tracepoint_iter *iter)
-{
- iter->tracepoint++;
- /*
- * iter->tracepoint may be invalid because we blindly incremented it.
- * Make sure it is valid by marshalling on the tracepoints, getting the
- * tracepoints from following modules if necessary.
- */
- tracepoint_get_iter(iter);
}
-EXPORT_SYMBOL_GPL(tracepoint_iter_next);
+EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
-void tracepoint_iter_stop(struct tracepoint_iter *iter)
+/*
+ * Ensure the tracer unregistered the module's probes before the module
+ * teardown is performed. Prevents leaks of probe and data pointers.
+ */
+static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
{
-}
-EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
+ struct tracepoint * const *iter;
-void tracepoint_iter_reset(struct tracepoint_iter *iter)
-{
-#ifdef CONFIG_MODULES
- iter->module = NULL;
-#endif /* CONFIG_MODULES */
- iter->tracepoint = NULL;
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ WARN_ON_ONCE((*iter)->funcs);
}
-EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
-#ifdef CONFIG_MODULES
static int tracepoint_module_coming(struct module *mod)
{
- struct tp_module *tp_mod, *iter;
+ struct tp_module *tp_mod;
int ret = 0;
+ if (!mod->num_tracepoints)
+ return 0;
+
/*
* We skip modules that taint the kernel, especially those with different
* module headers (for forced load), to make sure we don't cause a crash.
- * Staging and out-of-tree GPL modules are fine.
+ * Staging, out-of-tree, and unsigned GPL modules are fine.
*/
- if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
+ if (trace_module_has_bad_taint(mod))
return 0;
- mutex_lock(&tracepoints_mutex);
+ mutex_lock(&tracepoint_module_list_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
if (!tp_mod) {
ret = -ENOMEM;
goto end;
}
- tp_mod->num_tracepoints = mod->num_tracepoints;
- tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
-
- /*
- * tracepoint_module_list is kept sorted by struct module pointer
- * address for iteration on tracepoints from a seq_file that can release
- * the mutex between calls.
- */
- list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
- BUG_ON(iter == tp_mod); /* Should never be in the list twice */
- if (iter < tp_mod) {
- /* We belong to the location right after iter. */
- list_add(&tp_mod->list, &iter->list);
- goto module_added;
- }
- }
- /* We belong to the beginning of the list */
- list_add(&tp_mod->list, &tracepoint_module_list);
-module_added:
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
+ tp_mod->mod = mod;
+ list_add_tail(&tp_mod->list, &tracepoint_module_list);
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_COMING, tp_mod);
end:
- mutex_unlock(&tracepoints_mutex);
+ mutex_unlock(&tracepoint_module_list_mutex);
return ret;
}
-static int tracepoint_module_going(struct module *mod)
+static void tracepoint_module_going(struct module *mod)
{
- struct tp_module *pos;
+ struct tp_module *tp_mod;
- mutex_lock(&tracepoints_mutex);
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
- list_for_each_entry(pos, &tracepoint_module_list, list) {
- if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
- list_del(&pos->list);
- kfree(pos);
+ if (!mod->num_tracepoints)
+ return;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list) {
+ if (tp_mod->mod == mod) {
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_GOING, tp_mod);
+ list_del(&tp_mod->list);
+ kfree(tp_mod);
+ /*
+ * Called the going notifier before checking for
+ * quiescence.
+ */
+ tp_module_going_check_quiescent(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
break;
}
}
@@ -692,12 +417,11 @@ static int tracepoint_module_going(struct module *mod)
* flag on "going", in case a module taints the kernel only after being
* loaded.
*/
- mutex_unlock(&tracepoints_mutex);
- return 0;
+ mutex_unlock(&tracepoint_module_list_mutex);
}
-int tracepoint_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int tracepoint_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
int ret = 0;
@@ -709,24 +433,58 @@ int tracepoint_module_notify(struct notifier_block *self,
case MODULE_STATE_LIVE:
break;
case MODULE_STATE_GOING:
- ret = tracepoint_module_going(mod);
+ tracepoint_module_going(mod);
+ break;
+ case MODULE_STATE_UNFORMED:
break;
}
return ret;
}
-struct notifier_block tracepoint_module_nb = {
+static struct notifier_block tracepoint_module_nb = {
.notifier_call = tracepoint_module_notify,
.priority = 0,
};
-static int init_tracepoints(void)
+static __init int init_tracepoints(void)
{
- return register_module_notifier(&tracepoint_module_nb);
+ int ret;
+
+ ret = register_module_notifier(&tracepoint_module_nb);
+ if (ret)
+ pr_warning("Failed to register tracepoint module enter notifier\n");
+
+ return ret;
}
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end,
+ void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ struct tracepoint * const *iter;
+
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ fct(*iter, priv);
+}
+
+/**
+ * for_each_kernel_tracepoint - iteration on all kernel tracepoints
+ * @fct: callback
+ * @priv: private data
+ */
+void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ for_each_tracepoint_range(__start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs, fct, priv);
+}
+EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
+
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
@@ -734,33 +492,29 @@ static int sys_tracepoint_refcount;
void syscall_regfunc(void)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct task_struct *p, *t;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
- /* Skip kernel threads. */
- if (t->mm)
- set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ }
+ read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
}
void syscall_unregfunc(void)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct task_struct *p, *t;
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ read_unlock(&tasklist_lock);
}
}
#endif
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 23b4d784ebd..a1dd9a1b132 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,10 +26,13 @@
/*
* fill in basic accounting fields
*/
-void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+void bacct_add_tsk(struct user_namespace *user_ns,
+ struct pid_namespace *pid_ns,
+ struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
struct timespec uptime, ts;
+ cputime_t utime, stime, utimescaled, stimescaled;
u64 ac_etime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -55,18 +58,23 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
stats->ac_flag |= AXSIG;
stats->ac_nice = task_nice(tsk);
stats->ac_sched = tsk->policy;
- stats->ac_pid = tsk->pid;
+ stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
- stats->ac_uid = tcred->uid;
- stats->ac_gid = tcred->gid;
+ stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
+ stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
stats->ac_ppid = pid_alive(tsk) ?
- rcu_dereference(tsk->real_parent)->tgid : 0;
+ task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
- stats->ac_utime = cputime_to_usecs(tsk->utime);
- stats->ac_stime = cputime_to_usecs(tsk->stime);
- stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
- stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime = cputime_to_usecs(utime);
+ stats->ac_stime = cputime_to_usecs(stime);
+
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ stats->ac_utimescaled = cputime_to_usecs(utimescaled);
+ stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
@@ -113,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
#undef KB
#undef MB
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
+static void __acct_update_integrals(struct task_struct *tsk,
+ cputime_t utime, cputime_t stime)
{
if (likely(tsk->mm)) {
cputime_t time, dtime;
@@ -126,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
u64 delta;
local_irq_save(flags);
- time = tsk->stime + tsk->utime;
+ time = stime + utime;
dtime = time - tsk->acct_timexpd;
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
@@ -143,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
}
/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
+ __acct_update_integrals(tsk, utime, stime);
+}
+
+/**
+ * acct_account_cputime - update mm integral after cputime update
+ * @tsk: task_struct for accounting
+ */
+void acct_account_cputime(struct task_struct *tsk)
+{
+ __acct_update_integrals(tsk, tsk->utime, tsk->stime);
+}
+
+/**
* acct_clear_integrals - clear the mm integral fields in task_struct
* @tsk: task_struct whose accounting fields are cleared
*/
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 51c6e89e861..602e5bbbcef 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,127 +18,107 @@
SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
- long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, filename, user, group);
- return ret;
+ return sys_chown(filename, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
- long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, filename, user, group);
- return ret;
+ return sys_lchown(filename, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
{
- long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, fd, user, group);
- return ret;
+ return sys_fchown(fd, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
{
- long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(2, ret, rgid, egid);
- return ret;
+ return sys_setregid(low2highgid(rgid), low2highgid(egid));
}
SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
{
- long ret = sys_setgid(low2highgid(gid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, gid);
- return ret;
+ return sys_setgid(low2highgid(gid));
}
SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
{
- long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(2, ret, ruid, euid);
- return ret;
+ return sys_setreuid(low2highuid(ruid), low2highuid(euid));
}
SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
{
- long ret = sys_setuid(low2highuid(uid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, uid);
- return ret;
+ return sys_setuid(low2highuid(uid));
}
SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
{
- long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
+ return sys_setresuid(low2highuid(ruid), low2highuid(euid),
low2highuid(suid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, ruid, euid, suid);
- return ret;
}
-SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
+SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
{
const struct cred *cred = current_cred();
int retval;
+ old_uid_t ruid, euid, suid;
- if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
- !(retval = put_user(high2lowuid(cred->euid), euid)))
- retval = put_user(high2lowuid(cred->suid), suid);
+ ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
+ euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
+ suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
+
+ if (!(retval = put_user(ruid, ruidp)) &&
+ !(retval = put_user(euid, euidp)))
+ retval = put_user(suid, suidp);
return retval;
}
SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
{
- long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
+ return sys_setresgid(low2highgid(rgid), low2highgid(egid),
low2highgid(sgid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, rgid, egid, sgid);
- return ret;
}
-SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
+SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
{
const struct cred *cred = current_cred();
int retval;
+ old_gid_t rgid, egid, sgid;
+
+ rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
+ egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
+ sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
- if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
- !(retval = put_user(high2lowgid(cred->egid), egid)))
- retval = put_user(high2lowgid(cred->sgid), sgid);
+ if (!(retval = put_user(rgid, rgidp)) &&
+ !(retval = put_user(egid, egidp)))
+ retval = put_user(sgid, sgidp);
return retval;
}
SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
{
- long ret = sys_setfsuid(low2highuid(uid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, uid);
- return ret;
+ return sys_setfsuid(low2highuid(uid));
}
SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
{
- long ret = sys_setfsgid(low2highgid(gid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, gid);
- return ret;
+ return sys_setfsgid(low2highgid(gid));
}
static int groups16_to_user(old_gid_t __user *grouplist,
struct group_info *group_info)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
old_gid_t group;
+ kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
- group = high2lowgid(GROUP_AT(group_info, i));
+ kgid = GROUP_AT(group_info, i);
+ group = high2lowgid(from_kgid_munged(user_ns, kgid));
if (put_user(group, grouplist+i))
return -EFAULT;
}
@@ -149,13 +129,20 @@ static int groups16_to_user(old_gid_t __user *grouplist,
static int groups16_from_user(struct group_info *group_info,
old_gid_t __user *grouplist)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
old_gid_t group;
+ kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
if (get_user(group, grouplist+i))
return -EFAULT;
- GROUP_AT(group_info, i) = low2highgid(group);
+
+ kgid = make_kgid(user_ns, low2highgid(group));
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ GROUP_AT(group_info, i) = kgid;
}
return 0;
@@ -189,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!nsown_capable(CAP_SETGID))
+ if (!ns_capable(current_user_ns(), CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
@@ -211,20 +198,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
SYSCALL_DEFINE0(getuid16)
{
- return high2lowuid(current_uid());
+ return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
}
SYSCALL_DEFINE0(geteuid16)
{
- return high2lowuid(current_euid());
+ return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
}
SYSCALL_DEFINE0(getgid16)
{
- return high2lowgid(current_gid());
+ return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
}
SYSCALL_DEFINE0(getegid16)
{
- return high2lowgid(current_egid());
+ return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
}
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf..1760bf3d146 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -10,12 +10,75 @@
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
{
+ unsigned long flags;
+
WARN_ON(cpu != 0);
- local_irq_disable();
- (func)(info);
- local_irq_enable();
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
return 0;
}
EXPORT_SYMBOL(smp_call_function_single);
+
+int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ csd->func(csd->info);
+ local_irq_restore(flags);
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single_async);
+
+int on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ return 0;
+}
+EXPORT_SYMBOL(on_each_cpu);
+
+/*
+ * Note we still need to test the mask even for UP
+ * because we actually can get an empty mask from
+ * code that on SMP might call us without the local
+ * CPU in the mask.
+ */
+void on_each_cpu_mask(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ unsigned long flags;
+
+ if (cpumask_test_cpu(0, mask)) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * Preemption is disabled here to make sure the cond_func is called under the
+ * same condtions in UP and SMP.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+ smp_call_func_t func, void *info, bool wait,
+ gfp_t gfp_flags)
+{
+ unsigned long flags;
+
+ preempt_disable();
+ if (cond_func(0, info)) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1f..394f70b1716 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
void fire_user_return_notifiers(void)
{
struct user_return_notifier *urn;
- struct hlist_node *tmp1, *tmp2;
+ struct hlist_node *tmp2;
struct hlist_head *head;
head = &get_cpu_var(return_notifier_list);
- hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+ hlist_for_each_entry_safe(urn, tmp2, head, link)
urn->on_user_return(urn);
put_cpu_var(return_notifier_list);
}
diff --git a/kernel/user.c b/kernel/user.c
index 71dd2363ab0..4efa39350e4 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,16 +16,45 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
/*
* userns count is 1 for root user, 1 for init_uts_ns,
* and 1 for... ?
*/
struct user_namespace init_user_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(3),
+ .uid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
- .creator = &root_user,
+ .gid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
+ .projid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
+ .count = ATOMIC_INIT(3),
+ .owner = GLOBAL_ROOT_UID,
+ .group = GLOBAL_ROOT_GID,
+ .proc_inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ .persistent_keyring_register_sem =
+ __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
+#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -34,11 +63,14 @@ EXPORT_SYMBOL_GPL(init_user_ns);
* when changing user ID's (ie setuid() and friends).
*/
+#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
static struct kmem_cache *uid_cachep;
+struct hlist_head uidhash_table[UIDHASH_SZ];
/*
* The uidhash_lock is mostly taken from process context, but it is
@@ -51,14 +83,13 @@ static struct kmem_cache *uid_cachep;
*/
static DEFINE_SPINLOCK(uidhash_lock);
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
+/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
- .__count = ATOMIC_INIT(2),
+ .__count = ATOMIC_INIT(1),
.processes = ATOMIC_INIT(1),
- .files = ATOMIC_INIT(0),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
- .user_ns = &init_user_ns,
+ .uid = GLOBAL_ROOT_UID,
};
/*
@@ -72,16 +103,14 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
static void uid_hash_remove(struct user_struct *up)
{
hlist_del_init(&up->uidhash_node);
- put_user_ns(up->user_ns);
}
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
{
struct user_struct *user;
- struct hlist_node *h;
- hlist_for_each_entry(user, h, hashent, uidhash_node) {
- if (user->uid == uid) {
+ hlist_for_each_entry(user, hashent, uidhash_node) {
+ if (uid_eq(user->uid, uid)) {
atomic_inc(&user->__count);
return user;
}
@@ -110,14 +139,13 @@ static void free_user(struct user_struct *up, unsigned long flags)
*
* If the user_struct could not be found, return NULL.
*/
-struct user_struct *find_user(uid_t uid)
+struct user_struct *find_user(kuid_t uid)
{
struct user_struct *ret;
unsigned long flags;
- struct user_namespace *ns = current_user_ns();
spin_lock_irqsave(&uidhash_lock, flags);
- ret = uid_hash_find(uid, uidhashentry(ns, uid));
+ ret = uid_hash_find(uid, uidhashentry(uid));
spin_unlock_irqrestore(&uidhash_lock, flags);
return ret;
}
@@ -136,9 +164,9 @@ void free_uid(struct user_struct *up)
local_irq_restore(flags);
}
-struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
+struct user_struct *alloc_uid(kuid_t uid)
{
- struct hlist_head *hashent = uidhashentry(ns, uid);
+ struct hlist_head *hashent = uidhashentry(uid);
struct user_struct *up, *new;
spin_lock_irq(&uidhash_lock);
@@ -153,8 +181,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
new->uid = uid;
atomic_set(&new->__count, 1);
- new->user_ns = get_user_ns(ns);
-
/*
* Before adding this, check whether we raced
* on adding the same user already..
@@ -162,7 +188,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- put_user_ns(ns);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
@@ -187,14 +212,13 @@ static int __init uid_cache_init(void)
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
for(n = 0; n < UIDHASH_SZ; ++n)
- INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
+ INIT_HLIST_HEAD(uidhash_table + n);
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
- uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
+ uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);
return 0;
}
-
-module_init(uid_cache_init);
+subsys_initcall(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 3b906e98b1d..fcc02560fd6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,11 +9,44 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
+#include <linux/securebits.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/projid.h>
+#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
+ struct uid_gid_map *map);
+
+static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
+{
+ /* Start with the same capabilities as init but useless for doing
+ * anything as the capabilities are bound to the new user namespace.
+ */
+ cred->securebits = SECUREBITS_DEFAULT;
+ cred->cap_inheritable = CAP_EMPTY_SET;
+ cred->cap_permitted = CAP_FULL_SET;
+ cred->cap_effective = CAP_FULL_SET;
+ cred->cap_bset = CAP_FULL_SET;
+#ifdef CONFIG_KEYS
+ key_put(cred->request_key_auth);
+ cred->request_key_auth = NULL;
+#endif
+ /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
+ cred->user_ns = user_ns;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -24,114 +57,855 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
*/
int create_user_ns(struct cred *new)
{
- struct user_namespace *ns;
- struct user_struct *root_user;
- int n;
+ struct user_namespace *ns, *parent_ns = new->user_ns;
+ kuid_t owner = new->euid;
+ kgid_t group = new->egid;
+ int ret;
- ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
+ if (parent_ns->level > 32)
+ return -EUSERS;
+
+ /*
+ * Verify that we can not violate the policy of which files
+ * may be accessed that is specified by the root directory,
+ * by verifing that the root directory is at the root of the
+ * mount namespace which allows all files to be accessed.
+ */
+ if (current_chrooted())
+ return -EPERM;
+
+ /* The creator needs a mapping in the parent user namespace
+ * or else we won't be able to reasonably tell userspace who
+ * created a user_namespace.
+ */
+ if (!kuid_has_mapping(parent_ns, owner) ||
+ !kgid_has_mapping(parent_ns, group))
+ return -EPERM;
+
+ ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
return -ENOMEM;
- kref_init(&ns->kref);
+ ret = proc_alloc_inum(&ns->proc_inum);
+ if (ret) {
+ kmem_cache_free(user_ns_cachep, ns);
+ return ret;
+ }
- for (n = 0; n < UIDHASH_SZ; ++n)
- INIT_HLIST_HEAD(ns->uidhash_table + n);
+ atomic_set(&ns->count, 1);
+ /* Leave the new->user_ns reference with the new user namespace. */
+ ns->parent = parent_ns;
+ ns->level = parent_ns->level + 1;
+ ns->owner = owner;
+ ns->group = group;
- /* Alloc new root user. */
- root_user = alloc_uid(ns, 0);
- if (!root_user) {
- kmem_cache_free(user_ns_cachep, ns);
- return -ENOMEM;
+ set_cred_user_ns(new, ns);
+
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ init_rwsem(&ns->persistent_keyring_register_sem);
+#endif
+ return 0;
+}
+
+int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
+{
+ struct cred *cred;
+ int err = -ENOMEM;
+
+ if (!(unshare_flags & CLONE_NEWUSER))
+ return 0;
+
+ cred = prepare_creds();
+ if (cred) {
+ err = create_user_ns(cred);
+ if (err)
+ put_cred(cred);
+ else
+ *new_cred = cred;
}
- /* set the new root user in the credentials under preparation */
- ns->creator = new->user;
- new->user = root_user;
- new->uid = new->euid = new->suid = new->fsuid = 0;
- new->gid = new->egid = new->sgid = new->fsgid = 0;
- put_group_info(new->group_info);
- new->group_info = get_group_info(&init_groups);
-#ifdef CONFIG_KEYS
- key_put(new->request_key_auth);
- new->request_key_auth = NULL;
+ return err;
+}
+
+void free_user_ns(struct user_namespace *ns)
+{
+ struct user_namespace *parent;
+
+ do {
+ parent = ns->parent;
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ key_put(ns->persistent_keyring_register);
#endif
- /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
+ proc_free_inum(ns->proc_inum);
+ kmem_cache_free(user_ns_cachep, ns);
+ ns = parent;
+ } while (atomic_dec_and_test(&parent->count));
+}
+EXPORT_SYMBOL(free_user_ns);
- /* root_user holds a reference to ns, our reference can be dropped */
- put_user_ns(ns);
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+ unsigned idx, extents;
+ u32 first, last, id2;
- return 0;
+ id2 = id + count - 1;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_rmb();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last &&
+ (id2 >= first && id2 <= last))
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].lower_first;
+ else
+ id = (u32) -1;
+
+ return id;
}
-/*
- * Deferred destructor for a user namespace. This is required because
- * free_user_ns() may be called with uidhash_lock held, but we need to call
- * back to free_uid() which will want to take the lock again.
+static u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+ unsigned idx, extents;
+ u32 first, last;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_rmb();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last)
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].lower_first;
+ else
+ id = (u32) -1;
+
+ return id;
+}
+
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ unsigned idx, extents;
+ u32 first, last;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_rmb();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].lower_first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last)
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].first;
+ else
+ id = (u32) -1;
+
+ return id;
+}
+
+/**
+ * make_kuid - Map a user-namespace uid pair into a kuid.
+ * @ns: User namespace that the uid is in
+ * @uid: User identifier
+ *
+ * Maps a user-namespace uid pair into a kernel internal kuid,
+ * and returns that kuid.
+ *
+ * When there is no mapping defined for the user-namespace uid
+ * pair INVALID_UID is returned. Callers are expected to test
+ * for and handle INVALID_UID being returned. INVALID_UID
+ * may be tested for using uid_valid().
+ */
+kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
+{
+ /* Map the uid to a global kernel uid */
+ return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
+}
+EXPORT_SYMBOL(make_kuid);
+
+/**
+ * from_kuid - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kuid has no mapping in @targ (uid_t)-1 is returned.
*/
-static void free_user_ns_work(struct work_struct *work)
+uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
- struct user_namespace *ns =
- container_of(work, struct user_namespace, destroyer);
- free_uid(ns->creator);
- kmem_cache_free(user_ns_cachep, ns);
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->uid_map, __kuid_val(kuid));
}
+EXPORT_SYMBOL(from_kuid);
-void free_user_ns(struct kref *kref)
+/**
+ * from_kuid_munged - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kuid from_kuid_munged never fails and always
+ * returns a valid uid. This makes from_kuid_munged appropriate
+ * for use in syscalls like stat and getuid where failing the
+ * system call and failing to provide a valid uid are not an
+ * options.
+ *
+ * If @kuid has no mapping in @targ overflowuid is returned.
+ */
+uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
- struct user_namespace *ns =
- container_of(kref, struct user_namespace, kref);
+ uid_t uid;
+ uid = from_kuid(targ, kuid);
- INIT_WORK(&ns->destroyer, free_user_ns_work);
- schedule_work(&ns->destroyer);
+ if (uid == (uid_t) -1)
+ uid = overflowuid;
+ return uid;
}
-EXPORT_SYMBOL(free_user_ns);
+EXPORT_SYMBOL(from_kuid_munged);
-uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+/**
+ * make_kgid - Map a user-namespace gid pair into a kgid.
+ * @ns: User namespace that the gid is in
+ * @gid: group identifier
+ *
+ * Maps a user-namespace gid pair into a kernel internal kgid,
+ * and returns that kgid.
+ *
+ * When there is no mapping defined for the user-namespace gid
+ * pair INVALID_GID is returned. Callers are expected to test
+ * for and handle INVALID_GID being returned. INVALID_GID may be
+ * tested for using gid_valid().
+ */
+kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
- struct user_namespace *tmp;
+ /* Map the gid to a global kernel gid */
+ return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
+}
+EXPORT_SYMBOL(make_kgid);
+
+/**
+ * from_kgid - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kgid has no mapping in @targ (gid_t)-1 is returned.
+ */
+gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
+{
+ /* Map the gid from a global kernel gid */
+ return map_id_up(&targ->gid_map, __kgid_val(kgid));
+}
+EXPORT_SYMBOL(from_kgid);
+
+/**
+ * from_kgid_munged - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kgid from_kgid_munged never fails and always
+ * returns a valid gid. This makes from_kgid_munged appropriate
+ * for use in syscalls like stat and getgid where failing the
+ * system call and failing to provide a valid gid are not options.
+ *
+ * If @kgid has no mapping in @targ overflowgid is returned.
+ */
+gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
+{
+ gid_t gid;
+ gid = from_kgid(targ, kgid);
+
+ if (gid == (gid_t) -1)
+ gid = overflowgid;
+ return gid;
+}
+EXPORT_SYMBOL(from_kgid_munged);
+
+/**
+ * make_kprojid - Map a user-namespace projid pair into a kprojid.
+ * @ns: User namespace that the projid is in
+ * @projid: Project identifier
+ *
+ * Maps a user-namespace uid pair into a kernel internal kuid,
+ * and returns that kuid.
+ *
+ * When there is no mapping defined for the user-namespace projid
+ * pair INVALID_PROJID is returned. Callers are expected to test
+ * for and handle handle INVALID_PROJID being returned. INVALID_PROJID
+ * may be tested for using projid_valid().
+ */
+kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
+{
+ /* Map the uid to a global kernel uid */
+ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
+}
+EXPORT_SYMBOL(make_kprojid);
+
+/**
+ * from_kprojid - Create a projid from a kprojid user-namespace pair.
+ * @targ: The user namespace we want a projid in.
+ * @kprojid: The kernel internal project identifier to start with.
+ *
+ * Map @kprojid into the user-namespace specified by @targ and
+ * return the resulting projid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
+ */
+projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
+{
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
+}
+EXPORT_SYMBOL(from_kprojid);
+
+/**
+ * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair.
+ * @targ: The user namespace we want a projid in.
+ * @kprojid: The kernel internal projid to start with.
+ *
+ * Map @kprojid into the user-namespace specified by @targ and
+ * return the resulting projid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kprojid from_kprojid_munged never fails and always
+ * returns a valid projid. This makes from_kprojid_munged
+ * appropriate for use in syscalls like stat and where
+ * failing the system call and failing to provide a valid projid are
+ * not an options.
+ *
+ * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
+ */
+projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
+{
+ projid_t projid;
+ projid = from_kprojid(targ, kprojid);
+
+ if (projid == (projid_t) -1)
+ projid = OVERFLOW_PROJID;
+ return projid;
+}
+EXPORT_SYMBOL(from_kprojid_munged);
+
+
+static int uid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ uid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static int gid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ gid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
- if (likely(to == cred->user->user_ns))
- return uid;
+ lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
- /* Is cred->user the creator of the target user_ns
- * or the creator of one of it's parents?
+ return 0;
+}
+
+static int projid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ projid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static void *m_start(struct seq_file *seq, loff_t *ppos,
+ struct uid_gid_map *map)
+{
+ struct uid_gid_extent *extent = NULL;
+ loff_t pos = *ppos;
+
+ if (pos < map->nr_extents)
+ extent = &map->extent[pos];
+
+ return extent;
+}
+
+static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->uid_map);
+}
+
+static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->gid_map);
+}
+
+static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->projid_map);
+}
+
+static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return seq->op->start(seq, pos);
+}
+
+static void m_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+struct seq_operations proc_uid_seq_operations = {
+ .start = uid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = uid_m_show,
+};
+
+struct seq_operations proc_gid_seq_operations = {
+ .start = gid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = gid_m_show,
+};
+
+struct seq_operations proc_projid_seq_operations = {
+ .start = projid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = projid_m_show,
+};
+
+static bool mappings_overlap(struct uid_gid_map *new_map,
+ struct uid_gid_extent *extent)
+{
+ u32 upper_first, lower_first, upper_last, lower_last;
+ unsigned idx;
+
+ upper_first = extent->first;
+ lower_first = extent->lower_first;
+ upper_last = upper_first + extent->count - 1;
+ lower_last = lower_first + extent->count - 1;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ u32 prev_upper_first, prev_lower_first;
+ u32 prev_upper_last, prev_lower_last;
+ struct uid_gid_extent *prev;
+
+ prev = &new_map->extent[idx];
+
+ prev_upper_first = prev->first;
+ prev_lower_first = prev->lower_first;
+ prev_upper_last = prev_upper_first + prev->count - 1;
+ prev_lower_last = prev_lower_first + prev->count - 1;
+
+ /* Does the upper range intersect a previous extent? */
+ if ((prev_upper_first <= upper_last) &&
+ (prev_upper_last >= upper_first))
+ return true;
+
+ /* Does the lower range intersect a previous extent? */
+ if ((prev_lower_first <= lower_last) &&
+ (prev_lower_last >= lower_first))
+ return true;
+ }
+ return false;
+}
+
+
+static DEFINE_MUTEX(id_map_mutex);
+
+static ssize_t map_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos,
+ int cap_setid,
+ struct uid_gid_map *map,
+ struct uid_gid_map *parent_map)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_map new_map;
+ unsigned idx;
+ struct uid_gid_extent *extent = NULL;
+ unsigned long page = 0;
+ char *kbuf, *pos, *next_line;
+ ssize_t ret = -EINVAL;
+
+ /*
+ * The id_map_mutex serializes all writes to any given map.
+ *
+ * Any map is only ever written once.
+ *
+ * An id map fits within 1 cache line on most architectures.
+ *
+ * On read nothing needs to be done unless you are on an
+ * architecture with a crazy cache coherency model like alpha.
+ *
+ * There is a one time data dependency between reading the
+ * count of the extents and the values of the extents. The
+ * desired behavior is to see the values of the extents that
+ * were written before the count of the extents.
+ *
+ * To achieve this smp_wmb() is used on guarantee the write
+ * order and smp_rmb() is guaranteed that we don't have crazy
+ * architectures returning stale data.
+ */
+ mutex_lock(&id_map_mutex);
+
+ ret = -EPERM;
+ /* Only allow one successful write to the map */
+ if (map->nr_extents != 0)
+ goto out;
+
+ /*
+ * Adjusting namespace settings requires capabilities on the target.
*/
- for ( tmp = to; tmp != &init_user_ns;
- tmp = tmp->creator->user_ns ) {
- if (cred->user == tmp->creator) {
- return (uid_t)0;
+ if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+ goto out;
+
+ /* Get a buffer */
+ ret = -ENOMEM;
+ page = __get_free_page(GFP_TEMPORARY);
+ kbuf = (char *) page;
+ if (!page)
+ goto out;
+
+ /* Only allow <= page size writes at the beginning of the file */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ goto out;
+
+ /* Slurp in the user data */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ pos = kbuf;
+ new_map.nr_extents = 0;
+ for (; pos; pos = next_line) {
+ extent = &new_map.extent[new_map.nr_extents];
+
+ /* Find the end of line and ensure I don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
}
+
+ pos = skip_spaces(pos);
+ extent->first = simple_strtoul(pos, &pos, 10);
+ if (!isspace(*pos))
+ goto out;
+
+ pos = skip_spaces(pos);
+ extent->lower_first = simple_strtoul(pos, &pos, 10);
+ if (!isspace(*pos))
+ goto out;
+
+ pos = skip_spaces(pos);
+ extent->count = simple_strtoul(pos, &pos, 10);
+ if (*pos && !isspace(*pos))
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ /* Verify we have been given valid starting values */
+ if ((extent->first == (u32) -1) ||
+ (extent->lower_first == (u32) -1))
+ goto out;
+
+ /* Verify count is not zero and does not cause the
+ * extent to wrap
+ */
+ if ((extent->first + extent->count) <= extent->first)
+ goto out;
+ if ((extent->lower_first + extent->count) <=
+ extent->lower_first)
+ goto out;
+
+ /* Do the ranges in extent overlap any previous extents? */
+ if (mappings_overlap(&new_map, extent))
+ goto out;
+
+ new_map.nr_extents++;
+
+ /* Fail if the file contains too many extents */
+ if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+ (next_line != NULL))
+ goto out;
}
+ /* Be very certaint the new map actually exists */
+ if (new_map.nr_extents == 0)
+ goto out;
+
+ ret = -EPERM;
+ /* Validate the user is allowed to use user id's mapped to. */
+ if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+ goto out;
+
+ /* Map the lower ids from the parent user namespace to the
+ * kernel global id space.
+ */
+ for (idx = 0; idx < new_map.nr_extents; idx++) {
+ u32 lower_first;
+ extent = &new_map.extent[idx];
+
+ lower_first = map_id_range_down(parent_map,
+ extent->lower_first,
+ extent->count);
+
+ /* Fail if we can not map the specified extent to
+ * the kernel global id space.
+ */
+ if (lower_first == (u32) -1)
+ goto out;
+
+ extent->lower_first = lower_first;
+ }
+
+ /* Install the map */
+ memcpy(map->extent, new_map.extent,
+ new_map.nr_extents*sizeof(new_map.extent[0]));
+ smp_wmb();
+ map->nr_extents = new_map.nr_extents;
- /* No useful relationship so no mapping */
- return overflowuid;
+ *ppos = count;
+ ret = count;
+out:
+ mutex_unlock(&id_map_mutex);
+ if (page)
+ free_page(page);
+ return ret;
}
-gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
{
- struct user_namespace *tmp;
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
- if (likely(to == cred->user->user_ns))
- return gid;
+ if (!ns->parent)
+ return -EPERM;
- /* Is cred->user the creator of the target user_ns
- * or the creator of one of it's parents?
- */
- for ( tmp = to; tmp != &init_user_ns;
- tmp = tmp->creator->user_ns ) {
- if (cred->user == tmp->creator) {
- return (gid_t)0;
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ return map_write(file, buf, size, ppos, CAP_SETUID,
+ &ns->uid_map, &ns->parent->uid_map);
+}
+
+ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
+
+ if (!ns->parent)
+ return -EPERM;
+
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ return map_write(file, buf, size, ppos, CAP_SETGID,
+ &ns->gid_map, &ns->parent->gid_map);
+}
+
+ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
+
+ if (!ns->parent)
+ return -EPERM;
+
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ /* Anyone can set any valid project id no capability needed */
+ return map_write(file, buf, size, ppos, -1,
+ &ns->projid_map, &ns->parent->projid_map);
+}
+
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
+ struct uid_gid_map *new_map)
+{
+ /* Allow mapping to your own filesystem ids */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ u32 id = new_map->extent[0].lower_first;
+ if (cap_setid == CAP_SETUID) {
+ kuid_t uid = make_kuid(ns->parent, id);
+ if (uid_eq(uid, file->f_cred->fsuid))
+ return true;
+ } else if (cap_setid == CAP_SETGID) {
+ kgid_t gid = make_kgid(ns->parent, id);
+ if (gid_eq(gid, file->f_cred->fsgid))
+ return true;
}
}
- /* No useful relationship so no mapping */
- return overflowgid;
+ /* Allow anyone to set a mapping that doesn't require privilege */
+ if (!cap_valid(cap_setid))
+ return true;
+
+ /* Allow the specified ids if we have the appropriate capability
+ * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+ * And the opener of the id file also had the approprpiate capability.
+ */
+ if (ns_capable(ns->parent, cap_setid) &&
+ file_ns_capable(file, ns->parent, cap_setid))
+ return true;
+
+ return false;
+}
+
+static void *userns_get(struct task_struct *task)
+{
+ struct user_namespace *user_ns;
+
+ rcu_read_lock();
+ user_ns = get_user_ns(__task_cred(task)->user_ns);
+ rcu_read_unlock();
+
+ return user_ns;
+}
+
+static void userns_put(void *ns)
+{
+ put_user_ns(ns);
+}
+
+static int userns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct user_namespace *user_ns = ns;
+ struct cred *cred;
+
+ /* Don't allow gaining capabilities by reentering
+ * the same user namespace.
+ */
+ if (user_ns == current_user_ns())
+ return -EINVAL;
+
+ /* Threaded processes may not enter a different user namespace */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+
+ if (current->fs->users != 1)
+ return -EINVAL;
+
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ put_user_ns(cred->user_ns);
+ set_cred_user_ns(cred, get_user_ns(user_ns));
+
+ return commit_creds(cred);
}
+static unsigned int userns_inum(void *ns)
+{
+ struct user_namespace *user_ns = ns;
+ return user_ns->proc_inum;
+}
+
+const struct proc_ns_operations userns_operations = {
+ .name = "user",
+ .type = CLONE_NEWUSER,
+ .get = userns_get,
+ .put = userns_put,
+ .install = userns_install,
+ .inum = userns_inum,
+};
+
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
return 0;
}
-module_init(user_namespaces_init);
+subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 405caf91aad..fd393124e50 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,7 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -30,20 +30,27 @@ static struct uts_namespace *create_uts_ns(void)
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
- * Return NULL on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
*/
-static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
+ int err;
ns = create_uts_ns();
if (!ns)
return ERR_PTR(-ENOMEM);
+ err = proc_alloc_inum(&ns->proc_inum);
+ if (err) {
+ kfree(ns);
+ return ERR_PTR(err);
+ }
+
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
+ ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
return ns;
}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
* versa.
*/
struct uts_namespace *copy_utsname(unsigned long flags,
- struct task_struct *tsk)
+ struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
if (!(flags & CLONE_NEWUTS))
return old_ns;
- new_ns = clone_uts_ns(tsk, old_ns);
+ new_ns = clone_uts_ns(user_ns, old_ns);
put_uts_ns(old_ns);
return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
ns = container_of(kref, struct uts_namespace, kref);
put_user_ns(ns->user_ns);
+ proc_free_inum(ns->proc_inum);
kfree(ns);
}
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
put_uts_ns(ns);
}
-static int utsns_install(struct nsproxy *nsproxy, void *ns)
+static int utsns_install(struct nsproxy *nsproxy, void *new)
{
+ struct uts_namespace *ns = new;
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
get_uts_ns(ns);
put_uts_ns(nsproxy->uts_ns);
nsproxy->uts_ns = ns;
return 0;
}
+static unsigned int utsns_inum(void *vp)
+{
+ struct uts_namespace *ns = vp;
+
+ return ns->proc_inum;
+}
+
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
+ .inum = utsns_inum,
};
-
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d82..c8eac43267e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,7 +15,9 @@
#include <linux/sysctl.h>
#include <linux/wait.h>
-static void *get_uts(ctl_table *table, int write)
+#ifdef CONFIG_PROC_SYSCTL
+
+static void *get_uts(struct ctl_table *table, int write)
{
char *which = table->data;
struct uts_namespace *uts_ns;
@@ -30,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)
return which;
}
-static void put_uts(ctl_table *table, int write, void *which)
+static void put_uts(struct ctl_table *table, int write, void *which)
{
if (!write)
up_read(&uts_sem);
@@ -38,19 +40,18 @@ static void put_uts(ctl_table *table, int write, void *which)
up_write(&uts_sem);
}
-#ifdef CONFIG_PROC_SYSCTL
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
-static int proc_do_uts_string(ctl_table *table, int write,
+static int proc_do_uts_string(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
memcpy(&uts_table, table, sizeof(uts_table));
uts_table.data = get_uts(table, write);
- r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
+ r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
put_uts(table, write, uts_table.data);
if (write)
@@ -134,4 +135,4 @@ static int __init utsname_sysctl_init(void)
return 0;
}
-__initcall(utsname_sysctl_init);
+device_initcall(utsname_sysctl_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d117262deba..c3319bd1b04 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,15 +3,14 @@
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
- * this code detects hard lockups: incidents in where on a CPU
- * the kernel does not respond to anything except NMI.
- *
- * Note: Most of this code is borrowed heavily from softlockup.c,
- * so thanks to Ingo for the initial implementation.
- * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
* to those contributors as well.
*/
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
@@ -23,25 +22,38 @@
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/smpboot.h>
+#include <linux/sched/rt.h>
#include <asm/irq_regs.h>
+#include <linux/kvm_para.h>
#include <linux/perf_event.h>
-int watchdog_enabled = 1;
+int watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
+static int __read_mostly watchdog_running;
+static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
+static unsigned long soft_lockup_nmi_warn;
/* boot commands */
/*
@@ -58,7 +70,7 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_enabled = 0;
+ watchdog_user_enabled = 0;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +89,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_enabled = 0;
+ watchdog_user_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
@@ -85,11 +97,20 @@ __setup("nowatchdog", nowatchdog_setup);
/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
- watchdog_enabled = 0;
+ watchdog_user_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
/* */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_softlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -108,33 +129,36 @@ static int get_softlockup_thresh(void)
* resolution, and we don't need to waste time with a big divide when
* 2^30ns == 1.074s.
*/
-static unsigned long get_timestamp(int this_cpu)
+static unsigned long get_timestamp(void)
{
- return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+ return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
}
-static unsigned long get_sample_period(void)
+static void set_sample_period(void)
{
/*
* convert watchdog_thresh from seconds to ns
- * the divide by 5 is to give hrtimer 5 chances to
- * increment before the hardlockup detector generates
- * a warning
+ * the divide by 5 is to give hrtimer several chances (two
+ * or three with the current relation between the soft
+ * and hard thresholds) to increment before the
+ * hardlockup detector generates a warning
*/
- return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
}
/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
- int this_cpu = smp_processor_id();
-
- __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
+ __this_cpu_write(watchdog_touch_ts, get_timestamp());
}
void touch_softlockup_watchdog(void)
{
- __this_cpu_write(watchdog_touch_ts, 0);
+ /*
+ * Preemption can be enabled. It doesn't matter which CPU's timestamp
+ * gets zeroed here, so use the raw_ operation.
+ */
+ raw_cpu_write(watchdog_touch_ts, 0);
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -154,14 +178,14 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- if (watchdog_enabled) {
- unsigned cpu;
-
- for_each_present_cpu(cpu) {
- if (per_cpu(watchdog_nmi_touch, cpu) != true)
- per_cpu(watchdog_nmi_touch, cpu) = true;
- }
- }
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ __raw_get_cpu_var(watchdog_nmi_touch) = true;
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -190,7 +214,7 @@ static int is_hardlockup(void)
static int is_softlockup(unsigned long touch_ts)
{
- unsigned long now = get_timestamp(smp_processor_id());
+ unsigned long now = get_timestamp();
/* Warn about unreasonable delays: */
if (time_after(now, touch_ts + get_softlockup_thresh()))
@@ -247,13 +271,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-#else
-static inline void watchdog_interrupt_count(void) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -261,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
+ int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -269,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
wake_up_process(__this_cpu_read(softlockup_watchdog));
/* .. and repeat */
- hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
if (touch_ts == 0) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -280,6 +307,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
__this_cpu_write(softlockup_touch_sync, false);
sched_clock_tick();
}
+
+ /* Clear the guest paused flag on watchdog reset */
+ kvm_check_and_clear_guest_paused();
__touch_watchdog();
return HRTIMER_RESTART;
}
@@ -292,10 +322,29 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
duration = is_softlockup(touch_ts);
if (unlikely(duration)) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like a soft lockup, check to see if the host
+ * stopped the vm before we issue the warning
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return HRTIMER_RESTART;
+
/* only warn once */
if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
+ if (softlockup_all_cpu_backtrace) {
+ /* Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping cpu back traces
+ */
+ if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+ /* Someone else will report us. Let's give up */
+ __this_cpu_write(soft_watchdog_warn, true);
+ return HRTIMER_RESTART;
+ }
+ }
+
printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -306,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
+ if (softlockup_all_cpu_backtrace) {
+ /* Avoid generating two back traces for current
+ * given that one is already made above
+ */
+ trigger_allbutself_cpu_backtrace();
+
+ clear_bit(0, &soft_lockup_nmi_warn);
+ /* Barrier to sync with other cpus */
+ smp_mb__after_atomic();
+ }
+
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -315,49 +375,78 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+ struct sched_param param = { .sched_priority = prio };
-/*
- * The watchdog thread - touches the timestamp.
- */
-static int watchdog(void *unused)
+ sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- sched_setscheduler(current, SCHED_FIFO, &param);
+ /* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
- /* initialize timestamp */
- __touch_watchdog();
+ /* Enable the perf event */
+ watchdog_nmi_enable(cpu);
- /* kick off the timer for the hardlockup detector */
/* done here because hrtimer_start can only pin to smp_processor_id() */
- hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 60 seconds then the
- * debug-printout triggers in watchdog_timer_fn().
- */
- while (!kthread_should_stop()) {
- __touch_watchdog();
- schedule();
+ /* initialize timestamp */
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ __touch_watchdog();
+}
- if (kthread_should_stop())
- break;
+static void watchdog_disable(unsigned int cpu)
+{
+ struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
- param.sched_priority = 0;
- sched_setscheduler(current, SCHED_NORMAL, &param);
- return 0;
+ watchdog_set_prio(SCHED_NORMAL, 0);
+ hrtimer_cancel(hrtimer);
+ /* disable the perf event */
+ watchdog_nmi_disable(cpu);
+}
+
+static void watchdog_cleanup(unsigned int cpu, bool online)
+{
+ watchdog_disable(cpu);
}
+static int watchdog_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(hrtimer_interrupts) !=
+ __this_cpu_read(soft_lockup_hrtimer_cnt);
+}
+
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every sample_period seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static void watchdog(unsigned int cpu)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int watchdog_nmi_enable(int cpu)
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+static int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -375,19 +464,31 @@ static int watchdog_nmi_enable(int cpu)
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+ /* save cpu0 error for future comparision */
+ if (cpu == 0 && IS_ERR(event))
+ cpu0_err = PTR_ERR(event);
+
if (!IS_ERR(event)) {
- printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+ /* only print for cpu0 or different than cpu0 */
+ if (cpu == 0 || cpu0_err)
+ pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
goto out_save;
}
+ /* skip displaying the same error again */
+ if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ return PTR_ERR(event);
/* vary the KERN level based on the returned errno */
if (PTR_ERR(event) == -EOPNOTSUPP)
- printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
- printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+ pr_warning("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
else
- printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
return PTR_ERR(event);
/* success path */
@@ -399,7 +500,7 @@ out:
return 0;
}
-static void watchdog_nmi_disable(int cpu)
+static void watchdog_nmi_disable(unsigned int cpu)
{
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -413,104 +514,90 @@ static void watchdog_nmi_disable(int cpu)
return;
}
#else
-static int watchdog_nmi_enable(int cpu) { return 0; }
-static void watchdog_nmi_disable(int cpu) { return; }
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-/* prepare/enable/disable routines */
-static void watchdog_prepare_cpu(int cpu)
-{
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- WARN_ON(per_cpu(softlockup_watchdog, cpu));
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
-}
+static struct smp_hotplug_thread watchdog_threads = {
+ .store = &softlockup_watchdog,
+ .thread_should_run = watchdog_should_run,
+ .thread_fn = watchdog,
+ .thread_comm = "watchdog/%u",
+ .setup = watchdog_enable,
+ .cleanup = watchdog_cleanup,
+ .park = watchdog_disable,
+ .unpark = watchdog_enable,
+};
-static int watchdog_enable(int cpu)
+static void restart_watchdog_hrtimer(void *info)
{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err = 0;
-
- /* enable the perf event */
- err = watchdog_nmi_enable(cpu);
-
- /* Regardless of err above, fall through and start softlockup */
-
- /* create the watchdog thread */
- if (!p) {
- p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
- if (IS_ERR(p)) {
- printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
- if (!err) {
- /* if hardlockup hasn't already set this */
- err = PTR_ERR(p);
- /* and disable the perf event */
- watchdog_nmi_disable(cpu);
- }
- goto out;
- }
- kthread_bind(p, cpu);
- per_cpu(watchdog_touch_ts, cpu) = 0;
- per_cpu(softlockup_watchdog, cpu) = p;
- wake_up_process(p);
- }
+ struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ int ret;
-out:
- return err;
+ /*
+ * No need to cancel and restart hrtimer if it is currently executing
+ * because it will reprogram itself with the new period now.
+ * We should never see it unqueued here because we are running per-cpu
+ * with interrupts disabled.
+ */
+ ret = hrtimer_try_to_cancel(hrtimer);
+ if (ret == 1)
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);
}
-static void watchdog_disable(int cpu)
+static void update_timers(int cpu)
{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
/*
- * cancel the timer first to stop incrementing the stats
- * and waking up the kthread
+ * Make sure that perf event counter will adopt to a new
+ * sampling period. Updating the sampling period directly would
+ * be much nicer but we do not have an API for that now so
+ * let's use a big hammer.
+ * Hrtimer will adopt the new period on the next tick but this
+ * might be late already so we have to restart the timer as well.
*/
- hrtimer_cancel(hrtimer);
-
- /* disable the perf event */
watchdog_nmi_disable(cpu);
-
- /* stop the watchdog thread */
- if (p) {
- per_cpu(softlockup_watchdog, cpu) = NULL;
- kthread_stop(p);
- }
+ smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
+ watchdog_nmi_enable(cpu);
}
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
-static void watchdog_enable_all_cpus(void)
+static void update_timers_all_cpus(void)
{
int cpu;
- watchdog_enabled = 0;
-
+ get_online_cpus();
for_each_online_cpu(cpu)
- if (!watchdog_enable(cpu))
- /* if any cpu succeeds, watchdog is considered
- enabled for the system */
- watchdog_enabled = 1;
-
- if (!watchdog_enabled)
- printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
-
+ update_timers(cpu);
+ put_online_cpus();
}
-static void watchdog_disable_all_cpus(void)
+static int watchdog_enable_all_cpus(bool sample_period_changed)
{
- int cpu;
+ int err = 0;
- for_each_online_cpu(cpu)
- watchdog_disable(cpu);
+ if (!watchdog_running) {
+ err = smpboot_register_percpu_thread(&watchdog_threads);
+ if (err)
+ pr_err("Failed to create watchdog threads, disabled\n");
+ else
+ watchdog_running = 1;
+ } else if (sample_period_changed) {
+ update_timers_all_cpus();
+ }
- /* if all watchdogs are disabled, then they are disabled for the system */
- watchdog_enabled = 0;
+ return err;
}
+/* prepare/enable/disable routines */
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
+static void watchdog_disable_all_cpus(void)
+{
+ if (watchdog_running) {
+ watchdog_running = 0;
+ smpboot_unregister_percpu_thread(&watchdog_threads);
+ }
+}
/*
* proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
@@ -519,75 +606,43 @@ static void watchdog_disable_all_cpus(void)
int proc_dowatchdog(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int ret;
+ int err, old_thresh, old_enabled;
+ static DEFINE_MUTEX(watchdog_proc_mutex);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret || !write)
+ mutex_lock(&watchdog_proc_mutex);
+ old_thresh = ACCESS_ONCE(watchdog_thresh);
+ old_enabled = ACCESS_ONCE(watchdog_user_enabled);
+
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (err || !write)
goto out;
- if (watchdog_enabled && watchdog_thresh)
- watchdog_enable_all_cpus();
+ set_sample_period();
+ /*
+ * Watchdog threads shouldn't be enabled if they are
+ * disabled. The 'watchdog_running' variable check in
+ * watchdog_*_all_cpus() function takes care of this.
+ */
+ if (watchdog_user_enabled && watchdog_thresh)
+ err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
else
watchdog_disable_all_cpus();
+ /* Restore old values on failure */
+ if (err) {
+ watchdog_thresh = old_thresh;
+ watchdog_user_enabled = old_enabled;
+ }
out:
- return ret;
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
}
#endif /* CONFIG_SYSCTL */
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int __cpuinit
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- watchdog_prepare_cpu(hotcpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (watchdog_enabled)
- watchdog_enable(hotcpu);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- watchdog_disable(hotcpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- watchdog_disable(hotcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
-
- /*
- * hardlockup and softlockup are not important enough
- * to block cpu bring up. Just always succeed and
- * rely on printk output to flag problems.
- */
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
void __init lockup_detector_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- WARN_ON(notifier_to_errno(err));
+ set_sample_period();
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
- register_cpu_notifier(&cpu_nfb);
-
- return;
+ if (watchdog_user_enabled)
+ watchdog_enable_all_cpus(false);
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bec7b5b53e0..35974ac6960 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
*
* This is the generic async execution mechanism. Work items as are
* executed in process context. The worker pool is shared and
- * automatically managed. There is one worker pool for each CPU and
- * one extra for works which are better served by workers which are
- * not bound to any specific CPU.
+ * automatically managed. There are two worker pools for each CPU (one for
+ * normal work items and the other for high priority ones) and some extra
+ * pools for workqueues which are not bound to any specific CPU - the
+ * number of these backing pools is dynamic.
*
* Please read Documentation/workqueue.txt for details.
*/
@@ -41,40 +42,49 @@
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
+#include <linux/jhash.h>
+#include <linux/hashtable.h>
+#include <linux/rculist.h>
+#include <linux/nodemask.h>
+#include <linux/moduleparam.h>
+#include <linux/uaccess.h>
-#include "workqueue_sched.h"
+#include "workqueue_internal.h"
enum {
- /* global_cwq flags */
- GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
- GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
- GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
- GCWQ_FREEZING = 1 << 3, /* freeze in progress */
- GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
+ /*
+ * worker_pool flags
+ *
+ * A bound pool is either associated or disassociated with its CPU.
+ * While associated (!DISASSOCIATED), all workers are bound to the
+ * CPU and none has %WORKER_UNBOUND set and concurrency management
+ * is in effect.
+ *
+ * While DISASSOCIATED, the cpu may be offline and all workers have
+ * %WORKER_UNBOUND set and concurrency management disabled, and may
+ * be executing on any CPU. The pool behaves as an unbound one.
+ *
+ * Note that DISASSOCIATED should be flipped only while holding
+ * attach_mutex to avoid changing binding state while
+ * worker_attach_to_pool() is in progress.
+ */
+ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
/* worker flags */
- WORKER_STARTED = 1 << 0, /* started */
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
WORKER_PREP = 1 << 3, /* preparing to run works */
- WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
- WORKER_REBIND = 1 << 5, /* mom is home, come back */
WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
WORKER_UNBOUND = 1 << 7, /* worker is unbound */
+ WORKER_REBOUND = 1 << 8, /* worker was rebound */
- WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
- WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
+ WORKER_UNBOUND | WORKER_REBOUND,
- /* gcwq->trustee_state */
- TRUSTEE_START = 0, /* start */
- TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
- TRUSTEE_BUTCHER = 2, /* butcher workers */
- TRUSTEE_RELEASE = 3, /* release workers */
- TRUSTEE_DONE = 4, /* trustee is done */
+ NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
+ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
- BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
- BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -84,13 +94,15 @@ enum {
(min two ticks) */
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
- TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
/*
* Rescue workers are used only on emergencies and shared by
- * all cpus. Give -20.
+ * all cpus. Give MIN_NICE.
*/
- RESCUER_NICE_LEVEL = -20,
+ RESCUER_NICE_LEVEL = MIN_NICE,
+ HIGHPRI_NICE_LEVEL = MIN_NICE,
+
+ WQ_NAME_LEN = 24,
};
/*
@@ -102,218 +114,284 @@ enum {
* P: Preemption protected. Disabling preemption is enough and should
* only be modified and accessed from the local cpu.
*
- * L: gcwq->lock protected. Access with gcwq->lock held.
+ * L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires gcwq->lock and
- * should be done only from local cpu. Either disabling preemption
- * on local cpu or grabbing gcwq->lock is enough for read access.
- * If GCWQ_DISASSOCIATED is set, it's identical to L.
+ * X: During normal operation, modification requires pool->lock and should
+ * be done only from local cpu. Either disabling preemption on local
+ * cpu or grabbing pool->lock is enough for read access. If
+ * POOL_DISASSOCIATED is set, it's identical to L.
*
- * F: wq->flush_mutex protected.
+ * A: pool->attach_mutex protected.
*
- * W: workqueue_lock protected.
+ * PL: wq_pool_mutex protected.
+ *
+ * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
+ *
+ * WQ: wq->mutex protected.
+ *
+ * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
+ *
+ * MD: wq_mayday_lock protected.
*/
-struct global_cwq;
-
-/*
- * The poor guys doing the actual heavy lifting. All on-duty workers
- * are either serving the manager role, on idle list or on busy hash.
- */
-struct worker {
- /* on idle list while idle, on busy hash table while busy */
- union {
- struct list_head entry; /* L: while idle */
- struct hlist_node hentry; /* L: while busy */
- };
+/* struct worker is defined in workqueue_internal.h */
- struct work_struct *current_work; /* L: work being processed */
- struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
- struct list_head scheduled; /* L: scheduled works */
- struct task_struct *task; /* I: worker task */
- struct global_cwq *gcwq; /* I: the associated gcwq */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
- unsigned long last_active; /* L: last active timestamp */
+struct worker_pool {
+ spinlock_t lock; /* the pool lock */
+ int cpu; /* I: the associated cpu */
+ int node; /* I: the associated node ID */
+ int id; /* I: pool ID */
unsigned int flags; /* X: flags */
- int id; /* I: worker id */
- struct work_struct rebind_work; /* L: rebind worker to cpu */
-};
-/*
- * Global per-cpu workqueue. There's one and only one for each cpu
- * and all works are queued and processed here regardless of their
- * target workqueues.
- */
-struct global_cwq {
- spinlock_t lock; /* the gcwq lock */
struct list_head worklist; /* L: list of pending works */
- unsigned int cpu; /* I: the associated cpu */
- unsigned int flags; /* L: GCWQ_* flags */
-
int nr_workers; /* L: total number of workers */
+
+ /* nr_idle includes the ones off idle_list for rebinding */
int nr_idle; /* L: currently idle ones */
- /* workers are chained either in the idle_list or busy_hash */
struct list_head idle_list; /* X: list of idle workers */
- struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
+ struct timer_list idle_timer; /* L: worker idle timeout */
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
+
+ /* a workers is either on busy_hash or idle_list, or the manager */
+ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
- struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for dworkers */
+ /* see manage_workers() for details on the two manager mutexes */
+ struct mutex manager_arb; /* manager arbitration */
+ struct mutex attach_mutex; /* attach/detach exclusion */
+ struct list_head workers; /* A: attached workers */
+ struct completion *detach_completion; /* all workers detached */
+
+ struct ida worker_ida; /* worker IDs for task name */
- struct ida worker_ida; /* L: for worker IDs */
+ struct workqueue_attrs *attrs; /* I: worker attributes */
+ struct hlist_node hash_node; /* PL: unbound_pool_hash node */
+ int refcnt; /* PL: refcnt for unbound pools */
- struct task_struct *trustee; /* L: for gcwq shutdown */
- unsigned int trustee_state; /* L: trustee state */
- wait_queue_head_t trustee_wait; /* trustee wait */
- struct worker *first_idle; /* L: first idle worker */
+ /*
+ * The current concurrency level. As it's likely to be accessed
+ * from other CPUs during try_to_wake_up(), put it in a separate
+ * cacheline.
+ */
+ atomic_t nr_running ____cacheline_aligned_in_smp;
+
+ /*
+ * Destruction of pool is sched-RCU protected to allow dereferences
+ * from get_work_pool().
+ */
+ struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
/*
- * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
- * work_struct->data are used for flags and thus cwqs need to be
- * aligned at two's power of the number of flag bits.
+ * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
+ * of work_struct->data are used for flags and the remaining high bits
+ * point to the pwq; thus, pwqs need to be aligned at two's power of the
+ * number of flag bits.
*/
-struct cpu_workqueue_struct {
- struct global_cwq *gcwq; /* I: the associated gcwq */
+struct pool_workqueue {
+ struct worker_pool *pool; /* I: the associated pool */
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
int flush_color; /* L: flushing color */
+ int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
int nr_active; /* L: nr of active works */
int max_active; /* L: max active works */
struct list_head delayed_works; /* L: delayed works */
-};
+ struct list_head pwqs_node; /* WR: node on wq->pwqs */
+ struct list_head mayday_node; /* MD: node on wq->maydays */
+
+ /*
+ * Release of unbound pwq is punted to system_wq. See put_pwq()
+ * and pwq_unbound_release_workfn() for details. pool_workqueue
+ * itself is also sched-RCU protected so that the first pwq can be
+ * determined without grabbing wq->mutex.
+ */
+ struct work_struct unbound_release_work;
+ struct rcu_head rcu;
+} __aligned(1 << WORK_STRUCT_FLAG_BITS);
/*
* Structure used to wait for workqueue flush.
*/
struct wq_flusher {
- struct list_head list; /* F: list of flushers */
- int flush_color; /* F: flush color waiting for */
+ struct list_head list; /* WQ: list of flushers */
+ int flush_color; /* WQ: flush color waiting for */
struct completion done; /* flush completion */
};
-/*
- * All cpumasks are assumed to be always set on UP and thus can't be
- * used to determine whether there's something to be done.
- */
-#ifdef CONFIG_SMP
-typedef cpumask_var_t mayday_mask_t;
-#define mayday_test_and_set_cpu(cpu, mask) \
- cpumask_test_and_set_cpu((cpu), (mask))
-#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
-#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
-#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
-#define free_mayday_mask(mask) free_cpumask_var((mask))
-#else
-typedef unsigned long mayday_mask_t;
-#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
-#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
-#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
-#define alloc_mayday_mask(maskp, gfp) true
-#define free_mayday_mask(mask) do { } while (0)
-#endif
+struct wq_device;
/*
- * The externally visible workqueue abstraction is an array of
- * per-CPU workqueues:
+ * The externally visible workqueue. It relays the issued work items to
+ * the appropriate worker_pool through its pool_workqueues.
*/
struct workqueue_struct {
- unsigned int flags; /* W: WQ_* flags */
- union {
- struct cpu_workqueue_struct __percpu *pcpu;
- struct cpu_workqueue_struct *single;
- unsigned long v;
- } cpu_wq; /* I: cwq's */
- struct list_head list; /* W: list of all workqueues */
-
- struct mutex flush_mutex; /* protects wq flushing */
- int work_color; /* F: current work color */
- int flush_color; /* F: current flush color */
- atomic_t nr_cwqs_to_flush; /* flush in progress */
- struct wq_flusher *first_flusher; /* F: first flusher */
- struct list_head flusher_queue; /* F: flush waiters */
- struct list_head flusher_overflow; /* F: flush overflow list */
-
- mayday_mask_t mayday_mask; /* cpus requesting rescue */
+ struct list_head pwqs; /* WR: all pwqs of this wq */
+ struct list_head list; /* PL: list of all workqueues */
+
+ struct mutex mutex; /* protects this wq */
+ int work_color; /* WQ: current work color */
+ int flush_color; /* WQ: current flush color */
+ atomic_t nr_pwqs_to_flush; /* flush in progress */
+ struct wq_flusher *first_flusher; /* WQ: first flusher */
+ struct list_head flusher_queue; /* WQ: flush waiters */
+ struct list_head flusher_overflow; /* WQ: flush overflow list */
+
+ struct list_head maydays; /* MD: pwqs requesting rescue */
struct worker *rescuer; /* I: rescue worker */
- int nr_drainers; /* W: drain in progress */
- int saved_max_active; /* W: saved cwq max_active */
+ int nr_drainers; /* WQ: drain in progress */
+ int saved_max_active; /* WQ: saved pwq max_active */
+
+ struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
+ struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
+
+#ifdef CONFIG_SYSFS
+ struct wq_device *wq_dev; /* I: for sysfs interface */
+#endif
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
- char name[]; /* I: workqueue name */
+ char name[WQ_NAME_LEN]; /* I: workqueue name */
+
+ /* hot fields used during command issue, aligned to cacheline */
+ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
+ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
+ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
};
+static struct kmem_cache *pwq_cache;
+
+static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
+static cpumask_var_t *wq_numa_possible_cpumask;
+ /* possible CPUs of each node */
+
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
+static bool wq_power_efficient = true;
+#else
+static bool wq_power_efficient;
+#endif
+
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
+static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
+
+/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+
+static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
+static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+
+static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static bool workqueue_freezing; /* PL: have wqs started freezing? */
+
+/* the per-cpu worker pools */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ cpu_worker_pools);
+
+static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
+
+/* PL: hash of all unbound pools keyed by pool->attrs */
+static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
+
+/* I: attributes used when instantiating standard unbound pools on demand */
+static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
+
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
struct workqueue_struct *system_wq __read_mostly;
+EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_highpri_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
-struct workqueue_struct *system_nrt_wq __read_mostly;
-struct workqueue_struct *system_unbound_wq __read_mostly;
-struct workqueue_struct *system_freezable_wq __read_mostly;
-EXPORT_SYMBOL_GPL(system_wq);
EXPORT_SYMBOL_GPL(system_long_wq);
-EXPORT_SYMBOL_GPL(system_nrt_wq);
+struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
+
+static int worker_thread(void *__worker);
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+ const struct workqueue_attrs *from);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
-#define for_each_busy_worker(worker, i, pos, gcwq) \
- for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
- hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
+#define assert_rcu_or_pool_mutex() \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU or wq_pool_mutex should be held")
-static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
- unsigned int sw)
-{
- if (cpu < nr_cpu_ids) {
- if (sw & 1) {
- cpu = cpumask_next(cpu, mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- }
- if (sw & 2)
- return WORK_CPU_UNBOUND;
- }
- return WORK_CPU_NONE;
-}
+#define assert_rcu_or_wq_mutex(wq) \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq->mutex), \
+ "sched RCU or wq->mutex should be held")
-static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
- struct workqueue_struct *wq)
-{
- return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
-}
+#define for_each_cpu_worker_pool(pool, cpu) \
+ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
+ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
+ (pool)++)
-/*
- * CPU iterators
+/**
+ * for_each_pool - iterate through all worker_pools in the system
+ * @pool: iteration cursor
+ * @pi: integer used for iteration
*
- * An extra gcwq is defined for an invalid cpu number
- * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
- * specific CPU. The following iterators are similar to
- * for_each_*_cpu() iterators but also considers the unbound gcwq.
+ * This must be called either with wq_pool_mutex held or sched RCU read
+ * locked. If the pool needs to be used beyond the locking in effect, the
+ * caller is responsible for guaranteeing that the pool stays online.
*
- * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
- * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
- * for_each_cwq_cpu() : possible CPUs for bound workqueues,
- * WORK_CPU_UNBOUND for unbound workqueues
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
*/
-#define for_each_gcwq_cpu(cpu) \
- for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+#define for_each_pool(pool, pi) \
+ idr_for_each_entry(&worker_pool_idr, pool, pi) \
+ if (({ assert_rcu_or_pool_mutex(); false; })) { } \
+ else
-#define for_each_online_gcwq_cpu(cpu) \
- for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+/**
+ * for_each_pool_worker - iterate through all workers of a worker_pool
+ * @worker: iteration cursor
+ * @pool: worker_pool to iterate workers of
+ *
+ * This must be called with @pool->attach_mutex.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pool_worker(worker, pool) \
+ list_for_each_entry((worker), &(pool)->workers, node) \
+ if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
+ else
-#define for_each_cwq_cpu(cpu, wq) \
- for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+/**
+ * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
+ * @pwq: iteration cursor
+ * @wq: the target workqueue
+ *
+ * This must be called either with wq->mutex held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pwq(pwq, wq) \
+ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
+ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
+ else
#ifdef CONFIG_DEBUG_OBJECTS_WORK
@@ -426,64 +504,56 @@ void destroy_work_on_stack(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+void destroy_delayed_work_on_stack(struct delayed_work *work)
+{
+ destroy_timer_on_stack(&work->timer);
+ debug_object_free(&work->work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
+
#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
-/* Serializes the accesses to the list of workqueues. */
-static DEFINE_SPINLOCK(workqueue_lock);
-static LIST_HEAD(workqueues);
-static bool workqueue_freezing; /* W: have wqs started freezing? */
-
-/*
- * The almighty global cpu workqueues. nr_running is the only field
- * which is expected to be used frequently by other cpus via
- * try_to_wake_up(). Put it in a separate cacheline.
- */
-static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
-
-/*
- * Global cpu workqueue and nr_running counter for unbound gcwq. The
- * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
- * workers have WORKER_UNBOUND set.
+/**
+ * worker_pool_assign_id - allocate ID and assing it to @pool
+ * @pool: the pool pointer of interest
+ *
+ * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
+ * successfully, -errno on failure.
*/
-static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
-
-static int worker_thread(void *__worker);
-
-static struct global_cwq *get_gcwq(unsigned int cpu)
+static int worker_pool_assign_id(struct worker_pool *pool)
{
- if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(global_cwq, cpu);
- else
- return &unbound_global_cwq;
-}
+ int ret;
-static atomic_t *get_gcwq_nr_running(unsigned int cpu)
-{
- if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(gcwq_nr_running, cpu);
- else
- return &unbound_gcwq_nr_running;
+ lockdep_assert_held(&wq_pool_mutex);
+
+ ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
+ GFP_KERNEL);
+ if (ret >= 0) {
+ pool->id = ret;
+ return 0;
+ }
+ return ret;
}
-static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
- struct workqueue_struct *wq)
+/**
+ * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
+ * @wq: the target workqueue
+ * @node: the node ID
+ *
+ * This must be called either with pwq_lock held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ *
+ * Return: The unbound pool_workqueue for @node.
+ */
+static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
+ int node)
{
- if (!(wq->flags & WQ_UNBOUND)) {
- if (likely(cpu < nr_cpu_ids)) {
-#ifdef CONFIG_SMP
- return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
-#else
- return wq->cpu_wq.single;
-#endif
- }
- } else if (likely(cpu == WORK_CPU_UNBOUND))
- return wq->cpu_wq.single;
- return NULL;
+ assert_rcu_or_wq_mutex(wq);
+ return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
static unsigned int work_color_to_flags(int color)
@@ -503,126 +573,197 @@ static int work_next_color(int color)
}
/*
- * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
- * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
- * cleared and the work data contains the cpu number it was last on.
- *
- * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
- * cwq, cpu or clear work->data. These functions should only be
- * called while the work is owned - ie. while the PENDING bit is set.
- *
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq
- * corresponding to a work. gcwq is available once the work has been
- * queued anywhere after initialization. cwq is available only from
- * queueing until execution starts.
+ * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
+ * contain the pointer to the queued pwq. Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and pool ID.
+ *
+ * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the pwq, pool or clear
+ * work->data. These functions should only be called while the work is
+ * owned - ie. while the PENDING bit is set.
+ *
+ * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
+ * corresponding to a work. Pool is available once the work has been
+ * queued anywhere after initialization until it is sync canceled. pwq is
+ * available only while the work item is queued.
+ *
+ * %WORK_OFFQ_CANCELING is used to mark a work item which is being
+ * canceled. While being canceled, a work item may have its PENDING set
+ * but stay off timer and worklist for arbitrarily long and nobody should
+ * try to steal the PENDING bit.
*/
static inline void set_work_data(struct work_struct *work, unsigned long data,
unsigned long flags)
{
- BUG_ON(!work_pending(work));
+ WARN_ON_ONCE(!work_pending(work));
atomic_long_set(&work->data, data | flags | work_static(work));
}
-static void set_work_cwq(struct work_struct *work,
- struct cpu_workqueue_struct *cwq,
+static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
unsigned long extra_flags)
{
- set_work_data(work, (unsigned long)cwq,
- WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
+ set_work_data(work, (unsigned long)pwq,
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}
-static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+static void set_work_pool_and_keep_pending(struct work_struct *work,
+ int pool_id)
{
- set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
+ WORK_STRUCT_PENDING);
+}
+
+static void set_work_pool_and_clear_pending(struct work_struct *work,
+ int pool_id)
+{
+ /*
+ * The following wmb is paired with the implied mb in
+ * test_and_set_bit(PENDING) and ensures all updates to @work made
+ * here are visible to and precede any updates by the next PENDING
+ * owner.
+ */
+ smp_wmb();
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
}
static void clear_work_data(struct work_struct *work)
{
- set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+ smp_wmb(); /* see set_work_pool_and_clear_pending() */
+ set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
-static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
+static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- if (data & WORK_STRUCT_CWQ)
+ if (data & WORK_STRUCT_PWQ)
return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
else
return NULL;
}
-static struct global_cwq *get_work_gcwq(struct work_struct *work)
+/**
+ * get_work_pool - return the worker_pool a given work was associated with
+ * @work: the work item of interest
+ *
+ * Pools are created and destroyed under wq_pool_mutex, and allows read
+ * access under sched-RCU read lock. As such, this function should be
+ * called under wq_pool_mutex or with preemption disabled.
+ *
+ * All fields of the returned pool are accessible as long as the above
+ * mentioned locking is in effect. If the returned pool needs to be used
+ * beyond the critical section, the caller is responsible for ensuring the
+ * returned pool is and stays online.
+ *
+ * Return: The worker_pool @work was last associated with. %NULL if none.
+ */
+static struct worker_pool *get_work_pool(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- unsigned int cpu;
+ int pool_id;
+
+ assert_rcu_or_pool_mutex();
- if (data & WORK_STRUCT_CWQ)
- return ((struct cpu_workqueue_struct *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
- cpu = data >> WORK_STRUCT_FLAG_BITS;
- if (cpu == WORK_CPU_NONE)
+ pool_id = data >> WORK_OFFQ_POOL_SHIFT;
+ if (pool_id == WORK_OFFQ_POOL_NONE)
return NULL;
- BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
- return get_gcwq(cpu);
+ return idr_find(&worker_pool_idr, pool_id);
+}
+
+/**
+ * get_work_pool_id - return the worker pool ID a given work is associated with
+ * @work: the work item of interest
+ *
+ * Return: The worker_pool ID @work was last associated with.
+ * %WORK_OFFQ_POOL_NONE if none.
+ */
+static int get_work_pool_id(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+
+ return data >> WORK_OFFQ_POOL_SHIFT;
+}
+
+static void mark_work_canceling(struct work_struct *work)
+{
+ unsigned long pool_id = get_work_pool_id(work);
+
+ pool_id <<= WORK_OFFQ_POOL_SHIFT;
+ set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
+}
+
+static bool work_is_canceling(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}
/*
- * Policy functions. These define the policies on how the global
- * worker pool is managed. Unless noted otherwise, these functions
- * assume that they're being called with gcwq->lock held.
+ * Policy functions. These define the policies on how the global worker
+ * pools are managed. Unless noted otherwise, these functions assume that
+ * they're being called with pool->lock held.
*/
-static bool __need_more_worker(struct global_cwq *gcwq)
+static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
- gcwq->flags & GCWQ_HIGHPRI_PENDING;
+ return !atomic_read(&pool->nr_running);
}
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
+ *
+ * Note that, because unbound workers never contribute to nr_running, this
+ * function will always return %true for unbound pools as long as the
+ * worklist isn't empty.
*/
-static bool need_more_worker(struct global_cwq *gcwq)
+static bool need_more_worker(struct worker_pool *pool)
{
- return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+ return !list_empty(&pool->worklist) && __need_more_worker(pool);
}
/* Can I start working? Called from busy but !running workers. */
-static bool may_start_working(struct global_cwq *gcwq)
+static bool may_start_working(struct worker_pool *pool)
{
- return gcwq->nr_idle;
+ return pool->nr_idle;
}
/* Do I need to keep working? Called from currently running workers. */
-static bool keep_working(struct global_cwq *gcwq)
+static bool keep_working(struct worker_pool *pool)
{
- atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
-
- return !list_empty(&gcwq->worklist) &&
- (atomic_read(nr_running) <= 1 ||
- gcwq->flags & GCWQ_HIGHPRI_PENDING);
+ return !list_empty(&pool->worklist) &&
+ atomic_read(&pool->nr_running) <= 1;
}
/* Do we need a new worker? Called from manager. */
-static bool need_to_create_worker(struct global_cwq *gcwq)
+static bool need_to_create_worker(struct worker_pool *pool)
{
- return need_more_worker(gcwq) && !may_start_working(gcwq);
-}
-
-/* Do I need to be the manager? */
-static bool need_to_manage_workers(struct global_cwq *gcwq)
-{
- return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+ return need_more_worker(pool) && !may_start_working(pool);
}
/* Do we have too many workers and should some go away? */
-static bool too_many_workers(struct global_cwq *gcwq)
+static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
- int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
- int nr_busy = gcwq->nr_workers - nr_idle;
+ bool managing = mutex_is_locked(&pool->manager_arb);
+ int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
+ int nr_busy = pool->nr_workers - nr_idle;
+
+ /*
+ * nr_idle and idle_list may disagree if idle rebinding is in
+ * progress. Never return %true if idle_list is empty.
+ */
+ if (list_empty(&pool->idle_list))
+ return false;
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -631,27 +772,27 @@ static bool too_many_workers(struct global_cwq *gcwq)
* Wake up functions.
*/
-/* Return the first worker. Safe with preemption disabled */
-static struct worker *first_worker(struct global_cwq *gcwq)
+/* Return the first idle worker. Safe with preemption disabled */
+static struct worker *first_idle_worker(struct worker_pool *pool)
{
- if (unlikely(list_empty(&gcwq->idle_list)))
+ if (unlikely(list_empty(&pool->idle_list)))
return NULL;
- return list_first_entry(&gcwq->idle_list, struct worker, entry);
+ return list_first_entry(&pool->idle_list, struct worker, entry);
}
/**
* wake_up_worker - wake up an idle worker
- * @gcwq: gcwq to wake worker for
+ * @pool: worker pool to wake worker from
*
- * Wake up the first idle worker of @gcwq.
+ * Wake up the first idle worker of @pool.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void wake_up_worker(struct global_cwq *gcwq)
+static void wake_up_worker(struct worker_pool *pool)
{
- struct worker *worker = first_worker(gcwq);
+ struct worker *worker = first_idle_worker(pool);
if (likely(worker))
wake_up_process(worker->task);
@@ -668,12 +809,14 @@ static void wake_up_worker(struct global_cwq *gcwq)
* CONTEXT:
* spin_lock_irq(rq->lock)
*/
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
+void wq_worker_waking_up(struct task_struct *task, int cpu)
{
struct worker *worker = kthread_data(task);
- if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(get_gcwq_nr_running(cpu));
+ if (!(worker->flags & WORKER_NOT_RUNNING)) {
+ WARN_ON_ONCE(worker->pool->cpu != cpu);
+ atomic_inc(&worker->pool->nr_running);
+ }
}
/**
@@ -688,35 +831,42 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
* CONTEXT:
* spin_lock_irq(rq->lock)
*
- * RETURNS:
+ * Return:
* Worker task on @cpu to wake up, %NULL if none.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
- unsigned int cpu)
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
- struct global_cwq *gcwq = get_gcwq(cpu);
- atomic_t *nr_running = get_gcwq_nr_running(cpu);
+ struct worker_pool *pool;
+ /*
+ * Rescuers, which may not have all the fields set up like normal
+ * workers, also reach here, let's not access anything before
+ * checking NOT_RUNNING.
+ */
if (worker->flags & WORKER_NOT_RUNNING)
return NULL;
+ pool = worker->pool;
+
/* this can only happen on the local cpu */
- BUG_ON(cpu != raw_smp_processor_id());
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+ return NULL;
/*
* The counterpart of the following dec_and_test, implied mb,
* worklist not empty test sequence is in insert_work().
* Please read comment there.
*
- * NOT_RUNNING is clear. This means that trustee is not in
- * charge and we're running on the local cpu w/ rq lock held
- * and preemption disabled, which in turn means that none else
- * could be manipulating idle_list, so dereferencing idle_list
- * without gcwq lock is safe.
- */
- if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
- to_wakeup = first_worker(gcwq);
+ * NOT_RUNNING is clear. This means that we're bound to and
+ * running on the local cpu w/ rq lock held and preemption
+ * disabled, which in turn means that none else could be
+ * manipulating idle_list, so dereferencing idle_list without pool
+ * lock is safe.
+ */
+ if (atomic_dec_and_test(&pool->nr_running) &&
+ !list_empty(&pool->worklist))
+ to_wakeup = first_idle_worker(pool);
return to_wakeup ? to_wakeup->task : NULL;
}
@@ -731,12 +881,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
* woken up.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags,
bool wakeup)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
@@ -747,14 +897,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
*/
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
-
if (wakeup) {
- if (atomic_dec_and_test(nr_running) &&
- !list_empty(&gcwq->worklist))
- wake_up_worker(gcwq);
+ if (atomic_dec_and_test(&pool->nr_running) &&
+ !list_empty(&pool->worklist))
+ wake_up_worker(pool);
} else
- atomic_dec(nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -768,11 +916,11 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
WARN_ON_ONCE(worker->task != current);
@@ -786,344 +934,543 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+ atomic_inc(&pool->nr_running);
}
/**
- * busy_worker_head - return the busy hash head for a work
- * @gcwq: gcwq of interest
- * @work: work to be hashed
+ * find_worker_executing_work - find worker which is executing a work
+ * @pool: pool of interest
+ * @work: work to find worker for
*
- * Return hash head of @gcwq for @work.
+ * Find a worker which is executing @work on @pool by searching
+ * @pool->busy_hash which is keyed by the address of @work. For a worker
+ * to match, its current execution should match the address of @work and
+ * its work function. This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky. A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item. If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address and work function to avoid
+ * false positives. Note that this isn't complete as one may construct a
+ * work function which can introduce dependency onto itself through a
+ * recycled work item. Well, if somebody wants to shoot oneself in the
+ * foot that badly, there's only so much we can do, and if such deadlock
+ * actually occurs, it should be easy to locate the culprit work function.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*
- * RETURNS:
- * Pointer to the hash head.
+ * Return:
+ * Pointer to worker which is executing @work if found, %NULL
+ * otherwise.
*/
-static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
- struct work_struct *work)
+static struct worker *find_worker_executing_work(struct worker_pool *pool,
+ struct work_struct *work)
{
- const int base_shift = ilog2(sizeof(struct work_struct));
- unsigned long v = (unsigned long)work;
+ struct worker *worker;
- /* simple shift and fold hash, do we need something better? */
- v >>= base_shift;
- v += v >> BUSY_WORKER_HASH_ORDER;
- v &= BUSY_WORKER_HASH_MASK;
+ hash_for_each_possible(pool->busy_hash, worker, hentry,
+ (unsigned long)work)
+ if (worker->current_work == work &&
+ worker->current_func == work->func)
+ return worker;
- return &gcwq->busy_hash[v];
+ return NULL;
}
/**
- * __find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @bwh: hash head as returned by busy_worker_head()
- * @work: work to find worker for
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
*
- * Find a worker which is executing @work on @gcwq. @bwh should be
- * the hash head obtained by calling busy_worker_head() with the same
- * work.
+ * Schedule linked works starting from @work to @head. Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
*
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work. This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
*
- * RETURNS:
- * Pointer to worker which is executing @work if found, NULL
- * otherwise.
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
*/
-static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
- struct hlist_head *bwh,
- struct work_struct *work)
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+ struct work_struct **nextp)
{
- struct worker *worker;
- struct hlist_node *tmp;
+ struct work_struct *n;
- hlist_for_each_entry(worker, tmp, bwh, hentry)
- if (worker->current_work == work)
- return worker;
- return NULL;
+ /*
+ * Linked worklist will always end before the end of the list,
+ * use NULL for list head.
+ */
+ list_for_each_entry_safe_from(work, n, NULL, entry) {
+ list_move_tail(&work->entry, head);
+ if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+ break;
+ }
+
+ /*
+ * If we're already inside safe list traversal and have moved
+ * multiple works to the scheduled queue, the next position
+ * needs to be updated.
+ */
+ if (nextp)
+ *nextp = n;
}
/**
- * find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @work: work to find worker for
+ * get_pwq - get an extra reference on the specified pool_workqueue
+ * @pwq: pool_workqueue to get
*
- * Find a worker which is executing @work on @gcwq. This function is
- * identical to __find_worker_executing_work() except that this
- * function calculates @bwh itself.
+ * Obtain an extra reference on @pwq. The caller should guarantee that
+ * @pwq has positive refcnt and be holding the matching pool->lock.
+ */
+static void get_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&pwq->pool->lock);
+ WARN_ON_ONCE(pwq->refcnt <= 0);
+ pwq->refcnt++;
+}
+
+/**
+ * put_pwq - put a pool_workqueue reference
+ * @pwq: pool_workqueue to put
*
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
+ * destruction. The caller should be holding the matching pool->lock.
+ */
+static void put_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&pwq->pool->lock);
+ if (likely(--pwq->refcnt))
+ return;
+ if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
+ return;
+ /*
+ * @pwq can't be released under pool->lock, bounce to
+ * pwq_unbound_release_workfn(). This never recurses on the same
+ * pool->lock as this path is taken only for unbound workqueues and
+ * the release work item is scheduled on a per-cpu workqueue. To
+ * avoid lockdep warning, unbound pool->locks are given lockdep
+ * subclass of 1 in get_unbound_pool().
+ */
+ schedule_work(&pwq->unbound_release_work);
+}
+
+/**
+ * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
+ * @pwq: pool_workqueue to put (can be %NULL)
*
- * RETURNS:
- * Pointer to worker which is executing @work if found, NULL
- * otherwise.
+ * put_pwq() with locking. This function also allows %NULL @pwq.
*/
-static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
- struct work_struct *work)
+static void put_pwq_unlocked(struct pool_workqueue *pwq)
+{
+ if (pwq) {
+ /*
+ * As both pwqs and pools are sched-RCU protected, the
+ * following lock operations are safe.
+ */
+ spin_lock_irq(&pwq->pool->lock);
+ put_pwq(pwq);
+ spin_unlock_irq(&pwq->pool->lock);
+ }
+}
+
+static void pwq_activate_delayed_work(struct work_struct *work)
{
- return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
- work);
+ struct pool_workqueue *pwq = get_work_pwq(work);
+
+ trace_workqueue_activate_work(work);
+ move_linked_works(work, &pwq->pool->worklist, NULL);
+ __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+ pwq->nr_active++;
+}
+
+static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
+{
+ struct work_struct *work = list_first_entry(&pwq->delayed_works,
+ struct work_struct, entry);
+
+ pwq_activate_delayed_work(work);
}
/**
- * gcwq_determine_ins_pos - find insertion position
- * @gcwq: gcwq of interest
- * @cwq: cwq a work is being queued for
+ * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
+ * @pwq: pwq of interest
+ * @color: color of work which left the queue
*
- * A work for @cwq is about to be queued on @gcwq, determine insertion
- * position for the work. If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue. This
- * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
- * there are HIGHPRI works pending.
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to inserstion position.
+ * spin_lock_irq(pool->lock).
*/
-static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
- struct cpu_workqueue_struct *cwq)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
- struct work_struct *twork;
+ /* uncolored work items don't participate in flushing or nr_active */
+ if (color == WORK_NO_COLOR)
+ goto out_put;
- if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
- return &gcwq->worklist;
+ pwq->nr_in_flight[color]--;
- list_for_each_entry(twork, &gcwq->worklist, entry) {
- struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
+ pwq->nr_active--;
+ if (!list_empty(&pwq->delayed_works)) {
+ /* one down, submit a delayed one */
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
+ }
- if (!(tcwq->wq->flags & WQ_HIGHPRI))
- break;
+ /* is flush in progress and are we at the flushing tip? */
+ if (likely(pwq->flush_color != color))
+ goto out_put;
+
+ /* are there still in-flight works? */
+ if (pwq->nr_in_flight[color])
+ goto out_put;
+
+ /* this pwq is done, clear flush_color */
+ pwq->flush_color = -1;
+
+ /*
+ * If this was the last pwq, wake up the first flusher. It
+ * will handle the rest.
+ */
+ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
+ complete(&pwq->wq->first_flusher->done);
+out_put:
+ put_pwq(pwq);
+}
+
+/**
+ * try_to_grab_pending - steal work item from worklist and disable irq
+ * @work: work item to steal
+ * @is_dwork: @work is a delayed_work
+ * @flags: place to store irq state
+ *
+ * Try to grab PENDING bit of @work. This function can handle @work in any
+ * stable state - idle, on timer or on worklist.
+ *
+ * Return:
+ * 1 if @work was pending and we successfully stole PENDING
+ * 0 if @work was idle and we claimed PENDING
+ * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
+ * -ENOENT if someone else is canceling @work, this state may persist
+ * for arbitrarily long
+ *
+ * Note:
+ * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
+ * interrupted while holding PENDING and @work off queue, irq must be
+ * disabled on entry. This, combined with delayed_work->timer being
+ * irqsafe, ensures that we return -EAGAIN for finite short period of time.
+ *
+ * On successful return, >= 0, irq is disabled and the caller is
+ * responsible for releasing it using local_irq_restore(*@flags).
+ *
+ * This function is safe to call from any context including IRQ handler.
+ */
+static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
+ unsigned long *flags)
+{
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+
+ local_irq_save(*flags);
+
+ /* try to steal the timer if it exists */
+ if (is_dwork) {
+ struct delayed_work *dwork = to_delayed_work(work);
+
+ /*
+ * dwork->timer is irqsafe. If del_timer() fails, it's
+ * guaranteed that the timer is not queued anywhere and not
+ * running on the local CPU.
+ */
+ if (likely(del_timer(&dwork->timer)))
+ return 1;
}
- gcwq->flags |= GCWQ_HIGHPRI_PENDING;
- return &twork->entry;
+ /* try to claim PENDING the normal way */
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
+ return 0;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+ pool = get_work_pool(work);
+ if (!pool)
+ goto fail;
+
+ spin_lock(&pool->lock);
+ /*
+ * work->data is guaranteed to point to pwq only while the work
+ * item is queued on pwq->wq, and both updating work->data to point
+ * to pwq on queueing and to pool on dequeueing are done under
+ * pwq->pool->lock. This in turn guarantees that, if work->data
+ * points to pwq which is associated with a locked pool, the work
+ * item is currently queued on that pool.
+ */
+ pwq = get_work_pwq(work);
+ if (pwq && pwq->pool == pool) {
+ debug_work_deactivate(work);
+
+ /*
+ * A delayed work item cannot be grabbed directly because
+ * it might have linked NO_COLOR work items which, if left
+ * on the delayed_list, will confuse pwq->nr_active
+ * management later on and cause stall. Make sure the work
+ * item is activated before grabbing.
+ */
+ if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+ pwq_activate_delayed_work(work);
+
+ list_del_init(&work->entry);
+ pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+
+ /* work->data points to pwq iff queued, point to pool */
+ set_work_pool_and_keep_pending(work, pool->id);
+
+ spin_unlock(&pool->lock);
+ return 1;
+ }
+ spin_unlock(&pool->lock);
+fail:
+ local_irq_restore(*flags);
+ if (work_is_canceling(work))
+ return -ENOENT;
+ cpu_relax();
+ return -EAGAIN;
}
/**
- * insert_work - insert a work into gcwq
- * @cwq: cwq @work belongs to
+ * insert_work - insert a work into a pool
+ * @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
- * Insert @work which belongs to @cwq into @gcwq after @head.
- * @extra_flags is or'd to work_struct flags.
+ * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
+ * work_struct flags.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, struct list_head *head,
- unsigned int extra_flags)
+static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
+ struct list_head *head, unsigned int extra_flags)
{
- struct global_cwq *gcwq = cwq->gcwq;
+ struct worker_pool *pool = pwq->pool;
/* we own @work, set data and link */
- set_work_cwq(work, cwq, extra_flags);
-
- /*
- * Ensure that we get the right work->data if we see the
- * result of list_add() below, see try_to_grab_pending().
- */
- smp_wmb();
-
+ set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
+ get_pwq(pwq);
/*
- * Ensure either worker_sched_deactivated() sees the above
- * list_add_tail() or we see zero nr_running to avoid workers
- * lying around lazily while there are works to be processed.
+ * Ensure either wq_worker_sleeping() sees the above
+ * list_add_tail() or we see zero nr_running to avoid workers lying
+ * around lazily while there are works to be processed.
*/
smp_mb();
- if (__need_more_worker(gcwq))
- wake_up_worker(gcwq);
+ if (__need_more_worker(pool))
+ wake_up_worker(pool);
}
/*
* Test whether @work is being queued from another work executing on the
- * same workqueue. This is rather expensive and should only be used from
- * cold paths.
+ * same workqueue.
*/
static bool is_chained_work(struct workqueue_struct *wq)
{
- unsigned long flags;
- unsigned int cpu;
-
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct worker *worker;
- struct hlist_node *pos;
- int i;
+ struct worker *worker;
- spin_lock_irqsave(&gcwq->lock, flags);
- for_each_busy_worker(worker, i, pos, gcwq) {
- if (worker->task != current)
- continue;
- spin_unlock_irqrestore(&gcwq->lock, flags);
- /*
- * I'm @worker, no locking necessary. See if @work
- * is headed to the same workqueue.
- */
- return worker->current_cwq->wq == wq;
- }
- spin_unlock_irqrestore(&gcwq->lock, flags);
- }
- return false;
+ worker = current_wq_worker();
+ /*
+ * Return %true iff I'm a worker execuing a work item on @wq. If
+ * I'm @worker, it's safe to dereference it without locking.
+ */
+ return worker && worker->current_pwq->wq == wq;
}
-static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
+static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
+ struct pool_workqueue *pwq;
+ struct worker_pool *last_pool;
struct list_head *worklist;
unsigned int work_flags;
- unsigned long flags;
+ unsigned int req_cpu = cpu;
+
+ /*
+ * While a work item is PENDING && off queue, a task trying to
+ * steal the PENDING will busy-loop waiting for it to either get
+ * queued or lose PENDING. Grabbing PENDING and queueing should
+ * happen with IRQ disabled.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
debug_work_activate(work);
- /* if dying, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & WQ_DRAINING) &&
+ /* if draining, only works from the same workqueue are allowed */
+ if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
+retry:
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
- /* determine gcwq to use */
- if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *last_gcwq;
-
- if (unlikely(cpu == WORK_CPU_UNBOUND))
- cpu = raw_smp_processor_id();
+ /* pwq which will be used unless @work is executing elsewhere */
+ if (!(wq->flags & WQ_UNBOUND))
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ else
+ pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
- /*
- * It's multi cpu. If @wq is non-reentrant and @work
- * was previously on a different cpu, it might still
- * be running there, in which case the work needs to
- * be queued on that cpu to guarantee non-reentrance.
- */
- gcwq = get_gcwq(cpu);
- if (wq->flags & WQ_NON_REENTRANT &&
- (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
- struct worker *worker;
+ /*
+ * If @work was previously on a different pool, it might still be
+ * running there, in which case the work needs to be queued on that
+ * pool to guarantee non-reentrancy.
+ */
+ last_pool = get_work_pool(work);
+ if (last_pool && last_pool != pwq->pool) {
+ struct worker *worker;
- spin_lock_irqsave(&last_gcwq->lock, flags);
+ spin_lock(&last_pool->lock);
- worker = find_worker_executing_work(last_gcwq, work);
+ worker = find_worker_executing_work(last_pool, work);
- if (worker && worker->current_cwq->wq == wq)
- gcwq = last_gcwq;
- else {
- /* meh... not running there, queue here */
- spin_unlock_irqrestore(&last_gcwq->lock, flags);
- spin_lock_irqsave(&gcwq->lock, flags);
- }
- } else
- spin_lock_irqsave(&gcwq->lock, flags);
+ if (worker && worker->current_pwq->wq == wq) {
+ pwq = worker->current_pwq;
+ } else {
+ /* meh... not running there, queue here */
+ spin_unlock(&last_pool->lock);
+ spin_lock(&pwq->pool->lock);
+ }
} else {
- gcwq = get_gcwq(WORK_CPU_UNBOUND);
- spin_lock_irqsave(&gcwq->lock, flags);
+ spin_lock(&pwq->pool->lock);
+ }
+
+ /*
+ * pwq is determined and locked. For unbound pools, we could have
+ * raced with pwq release and it could already be dead. If its
+ * refcnt is zero, repeat pwq selection. Note that pwqs never die
+ * without another pwq replacing it in the numa_pwq_tbl or while
+ * work items are executing on it, so the retrying is guaranteed to
+ * make forward-progress.
+ */
+ if (unlikely(!pwq->refcnt)) {
+ if (wq->flags & WQ_UNBOUND) {
+ spin_unlock(&pwq->pool->lock);
+ cpu_relax();
+ goto retry;
+ }
+ /* oops */
+ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
+ wq->name, cpu);
}
- /* gcwq determined, get cwq and queue */
- cwq = get_cwq(gcwq->cpu, wq);
- trace_workqueue_queue_work(cpu, cwq, work);
+ /* pwq determined, queue */
+ trace_workqueue_queue_work(req_cpu, pwq, work);
- BUG_ON(!list_empty(&work->entry));
+ if (WARN_ON(!list_empty(&work->entry))) {
+ spin_unlock(&pwq->pool->lock);
+ return;
+ }
- cwq->nr_in_flight[cwq->work_color]++;
- work_flags = work_color_to_flags(cwq->work_color);
+ pwq->nr_in_flight[pwq->work_color]++;
+ work_flags = work_color_to_flags(pwq->work_color);
- if (likely(cwq->nr_active < cwq->max_active)) {
+ if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
- cwq->nr_active++;
- worklist = gcwq_determine_ins_pos(gcwq, cwq);
+ pwq->nr_active++;
+ worklist = &pwq->pool->worklist;
} else {
work_flags |= WORK_STRUCT_DELAYED;
- worklist = &cwq->delayed_works;
+ worklist = &pwq->delayed_works;
}
- insert_work(cwq, work, worklist, work_flags);
+ insert_work(pwq, work, worklist, work_flags);
- spin_unlock_irqrestore(&gcwq->lock, flags);
+ spin_unlock(&pwq->pool->lock);
}
/**
- * queue_work - queue work on a workqueue
- * @wq: workqueue to use
- * @work: work to queue
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- *
- * We queue the work to the CPU on which it was submitted, but if the CPU dies
- * it can be processed by another CPU.
- */
-int queue_work(struct workqueue_struct *wq, struct work_struct *work)
-{
- int ret;
-
- ret = queue_work_on(get_cpu(), wq, work);
- put_cpu();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(queue_work);
-
-/**
* queue_work_on - queue work on specific cpu
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- *
* We queue the work to a specific CPU, the caller must ensure it
* can't go away.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
*/
-int
-queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+bool queue_work_on(int cpu, struct workqueue_struct *wq,
+ struct work_struct *work)
{
- int ret = 0;
+ bool ret = false;
+ unsigned long flags;
+
+ local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
- ret = 1;
+ ret = true;
}
+
+ local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(queue_work_on);
+EXPORT_SYMBOL(queue_work_on);
-static void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
- __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
+ /* should have been called from irqsafe timer with irq already off */
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
}
+EXPORT_SYMBOL(delayed_work_timer_fn);
-/**
- * queue_delayed_work - queue work on a workqueue after delay
- * @wq: workqueue to use
- * @dwork: delayable work to queue
- * @delay: number of jiffies to wait before queueing
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- */
-int queue_delayed_work(struct workqueue_struct *wq,
- struct delayed_work *dwork, unsigned long delay)
+static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
{
- if (delay == 0)
- return queue_work(wq, &dwork->work);
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
+
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
+ timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer_pending(timer));
+ WARN_ON_ONCE(!list_empty(&work->entry));
- return queue_delayed_work_on(-1, wq, dwork, delay);
+ /*
+ * If @delay is 0, queue @dwork->work immediately. This is for
+ * both optimization and correctness. The earliest @timer can
+ * expire is on the closest next tick and delayed_work users depend
+ * on that there's no such delay when @delay is 0.
+ */
+ if (!delay) {
+ __queue_work(cpu, wq, &dwork->work);
+ return;
+ }
+
+ timer_stats_timer_set_start_info(&dwork->timer);
+
+ dwork->wq = wq;
+ dwork->cpu = cpu;
+ timer->expires = jiffies + delay;
+
+ if (unlikely(cpu != WORK_CPU_UNBOUND))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
}
-EXPORT_SYMBOL_GPL(queue_delayed_work);
/**
* queue_delayed_work_on - queue work on specific CPU after delay
@@ -1132,53 +1479,67 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Return: %false if @work was already on a queue, %true otherwise. If
+ * @delay is zero and @dwork is idle, it will be scheduled for immediate
+ * execution.
*/
-int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct delayed_work *dwork, unsigned long delay)
+bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ bool ret = false;
+ unsigned long flags;
- if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
- unsigned int lcpu;
-
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
-
- timer_stats_timer_set_start_info(&dwork->timer);
+ /* read the comment in __queue_work() */
+ local_irq_save(flags);
- /*
- * This stores cwq for the moment, for the timer_fn.
- * Note that the work's gcwq is preserved to allow
- * reentrance detection for delayed works.
- */
- if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *gcwq = get_work_gcwq(work);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_delayed_work(cpu, wq, dwork, delay);
+ ret = true;
+ }
- if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
- lcpu = gcwq->cpu;
- else
- lcpu = raw_smp_processor_id();
- } else
- lcpu = WORK_CPU_UNBOUND;
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL(queue_delayed_work_on);
- set_work_cwq(work, get_cwq(lcpu, wq), 0);
+/**
+ * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
+ * modify @dwork's timer so that it expires after @delay. If @delay is
+ * zero, @work is guaranteed to be scheduled immediately regardless of its
+ * current state.
+ *
+ * Return: %false if @dwork was idle and queued, %true if @dwork was
+ * pending and its timer was modified.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ * See try_to_grab_pending() for details.
+ */
+bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ unsigned long flags;
+ int ret;
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)dwork;
- timer->function = delayed_work_timer_fn;
+ do {
+ ret = try_to_grab_pending(&dwork->work, true, &flags);
+ } while (unlikely(ret == -EAGAIN));
- if (unlikely(cpu >= 0))
- add_timer_on(timer, cpu);
- else
- add_timer(timer);
- ret = 1;
+ if (likely(ret >= 0)) {
+ __queue_delayed_work(cpu, wq, dwork, delay);
+ local_irq_restore(flags);
}
+
+ /* -ENOENT from try_to_grab_pending() becomes %true */
return ret;
}
-EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+EXPORT_SYMBOL_GPL(mod_delayed_work_on);
/**
* worker_enter_idle - enter idle state
@@ -1188,34 +1549,37 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
* necessary.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
- BUG_ON(worker->flags & WORKER_IDLE);
- BUG_ON(!list_empty(&worker->entry) &&
- (worker->hentry.next || worker->hentry.pprev));
+ if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
+ WARN_ON_ONCE(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev)))
+ return;
/* can't use worker_set_flags(), also called from start_worker() */
worker->flags |= WORKER_IDLE;
- gcwq->nr_idle++;
+ pool->nr_idle++;
worker->last_active = jiffies;
/* idle_list is LIFO */
- list_add(&worker->entry, &gcwq->idle_list);
+ list_add(&worker->entry, &pool->idle_list);
- if (likely(!(worker->flags & WORKER_ROGUE))) {
- if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
- mod_timer(&gcwq->idle_timer,
- jiffies + IDLE_WORKER_TIMEOUT);
- } else
- wake_up_all(&gcwq->trustee_wait);
+ if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
- /* sanity check nr_running */
- WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
- atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+ /*
+ * Sanity check nr_running. Because wq_unbind_fn() releases
+ * pool->lock between setting %WORKER_UNBOUND and zapping
+ * nr_running, the warning may trigger spuriously. Check iff
+ * unbind is not in progress.
+ */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
+ pool->nr_workers == pool->nr_idle &&
+ atomic_read(&pool->nr_running));
}
/**
@@ -1225,184 +1589,146 @@ static void worker_enter_idle(struct worker *worker)
* @worker is leaving idle state. Update stats.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
- BUG_ON(!(worker->flags & WORKER_IDLE));
+ if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
+ return;
worker_clr_flags(worker, WORKER_IDLE);
- gcwq->nr_idle--;
+ pool->nr_idle--;
list_del_init(&worker->entry);
}
-/**
- * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
- * @worker: self
- *
- * Works which are scheduled while the cpu is online must at least be
- * scheduled to a worker which is bound to the cpu so that if they are
- * flushed from cpu callbacks while cpu is going down, they are
- * guaranteed to execute on the cpu.
- *
- * This function is to be used by rogue workers and rescuers to bind
- * themselves to the target cpu and may race with cpu going down or
- * coming online. kthread_bind() can't be used because it may put the
- * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and gcwq may be
- * [dis]associated in the meantime.
- *
- * This function tries set_cpus_allowed() and locks gcwq and verifies
- * the binding against GCWQ_DISASSOCIATED which is set during
- * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
- * idle state or fetches works without dropping lock, it can guarantee
- * the scheduling requirement described in the first paragraph.
- *
- * CONTEXT:
- * Might sleep. Called without any lock but returns with gcwq->lock
- * held.
- *
- * RETURNS:
- * %true if the associated gcwq is online (@worker is successfully
- * bound), %false if offline.
- */
-static bool worker_maybe_bind_and_lock(struct worker *worker)
-__acquires(&gcwq->lock)
+static struct worker *alloc_worker(void)
{
- struct global_cwq *gcwq = worker->gcwq;
- struct task_struct *task = worker->task;
-
- while (true) {
- /*
- * The following call may fail, succeed or succeed
- * without actually migrating the task to the cpu if
- * it races with cpu hotunplug operation. Verify
- * against GCWQ_DISASSOCIATED.
- */
- if (!(gcwq->flags & GCWQ_DISASSOCIATED))
- set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
-
- spin_lock_irq(&gcwq->lock);
- if (gcwq->flags & GCWQ_DISASSOCIATED)
- return false;
- if (task_cpu(task) == gcwq->cpu &&
- cpumask_equal(&current->cpus_allowed,
- get_cpu_mask(gcwq->cpu)))
- return true;
- spin_unlock_irq(&gcwq->lock);
+ struct worker *worker;
- /*
- * We've raced with CPU hot[un]plug. Give it a breather
- * and retry migration. cond_resched() is required here;
- * otherwise, we might deadlock against cpu_stop trying to
- * bring down the CPU on non-preemptive kernel.
- */
- cpu_relax();
- cond_resched();
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (worker) {
+ INIT_LIST_HEAD(&worker->entry);
+ INIT_LIST_HEAD(&worker->scheduled);
+ INIT_LIST_HEAD(&worker->node);
+ /* on creation a worker is in !idle && prep state */
+ worker->flags = WORKER_PREP;
}
+ return worker;
}
-/*
- * Function for worker->rebind_work used to rebind rogue busy workers
- * to the associated cpu which is coming back online. This is
- * scheduled by cpu up but can race with other cpu hotplug operations
- * and may be executed twice without intervening cpu down.
+/**
+ * worker_attach_to_pool() - attach a worker to a pool
+ * @worker: worker to be attached
+ * @pool: the target pool
+ *
+ * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
+ * cpu-binding of @worker are kept coordinated with the pool across
+ * cpu-[un]hotplugs.
*/
-static void worker_rebind_fn(struct work_struct *work)
+static void worker_attach_to_pool(struct worker *worker,
+ struct worker_pool *pool)
{
- struct worker *worker = container_of(work, struct worker, rebind_work);
- struct global_cwq *gcwq = worker->gcwq;
+ mutex_lock(&pool->attach_mutex);
+
+ /*
+ * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
+ * online CPUs. It'll be re-applied when any of the CPUs come up.
+ */
+ set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+
+ /*
+ * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
+ * stable across this function. See the comments above the
+ * flag definition for details.
+ */
+ if (pool->flags & POOL_DISASSOCIATED)
+ worker->flags |= WORKER_UNBOUND;
- if (worker_maybe_bind_and_lock(worker))
- worker_clr_flags(worker, WORKER_REBIND);
+ list_add_tail(&worker->node, &pool->workers);
- spin_unlock_irq(&gcwq->lock);
+ mutex_unlock(&pool->attach_mutex);
}
-static struct worker *alloc_worker(void)
+/**
+ * worker_detach_from_pool() - detach a worker from its pool
+ * @worker: worker which is attached to its pool
+ * @pool: the pool @worker is attached to
+ *
+ * Undo the attaching which had been done in worker_attach_to_pool(). The
+ * caller worker shouldn't access to the pool after detached except it has
+ * other reference to the pool.
+ */
+static void worker_detach_from_pool(struct worker *worker,
+ struct worker_pool *pool)
{
- struct worker *worker;
+ struct completion *detach_completion = NULL;
- worker = kzalloc(sizeof(*worker), GFP_KERNEL);
- if (worker) {
- INIT_LIST_HEAD(&worker->entry);
- INIT_LIST_HEAD(&worker->scheduled);
- INIT_WORK(&worker->rebind_work, worker_rebind_fn);
- /* on creation a worker is in !idle && prep state */
- worker->flags = WORKER_PREP;
- }
- return worker;
+ mutex_lock(&pool->attach_mutex);
+ list_del(&worker->node);
+ if (list_empty(&pool->workers))
+ detach_completion = pool->detach_completion;
+ mutex_unlock(&pool->attach_mutex);
+
+ if (detach_completion)
+ complete(detach_completion);
}
/**
* create_worker - create a new workqueue worker
- * @gcwq: gcwq the new worker will belong to
- * @bind: whether to set affinity to @cpu or not
+ * @pool: pool the new worker will belong to
*
- * Create a new worker which is bound to @gcwq. The returned worker
- * can be started by calling start_worker() or destroyed using
- * destroy_worker().
+ * Create a new worker which is attached to @pool. The new worker must be
+ * started by start_worker().
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
*
- * RETURNS:
+ * Return:
* Pointer to the newly created worker.
*/
-static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+static struct worker *create_worker(struct worker_pool *pool)
{
- bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
struct worker *worker = NULL;
int id = -1;
+ char id_buf[16];
- spin_lock_irq(&gcwq->lock);
- while (ida_get_new(&gcwq->worker_ida, &id)) {
- spin_unlock_irq(&gcwq->lock);
- if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
- goto fail;
- spin_lock_irq(&gcwq->lock);
- }
- spin_unlock_irq(&gcwq->lock);
+ /* ID is needed to determine kthread name */
+ id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto fail;
worker = alloc_worker();
if (!worker)
goto fail;
- worker->gcwq = gcwq;
+ worker->pool = pool;
worker->id = id;
- if (!on_unbound_cpu)
- worker->task = kthread_create_on_node(worker_thread,
- worker,
- cpu_to_node(gcwq->cpu),
- "kworker/%u:%d", gcwq->cpu, id);
+ if (pool->cpu >= 0)
+ snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
+ pool->attrs->nice < 0 ? "H" : "");
else
- worker->task = kthread_create(worker_thread, worker,
- "kworker/u:%d", id);
+ snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
+
+ worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
+ "kworker/%s", id_buf);
if (IS_ERR(worker->task))
goto fail;
- /*
- * A rogue worker will become a regular one if CPU comes
- * online later on. Make sure every worker has
- * PF_THREAD_BOUND set.
- */
- if (bind && !on_unbound_cpu)
- kthread_bind(worker->task, gcwq->cpu);
- else {
- worker->task->flags |= PF_THREAD_BOUND;
- if (on_unbound_cpu)
- worker->flags |= WORKER_UNBOUND;
- }
+ set_user_nice(worker->task, pool->attrs->nice);
+
+ /* prevent userland from meddling with cpumask of workqueue workers */
+ worker->task->flags |= PF_NO_SETAFFINITY;
+
+ /* successful, attach the worker to the pool */
+ worker_attach_to_pool(worker, pool);
return worker;
+
fail:
- if (id >= 0) {
- spin_lock_irq(&gcwq->lock);
- ida_remove(&gcwq->worker_ida, id);
- spin_unlock_irq(&gcwq->lock);
- }
+ if (id >= 0)
+ ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
@@ -1411,376 +1737,255 @@ fail:
* start_worker - start a newly created worker
* @worker: worker to start
*
- * Make the gcwq aware of @worker and start it.
+ * Make the pool aware of @worker and start it.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void start_worker(struct worker *worker)
{
- worker->flags |= WORKER_STARTED;
- worker->gcwq->nr_workers++;
+ worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
}
/**
+ * create_and_start_worker - create and start a worker for a pool
+ * @pool: the target pool
+ *
+ * Grab the managership of @pool and create and start a new worker for it.
+ *
+ * Return: 0 on success. A negative error code otherwise.
+ */
+static int create_and_start_worker(struct worker_pool *pool)
+{
+ struct worker *worker;
+
+ worker = create_worker(pool);
+ if (worker) {
+ spin_lock_irq(&pool->lock);
+ start_worker(worker);
+ spin_unlock_irq(&pool->lock);
+ }
+
+ return worker ? 0 : -ENOMEM;
+}
+
+/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
- * Destroy @worker and adjust @gcwq stats accordingly.
+ * Destroy @worker and adjust @pool stats accordingly. The worker should
+ * be idle.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
- int id = worker->id;
+ struct worker_pool *pool = worker->pool;
+
+ lockdep_assert_held(&pool->lock);
/* sanity check frenzy */
- BUG_ON(worker->current_work);
- BUG_ON(!list_empty(&worker->scheduled));
+ if (WARN_ON(worker->current_work) ||
+ WARN_ON(!list_empty(&worker->scheduled)) ||
+ WARN_ON(!(worker->flags & WORKER_IDLE)))
+ return;
- if (worker->flags & WORKER_STARTED)
- gcwq->nr_workers--;
- if (worker->flags & WORKER_IDLE)
- gcwq->nr_idle--;
+ pool->nr_workers--;
+ pool->nr_idle--;
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
-
- spin_unlock_irq(&gcwq->lock);
-
- kthread_stop(worker->task);
- kfree(worker);
-
- spin_lock_irq(&gcwq->lock);
- ida_remove(&gcwq->worker_ida, id);
+ wake_up_process(worker->task);
}
-static void idle_worker_timeout(unsigned long __gcwq)
+static void idle_worker_timeout(unsigned long __pool)
{
- struct global_cwq *gcwq = (void *)__gcwq;
+ struct worker_pool *pool = (void *)__pool;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
- if (too_many_workers(gcwq)) {
+ while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
- worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
- if (time_before(jiffies, expires))
- mod_timer(&gcwq->idle_timer, expires);
- else {
- /* it's been idle for too long, wake up manager */
- gcwq->flags |= GCWQ_MANAGE_WORKERS;
- wake_up_worker(gcwq);
+ if (time_before(jiffies, expires)) {
+ mod_timer(&pool->idle_timer, expires);
+ break;
}
+
+ destroy_worker(worker);
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
-static bool send_mayday(struct work_struct *work)
+static void send_mayday(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
- struct workqueue_struct *wq = cwq->wq;
- unsigned int cpu;
+ struct pool_workqueue *pwq = get_work_pwq(work);
+ struct workqueue_struct *wq = pwq->wq;
- if (!(wq->flags & WQ_RESCUER))
- return false;
+ lockdep_assert_held(&wq_mayday_lock);
+
+ if (!wq->rescuer)
+ return;
/* mayday mayday mayday */
- cpu = cwq->gcwq->cpu;
- /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
- if (cpu == WORK_CPU_UNBOUND)
- cpu = 0;
- if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
+ if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
- return true;
+ }
}
-static void gcwq_mayday_timeout(unsigned long __gcwq)
+static void pool_mayday_timeout(unsigned long __pool)
{
- struct global_cwq *gcwq = (void *)__gcwq;
+ struct worker_pool *pool = (void *)__pool;
struct work_struct *work;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
+ spin_lock(&pool->lock);
- if (need_to_create_worker(gcwq)) {
+ if (need_to_create_worker(pool)) {
/*
* We've been trying to create a new worker but
* haven't been successful. We might be hitting an
* allocation deadlock. Send distress signals to
* rescuers.
*/
- list_for_each_entry(work, &gcwq->worklist, entry)
+ list_for_each_entry(work, &pool->worklist, entry)
send_mayday(work);
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock(&pool->lock);
+ spin_unlock_irq(&wq_mayday_lock);
- mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
/**
* maybe_create_worker - create a new worker if necessary
- * @gcwq: gcwq to create a new worker for
+ * @pool: pool to create a new worker for
*
- * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
+ * Create a new worker for @pool if necessary. @pool is guaranteed to
* have at least one idle worker on return from this function. If
* creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
- * sent to all rescuers with works scheduled on @gcwq to resolve
+ * sent to all rescuers with works scheduled on @pool to resolve
* possible allocation deadlock.
*
- * On return, need_to_create_worker() is guaranteed to be false and
- * may_start_working() true.
+ * On return, need_to_create_worker() is guaranteed to be %false and
+ * may_start_working() %true.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*
- * RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * Return:
+ * %false if no action was taken and pool->lock stayed locked, %true
* otherwise.
*/
-static bool maybe_create_worker(struct global_cwq *gcwq)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
+static bool maybe_create_worker(struct worker_pool *pool)
+__releases(&pool->lock)
+__acquires(&pool->lock)
{
- if (!need_to_create_worker(gcwq))
+ if (!need_to_create_worker(pool))
return false;
restart:
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
- mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
struct worker *worker;
- worker = create_worker(gcwq, true);
+ worker = create_worker(pool);
if (worker) {
- del_timer_sync(&gcwq->mayday_timer);
- spin_lock_irq(&gcwq->lock);
+ del_timer_sync(&pool->mayday_timer);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
- BUG_ON(need_to_create_worker(gcwq));
+ if (WARN_ON_ONCE(need_to_create_worker(pool)))
+ goto restart;
return true;
}
- if (!need_to_create_worker(gcwq))
+ if (!need_to_create_worker(pool))
break;
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(CREATE_COOLDOWN);
- if (!need_to_create_worker(gcwq))
+ if (!need_to_create_worker(pool))
break;
}
- del_timer_sync(&gcwq->mayday_timer);
- spin_lock_irq(&gcwq->lock);
- if (need_to_create_worker(gcwq))
+ del_timer_sync(&pool->mayday_timer);
+ spin_lock_irq(&pool->lock);
+ if (need_to_create_worker(pool))
goto restart;
return true;
}
/**
- * maybe_destroy_worker - destroy workers which have been idle for a while
- * @gcwq: gcwq to destroy workers for
- *
- * Destroy @gcwq workers which have been idle for longer than
- * IDLE_WORKER_TIMEOUT.
- *
- * LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. Called only from manager.
- *
- * RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
- * otherwise.
- */
-static bool maybe_destroy_workers(struct global_cwq *gcwq)
-{
- bool ret = false;
-
- while (too_many_workers(gcwq)) {
- struct worker *worker;
- unsigned long expires;
-
- worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
- expires = worker->last_active + IDLE_WORKER_TIMEOUT;
-
- if (time_before(jiffies, expires)) {
- mod_timer(&gcwq->idle_timer, expires);
- break;
- }
-
- destroy_worker(worker);
- ret = true;
- }
-
- return ret;
-}
-
-/**
* manage_workers - manage worker pool
* @worker: self
*
- * Assume the manager role and manage gcwq worker pool @worker belongs
+ * Assume the manager role and manage the worker pool @worker belongs
* to. At any given time, there can be only zero or one manager per
- * gcwq. The exclusion is handled automatically by this function.
+ * pool. The exclusion is handled automatically by this function.
*
* The caller can safely start processing works on false return. On
* true return, it's guaranteed that need_to_create_worker() is false
* and may_start_working() is true.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
- * RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true if
- * some action was taken.
+ * Return:
+ * %false if the pool don't need management and the caller can safely start
+ * processing works, %true indicates that the function released pool->lock
+ * and reacquired it to perform some management function and that the
+ * conditions that the caller verified while holding the lock before
+ * calling the function might no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
bool ret = false;
- if (gcwq->flags & GCWQ_MANAGING_WORKERS)
- return ret;
-
- gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
- gcwq->flags |= GCWQ_MANAGING_WORKERS;
-
/*
- * Destroy and then create so that may_start_working() is true
- * on return.
+ * Anyone who successfully grabs manager_arb wins the arbitration
+ * and becomes the manager. mutex_trylock() on pool->manager_arb
+ * failure while holding pool->lock reliably indicates that someone
+ * else is managing the pool and the worker which failed trylock
+ * can proceed to executing work items. This means that anyone
+ * grabbing manager_arb is responsible for actually performing
+ * manager duties. If manager_arb is grabbed and released without
+ * actual management, the pool may stall indefinitely.
*/
- ret |= maybe_destroy_workers(gcwq);
- ret |= maybe_create_worker(gcwq);
-
- gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+ if (!mutex_trylock(&pool->manager_arb))
+ return ret;
- /*
- * The trustee might be waiting to take over the manager
- * position, tell it we're done.
- */
- if (unlikely(gcwq->trustee))
- wake_up_all(&gcwq->trustee_wait);
+ ret |= maybe_create_worker(pool);
+ mutex_unlock(&pool->manager_arb);
return ret;
}
/**
- * move_linked_works - move linked works to a list
- * @work: start of series of works to be scheduled
- * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
- *
- * Schedule linked works starting from @work to @head. Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work. This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void move_linked_works(struct work_struct *work, struct list_head *head,
- struct work_struct **nextp)
-{
- struct work_struct *n;
-
- /*
- * Linked worklist will always end before the end of the list,
- * use NULL for list head.
- */
- list_for_each_entry_safe_from(work, n, NULL, entry) {
- list_move_tail(&work->entry, head);
- if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
- break;
- }
-
- /*
- * If we're already inside safe list traversal and have moved
- * multiple works to the scheduled queue, the next position
- * needs to be updated.
- */
- if (nextp)
- *nextp = n;
-}
-
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
-{
- struct work_struct *work = list_first_entry(&cwq->delayed_works,
- struct work_struct, entry);
- struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
-
- trace_workqueue_activate_work(work);
- move_linked_works(work, pos, NULL);
- __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
- cwq->nr_active++;
-}
-
-/**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
- * @cwq: cwq of interest
- * @color: color of work which left the queue
- * @delayed: for a delayed work
- *
- * A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
- bool delayed)
-{
- /* ignore uncolored works */
- if (color == WORK_NO_COLOR)
- return;
-
- cwq->nr_in_flight[color]--;
-
- if (!delayed) {
- cwq->nr_active--;
- if (!list_empty(&cwq->delayed_works)) {
- /* one down, submit a delayed one */
- if (cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
- }
- }
-
- /* is flush in progress and are we at the flushing tip? */
- if (likely(cwq->flush_color != color))
- return;
-
- /* are there still in-flight works? */
- if (cwq->nr_in_flight[color])
- return;
-
- /* this cwq is done, clear flush_color */
- cwq->flush_color = -1;
-
- /*
- * If this was the last cwq, wake up the first flusher. It
- * will handle the rest.
- */
- if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
- complete(&cwq->wq->first_flusher->done);
-}
-
-/**
* process_one_work - process single work
* @worker: self
* @work: work to process
@@ -1792,17 +1997,15 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
* call this function to process a work.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
- struct global_cwq *gcwq = cwq->gcwq;
- struct hlist_head *bwh = busy_worker_head(gcwq, work);
- bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
- work_func_t f = work->func;
+__releases(&pool->lock)
+__acquires(&pool->lock)
+{
+ struct pool_workqueue *pwq = get_work_pwq(work);
+ struct worker_pool *pool = worker->pool;
+ bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
int work_color;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
@@ -1813,89 +2016,108 @@ __acquires(&gcwq->lock)
* lock freed" warnings as well as problems when looking into
* work->lockdep_map, make a copy and use that here.
*/
- struct lockdep_map lockdep_map = work->lockdep_map;
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
/*
+ * Ensure we're on the correct CPU. DISASSOCIATED test is
+ * necessary to avoid spurious warnings from rescuers servicing the
+ * unbound or a disassociated pool.
+ */
+ WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
+ !(pool->flags & POOL_DISASSOCIATED) &&
+ raw_smp_processor_id() != pool->cpu);
+
+ /*
* A single work shouldn't be executed concurrently by
* multiple workers on a single cpu. Check whether anyone is
* already processing the work. If so, defer the work to the
* currently executing one.
*/
- collision = __find_worker_executing_work(gcwq, bwh, work);
+ collision = find_worker_executing_work(pool, work);
if (unlikely(collision)) {
move_linked_works(work, &collision->scheduled, NULL);
return;
}
- /* claim and process */
+ /* claim and dequeue */
debug_work_deactivate(work);
- hlist_add_head(&worker->hentry, bwh);
+ hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
- worker->current_cwq = cwq;
+ worker->current_func = work->func;
+ worker->current_pwq = pwq;
work_color = get_work_color(work);
- /* record the current cpu number in the work data and dequeue */
- set_work_cpu(work, gcwq->cpu);
list_del_init(&work->entry);
/*
- * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
- * wake up another worker; otherwise, clear HIGHPRI_PENDING.
- */
- if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
- struct work_struct *nwork = list_first_entry(&gcwq->worklist,
- struct work_struct, entry);
-
- if (!list_empty(&gcwq->worklist) &&
- get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
- wake_up_worker(gcwq);
- else
- gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
- }
-
- /*
* CPU intensive works don't participate in concurrency
* management. They're the scheduler's responsibility.
*/
if (unlikely(cpu_intensive))
worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
- spin_unlock_irq(&gcwq->lock);
+ /*
+ * Unbound pool isn't concurrency managed and work items should be
+ * executed ASAP. Wake up another worker if necessary.
+ */
+ if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+ wake_up_worker(pool);
+
+ /*
+ * Record the last pool and clear PENDING which should be the last
+ * update to @work. Also, do this inside @pool->lock so that
+ * PENDING and queued state changes happen together while IRQ is
+ * disabled.
+ */
+ set_work_pool_and_clear_pending(work, pool->id);
+
+ spin_unlock_irq(&pool->lock);
- work_clear_pending(work);
- lock_map_acquire_read(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
- f(work);
+ worker->current_func(work);
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
trace_workqueue_execute_end(work);
lock_map_release(&lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
- printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(), task_pid_nr(current));
- printk(KERN_ERR " last function: ");
- print_symbol("%s\n", (unsigned long)f);
+ pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
+ " last function: %pf\n",
+ current->comm, preempt_count(), task_pid_nr(current),
+ worker->current_func);
debug_show_held_locks(current);
dump_stack();
}
- spin_lock_irq(&gcwq->lock);
+ /*
+ * The following prevents a kworker from hogging CPU on !PREEMPT
+ * kernels, where a requeueing work item waiting for something to
+ * happen could deadlock with stop_machine as such work item could
+ * indefinitely requeue itself while all other CPUs are trapped in
+ * stop_machine.
+ */
+ cond_resched();
+
+ spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* we're done with it, release */
- hlist_del_init(&worker->hentry);
+ hash_del(&worker->hentry);
worker->current_work = NULL;
- worker->current_cwq = NULL;
- cwq_dec_nr_in_flight(cwq, work_color, false);
+ worker->current_func = NULL;
+ worker->current_pwq = NULL;
+ worker->desc_valid = false;
+ pwq_dec_nr_in_flight(pwq, work_color);
}
/**
@@ -1907,7 +2129,7 @@ __acquires(&gcwq->lock)
* fetches a work from the top and executes it.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
@@ -1923,37 +2145,45 @@ static void process_scheduled_works(struct worker *worker)
* worker_thread - the worker thread function
* @__worker: self
*
- * The gcwq worker thread function. There's a single dynamic pool of
- * these per each cpu. These workers process all works regardless of
- * their specific target workqueue. The only exception is works which
- * belong to workqueues with a rescuer which will be explained in
- * rescuer_thread().
+ * The worker thread function. All workers belong to a worker_pool -
+ * either a per-cpu one or dynamic unbound one. These workers process all
+ * work items regardless of their specific target workqueue. The only
+ * exception is work items which belong to workqueues with a rescuer which
+ * will be explained in rescuer_thread().
+ *
+ * Return: 0
*/
static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
worker->task->flags |= PF_WQ_WORKER;
woke_up:
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
- /* DIE can be set only while we're idle, checking here is enough */
- if (worker->flags & WORKER_DIE) {
- spin_unlock_irq(&gcwq->lock);
+ /* am I supposed to die? */
+ if (unlikely(worker->flags & WORKER_DIE)) {
+ spin_unlock_irq(&pool->lock);
+ WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
+
+ set_task_comm(worker->task, "kworker/dying");
+ ida_simple_remove(&pool->worker_ida, worker->id);
+ worker_detach_from_pool(worker, pool);
+ kfree(worker);
return 0;
}
worker_leave_idle(worker);
recheck:
/* no more worker necessary? */
- if (!need_more_worker(gcwq))
+ if (!need_more_worker(pool))
goto sleep;
/* do we need to manage? */
- if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+ if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
/*
@@ -1961,18 +2191,20 @@ recheck:
* preparing to process a work or actually processing it.
* Make sure nobody diddled with it while I was sleeping.
*/
- BUG_ON(!list_empty(&worker->scheduled));
+ WARN_ON_ONCE(!list_empty(&worker->scheduled));
/*
- * When control reaches this point, we're guaranteed to have
- * at least one idle worker or that someone else has already
- * assumed the manager role.
+ * Finish PREP stage. We're guaranteed to have at least one idle
+ * worker or that someone else has already assumed the manager
+ * role. This is where @worker starts participating in concurrency
+ * management if applicable and concurrency management is restored
+ * after being rebound. See rebind_workers() for details.
*/
- worker_clr_flags(worker, WORKER_PREP);
+ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
do {
struct work_struct *work =
- list_first_entry(&gcwq->worklist,
+ list_first_entry(&pool->worklist,
struct work_struct, entry);
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1984,100 +2216,136 @@ recheck:
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
- } while (keep_working(gcwq));
+ } while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP, false);
sleep:
- if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
- goto recheck;
-
/*
- * gcwq->lock is held and there's no work to process and no
- * need to manage, sleep. Workers are woken up only while
- * holding gcwq->lock or from local cpu, so setting the
- * current state before releasing gcwq->lock is enough to
- * prevent losing any event.
+ * pool->lock is held and there's no work to process and no need to
+ * manage, sleep. Workers are woken up only while holding
+ * pool->lock or from local cpu, so setting the current state
+ * before releasing pool->lock is enough to prevent losing any
+ * event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
/**
* rescuer_thread - the rescuer thread function
- * @__wq: the associated workqueue
+ * @__rescuer: self
*
* Workqueue rescuer thread function. There's one rescuer for each
- * workqueue which has WQ_RESCUER set.
+ * workqueue which has WQ_MEM_RECLAIM set.
*
- * Regular work processing on a gcwq may block trying to create a new
+ * Regular work processing on a pool may block trying to create a new
* worker which uses GFP_KERNEL allocation which has slight chance of
* developing into deadlock if some works currently on the same queue
* need to be processed to satisfy the GFP_KERNEL allocation. This is
* the problem rescuer solves.
*
- * When such condition is possible, the gcwq summons rescuers of all
- * workqueues which have works queued on the gcwq and let them process
+ * When such condition is possible, the pool summons rescuers of all
+ * workqueues which have works queued on the pool and let them process
* those works so that forward progress can be guaranteed.
*
* This should happen rarely.
+ *
+ * Return: 0
*/
-static int rescuer_thread(void *__wq)
+static int rescuer_thread(void *__rescuer)
{
- struct workqueue_struct *wq = __wq;
- struct worker *rescuer = wq->rescuer;
+ struct worker *rescuer = __rescuer;
+ struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
- bool is_unbound = wq->flags & WQ_UNBOUND;
- unsigned int cpu;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
+
+ /*
+ * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
+ * doesn't participate in concurrency management.
+ */
+ rescuer->task->flags |= PF_WQ_WORKER;
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- return 0;
-
/*
- * See whether any cpu is asking for help. Unbounded
- * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
*/
- for_each_mayday_cpu(cpu, wq->mayday_mask) {
- unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
- struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
- struct global_cwq *gcwq = cwq->gcwq;
+ should_stop = kthread_should_stop();
+
+ /* see whether any pwq is asking for help */
+ spin_lock_irq(&wq_mayday_lock);
+
+ while (!list_empty(&wq->maydays)) {
+ struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
+ struct pool_workqueue, mayday_node);
+ struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
- mayday_clear_cpu(cpu, wq->mayday_mask);
+ list_del_init(&pwq->mayday_node);
+
+ spin_unlock_irq(&wq_mayday_lock);
- /* migrate to the target cpu if possible */
- rescuer->gcwq = gcwq;
- worker_maybe_bind_and_lock(rescuer);
+ worker_attach_to_pool(rescuer, pool);
+
+ spin_lock_irq(&pool->lock);
+ rescuer->pool = pool;
/*
* Slurp in all works issued via this workqueue and
* process'em.
*/
- BUG_ON(!list_empty(&rescuer->scheduled));
- list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
- if (get_work_cwq(work) == cwq)
+ WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+ list_for_each_entry_safe(work, n, &pool->worklist, entry)
+ if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
+ spin_unlock_irq(&pool->lock);
+
+ worker_detach_from_pool(rescuer, pool);
+
+ spin_lock_irq(&pool->lock);
/*
- * Leave this gcwq. If keep_working() is %true, notify a
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're holding its lock.
+ */
+ put_pwq(pwq);
+
+ /*
+ * Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
- if (keep_working(gcwq))
- wake_up_worker(gcwq);
+ if (keep_working(pool))
+ wake_up_worker(pool);
+
+ rescuer->pool = NULL;
+ spin_unlock(&pool->lock);
+ spin_lock(&wq_mayday_lock);
+ }
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&wq_mayday_lock);
+
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
}
+ /* rescuers should never participate in concurrency management */
+ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
goto repeat;
}
@@ -2095,7 +2363,7 @@ static void wq_barrier_func(struct work_struct *work)
/**
* insert_wq_barrier - insert a barrier work
- * @cwq: cwq to insert barrier into
+ * @pwq: pwq to insert barrier into
* @barr: wq_barrier to insert
* @target: target work to attach @barr to
* @worker: worker currently executing @target, NULL if @target is not executing
@@ -2112,12 +2380,12 @@ static void wq_barrier_func(struct work_struct *work)
* after a work with LINKED flag set.
*
* Note that when @worker is non-NULL, @target may be modified
- * underneath us, so we can't reliably determine cwq from @target.
+ * underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
@@ -2125,7 +2393,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
unsigned int linked = 0;
/*
- * debugobject calls are safe here even with gcwq->lock locked
+ * debugobject calls are safe here even with pool->lock locked
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
@@ -2150,23 +2418,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
}
debug_work_activate(&barr->work);
- insert_work(cwq, &barr->work, head,
+ insert_work(pwq, &barr->work, head,
work_color_to_flags(WORK_NO_COLOR) | linked);
}
/**
- * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
* @wq: workqueue being flushed
* @flush_color: new flush color, < 0 for no-op
* @work_color: new work color, < 0 for no-op
*
- * Prepare cwqs for workqueue flushing.
+ * Prepare pwqs for workqueue flushing.
*
- * If @flush_color is non-negative, flush_color on all cwqs should be
- * -1. If no cwq has in-flight commands at the specified color, all
- * cwq->flush_color's stay at -1 and %false is returned. If any cwq
- * has in flight commands, its cwq->flush_color is set to
- * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * If @flush_color is non-negative, flush_color on all pwqs should be
+ * -1. If no pwq has in-flight commands at the specified color, all
+ * pwq->flush_color's stay at -1 and %false is returned. If any pwq
+ * has in flight commands, its pwq->flush_color is set to
+ * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
* wakeup logic is armed and %true is returned.
*
* The caller should have initialized @wq->first_flusher prior to
@@ -2174,53 +2442,52 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
* @flush_color is negative, no flush color update is done and %false
* is returned.
*
- * If @work_color is non-negative, all cwqs should have the same
+ * If @work_color is non-negative, all pwqs should have the same
* work_color which is previous to @work_color and all will be
* advanced to @work_color.
*
* CONTEXT:
- * mutex_lock(wq->flush_mutex).
+ * mutex_lock(wq->mutex).
*
- * RETURNS:
+ * Return:
* %true if @flush_color >= 0 and there's something to flush. %false
* otherwise.
*/
-static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
int flush_color, int work_color)
{
bool wait = false;
- unsigned int cpu;
+ struct pool_workqueue *pwq;
if (flush_color >= 0) {
- BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
- atomic_set(&wq->nr_cwqs_to_flush, 1);
+ WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
+ atomic_set(&wq->nr_pwqs_to_flush, 1);
}
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = cwq->gcwq;
+ for_each_pwq(pwq, wq) {
+ struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
- BUG_ON(cwq->flush_color != -1);
+ WARN_ON_ONCE(pwq->flush_color != -1);
- if (cwq->nr_in_flight[flush_color]) {
- cwq->flush_color = flush_color;
- atomic_inc(&wq->nr_cwqs_to_flush);
+ if (pwq->nr_in_flight[flush_color]) {
+ pwq->flush_color = flush_color;
+ atomic_inc(&wq->nr_pwqs_to_flush);
wait = true;
}
}
if (work_color >= 0) {
- BUG_ON(work_color != work_next_color(cwq->work_color));
- cwq->work_color = work_color;
+ WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
+ pwq->work_color = work_color;
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
- if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+ if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
complete(&wq->first_flusher->done);
return wait;
@@ -2230,11 +2497,8 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
* flush_workqueue - ensure that any scheduled work has run to completion.
* @wq: workqueue to flush
*
- * Forces execution of the workqueue and blocks until its completion.
- * This is typically used in driver shutdown handlers.
- *
- * We sleep until all works which were queued on entry have been handled,
- * but we are not livelocked by new incoming ones.
+ * This function sleeps until all work items which were queued on entry
+ * have finished execution, but it is not livelocked by new incoming ones.
*/
void flush_workqueue(struct workqueue_struct *wq)
{
@@ -2248,7 +2512,7 @@ void flush_workqueue(struct workqueue_struct *wq)
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
- mutex_lock(&wq->flush_mutex);
+ mutex_lock(&wq->mutex);
/*
* Start-to-wait phase
@@ -2261,17 +2525,17 @@ void flush_workqueue(struct workqueue_struct *wq)
* becomes our flush_color and work_color is advanced
* by one.
*/
- BUG_ON(!list_empty(&wq->flusher_overflow));
+ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
this_flusher.flush_color = wq->work_color;
wq->work_color = next_color;
if (!wq->first_flusher) {
/* no flush in progress, become the first flusher */
- BUG_ON(wq->flush_color != this_flusher.flush_color);
+ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
wq->first_flusher = &this_flusher;
- if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+ if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
wq->work_color)) {
/* nothing to flush, done */
wq->flush_color = next_color;
@@ -2280,9 +2544,9 @@ void flush_workqueue(struct workqueue_struct *wq)
}
} else {
/* wait in queue */
- BUG_ON(wq->flush_color == this_flusher.flush_color);
+ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
list_add_tail(&this_flusher.list, &wq->flusher_queue);
- flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
} else {
/*
@@ -2293,7 +2557,7 @@ void flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
- mutex_unlock(&wq->flush_mutex);
+ mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done);
@@ -2306,7 +2570,7 @@ void flush_workqueue(struct workqueue_struct *wq)
if (wq->first_flusher != &this_flusher)
return;
- mutex_lock(&wq->flush_mutex);
+ mutex_lock(&wq->mutex);
/* we might have raced, check again with mutex held */
if (wq->first_flusher != &this_flusher)
@@ -2314,8 +2578,8 @@ void flush_workqueue(struct workqueue_struct *wq)
wq->first_flusher = NULL;
- BUG_ON(!list_empty(&this_flusher.list));
- BUG_ON(wq->flush_color != this_flusher.flush_color);
+ WARN_ON_ONCE(!list_empty(&this_flusher.list));
+ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
while (true) {
struct wq_flusher *next, *tmp;
@@ -2328,8 +2592,8 @@ void flush_workqueue(struct workqueue_struct *wq)
complete(&next->done);
}
- BUG_ON(!list_empty(&wq->flusher_overflow) &&
- wq->flush_color != work_next_color(wq->work_color));
+ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
+ wq->flush_color != work_next_color(wq->work_color));
/* this flush_color is finished, advance by one */
wq->flush_color = work_next_color(wq->flush_color);
@@ -2349,25 +2613,25 @@ void flush_workqueue(struct workqueue_struct *wq)
list_splice_tail_init(&wq->flusher_overflow,
&wq->flusher_queue);
- flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
if (list_empty(&wq->flusher_queue)) {
- BUG_ON(wq->flush_color != wq->work_color);
+ WARN_ON_ONCE(wq->flush_color != wq->work_color);
break;
}
/*
* Need to flush more colors. Make the next flusher
- * the new first flusher and arm cwqs.
+ * the new first flusher and arm pwqs.
*/
- BUG_ON(wq->flush_color == wq->work_color);
- BUG_ON(wq->flush_color != next->flush_color);
+ WARN_ON_ONCE(wq->flush_color == wq->work_color);
+ WARN_ON_ONCE(wq->flush_color != next->flush_color);
list_del_init(&next->list);
wq->first_flusher = next;
- if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+ if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
break;
/*
@@ -2378,7 +2642,7 @@ void flush_workqueue(struct workqueue_struct *wq)
}
out_unlock:
- mutex_unlock(&wq->flush_mutex);
+ mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -2396,78 +2660,77 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
void drain_workqueue(struct workqueue_struct *wq)
{
unsigned int flush_cnt = 0;
- unsigned int cpu;
+ struct pool_workqueue *pwq;
/*
* __queue_work() needs to test whether there are drainers, is much
* hotter than drain_workqueue() and already looks at @wq->flags.
- * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
+ * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
*/
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq->mutex);
if (!wq->nr_drainers++)
- wq->flags |= WQ_DRAINING;
- spin_unlock(&workqueue_lock);
+ wq->flags |= __WQ_DRAINING;
+ mutex_unlock(&wq->mutex);
reflush:
flush_workqueue(wq);
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ mutex_lock(&wq->mutex);
+
+ for_each_pwq(pwq, wq) {
bool drained;
- spin_lock_irq(&cwq->gcwq->lock);
- drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
- spin_unlock_irq(&cwq->gcwq->lock);
+ spin_lock_irq(&pwq->pool->lock);
+ drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+ spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
if (++flush_cnt == 10 ||
(flush_cnt % 100 == 0 && flush_cnt <= 1000))
- pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
- wq->name, flush_cnt);
+ pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
+ wq->name, flush_cnt);
+
+ mutex_unlock(&wq->mutex);
goto reflush;
}
- spin_lock(&workqueue_lock);
if (!--wq->nr_drainers)
- wq->flags &= ~WQ_DRAINING;
- spin_unlock(&workqueue_lock);
+ wq->flags &= ~__WQ_DRAINING;
+ mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);
-static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
- bool wait_executing)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
struct worker *worker = NULL;
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
might_sleep();
- gcwq = get_work_gcwq(work);
- if (!gcwq)
+
+ local_irq_disable();
+ pool = get_work_pool(work);
+ if (!pool) {
+ local_irq_enable();
return false;
+ }
- spin_lock_irq(&gcwq->lock);
- if (!list_empty(&work->entry)) {
- /*
- * See the comment near try_to_grab_pending()->smp_rmb().
- * If it was re-queued to a different gcwq under us, we
- * are not going to wait.
- */
- smp_rmb();
- cwq = get_work_cwq(work);
- if (unlikely(!cwq || gcwq != cwq->gcwq))
+ spin_lock(&pool->lock);
+ /* see the comment in try_to_grab_pending() with the same code */
+ pwq = get_work_pwq(work);
+ if (pwq) {
+ if (unlikely(pwq->pool != pool))
goto already_gone;
- } else if (wait_executing) {
- worker = find_worker_executing_work(gcwq, work);
+ } else {
+ worker = find_worker_executing_work(pool, work);
if (!worker)
goto already_gone;
- cwq = worker->current_cwq;
- } else
- goto already_gone;
+ pwq = worker->current_pwq;
+ }
- insert_wq_barrier(cwq, barr, work, worker);
- spin_unlock_irq(&gcwq->lock);
+ insert_wq_barrier(pwq, barr, work, worker);
+ spin_unlock_irq(&pool->lock);
/*
* If @max_active is 1 or rescuer is in use, flushing another work
@@ -2475,15 +2738,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
* flusher is not running on the same workqueue by verifying write
* access.
*/
- if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
- lock_map_acquire(&cwq->wq->lockdep_map);
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+ lock_map_acquire(&pwq->wq->lockdep_map);
else
- lock_map_acquire_read(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
return true;
already_gone:
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
return false;
}
@@ -2491,17 +2754,10 @@ already_gone:
* flush_work - wait for a work to finish executing the last queueing instance
* @work: the work to flush
*
- * Wait until @work has finished execution. This function considers
- * only the last queueing instance of @work. If @work has been
- * enqueued across different CPUs on a non-reentrant workqueue or on
- * multiple workqueues, @work might still be executing on return on
- * some of the CPUs from earlier queueing.
- *
- * If @work was queued only on a non-reentrant, ordered or unbound
- * workqueue, @work is guaranteed to be idle on return if it hasn't
- * been requeued since flush started.
+ * Wait until @work has finished execution. @work is guaranteed to be idle
+ * on return if it hasn't been requeued since flush started.
*
- * RETURNS:
+ * Return:
* %true if flush_work() waited for the work to finish execution,
* %false if it was already idle.
*/
@@ -2509,140 +2765,39 @@ bool flush_work(struct work_struct *work)
{
struct wq_barrier barr;
- if (start_flush_work(work, &barr, true)) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- return true;
- } else
- return false;
-}
-EXPORT_SYMBOL_GPL(flush_work);
-
-static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
-{
- struct wq_barrier barr;
- struct worker *worker;
-
- spin_lock_irq(&gcwq->lock);
-
- worker = find_worker_executing_work(gcwq, work);
- if (unlikely(worker))
- insert_wq_barrier(worker->current_cwq, &barr, work, worker);
-
- spin_unlock_irq(&gcwq->lock);
-
- if (unlikely(worker)) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- return true;
- } else
- return false;
-}
-
-static bool wait_on_work(struct work_struct *work)
-{
- bool ret = false;
- int cpu;
-
- might_sleep();
-
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
- for_each_gcwq_cpu(cpu)
- ret |= wait_on_cpu_work(get_gcwq(cpu), work);
- return ret;
-}
-
-/**
- * flush_work_sync - wait until a work has finished execution
- * @work: the work to flush
- *
- * Wait until @work has finished execution. On return, it's
- * guaranteed that all queueing instances of @work which happened
- * before this function is called are finished. In other words, if
- * @work hasn't been requeued since this function was called, @work is
- * guaranteed to be idle on return.
- *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_work_sync(struct work_struct *work)
-{
- struct wq_barrier barr;
- bool pending, waited;
-
- /* we'll wait for executions separately, queue barr only if pending */
- pending = start_flush_work(work, &barr, false);
-
- /* wait for executions to finish */
- waited = wait_on_work(work);
-
- /* wait for the pending one */
- if (pending) {
+ if (start_flush_work(work, &barr)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
+ return true;
+ } else {
+ return false;
}
-
- return pending || waited;
-}
-EXPORT_SYMBOL_GPL(flush_work_sync);
-
-/*
- * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
- * so this work can't be re-armed in any way.
- */
-static int try_to_grab_pending(struct work_struct *work)
-{
- struct global_cwq *gcwq;
- int ret = -1;
-
- if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
- return 0;
-
- /*
- * The queueing is in progress, or it is already queued. Try to
- * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
- */
- gcwq = get_work_gcwq(work);
- if (!gcwq)
- return ret;
-
- spin_lock_irq(&gcwq->lock);
- if (!list_empty(&work->entry)) {
- /*
- * This work is queued, but perhaps we locked the wrong gcwq.
- * In that case we must see the new value after rmb(), see
- * insert_work()->wmb().
- */
- smp_rmb();
- if (gcwq == get_work_gcwq(work)) {
- debug_work_deactivate(work);
- list_del_init(&work->entry);
- cwq_dec_nr_in_flight(get_work_cwq(work),
- get_work_color(work),
- *work_data_bits(work) & WORK_STRUCT_DELAYED);
- ret = 1;
- }
- }
- spin_unlock_irq(&gcwq->lock);
-
- return ret;
}
+EXPORT_SYMBOL_GPL(flush_work);
-static bool __cancel_work_timer(struct work_struct *work,
- struct timer_list* timer)
+static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
+ unsigned long flags;
int ret;
do {
- ret = (timer && likely(del_timer(timer)));
- if (!ret)
- ret = try_to_grab_pending(work);
- wait_on_work(work);
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ /*
+ * If someone else is canceling, wait for the same event it
+ * would be waiting for before retrying.
+ */
+ if (unlikely(ret == -ENOENT))
+ flush_work(work);
} while (unlikely(ret < 0));
+ /* tell other tasks trying to grab @work to back off */
+ mark_work_canceling(work);
+ local_irq_restore(flags);
+
+ flush_work(work);
clear_work_data(work);
return ret;
}
@@ -2662,12 +2817,12 @@ static bool __cancel_work_timer(struct work_struct *work,
* The caller must ensure that the workqueue on which @work was last
* queued can't be destroyed before this function returns.
*
- * RETURNS:
+ * Return:
* %true if @work was pending, %false otherwise.
*/
bool cancel_work_sync(struct work_struct *work)
{
- return __cancel_work_timer(work, NULL);
+ return __cancel_work_timer(work, false);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
@@ -2679,39 +2834,54 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
* immediate execution. Like flush_work(), this function only
* considers the last queueing instance of @dwork.
*
- * RETURNS:
+ * Return:
* %true if flush_work() waited for the work to finish execution,
* %false if it was already idle.
*/
bool flush_delayed_work(struct delayed_work *dwork)
{
+ local_irq_disable();
if (del_timer_sync(&dwork->timer))
- __queue_work(raw_smp_processor_id(),
- get_work_cwq(&dwork->work)->wq, &dwork->work);
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
+ local_irq_enable();
return flush_work(&dwork->work);
}
EXPORT_SYMBOL(flush_delayed_work);
/**
- * flush_delayed_work_sync - wait for a dwork to finish
- * @dwork: the delayed work to flush
+ * cancel_delayed_work - cancel a delayed work
+ * @dwork: delayed_work to cancel
*
- * Delayed timer is cancelled and the pending work is queued for
- * execution immediately. Other than timer handling, its behavior
- * is identical to flush_work_sync().
+ * Kill off a pending delayed_work.
*
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
+ * Return: %true if @dwork was pending and canceled; %false if it wasn't
+ * pending.
+ *
+ * Note:
+ * The work callback function may still be running on return, unless
+ * it returns %true and the work doesn't re-arm itself. Explicitly flush or
+ * use cancel_delayed_work_sync() to wait on it.
+ *
+ * This function is safe to call from any context including IRQ handler.
*/
-bool flush_delayed_work_sync(struct delayed_work *dwork)
+bool cancel_delayed_work(struct delayed_work *dwork)
{
- if (del_timer_sync(&dwork->timer))
- __queue_work(raw_smp_processor_id(),
- get_work_cwq(&dwork->work)->wq, &dwork->work);
- return flush_work_sync(&dwork->work);
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(&dwork->work, true, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_pool_and_clear_pending(&dwork->work,
+ get_work_pool_id(&dwork->work));
+ local_irq_restore(flags);
+ return ret;
}
-EXPORT_SYMBOL(flush_delayed_work_sync);
+EXPORT_SYMBOL(cancel_delayed_work);
/**
* cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
@@ -2719,77 +2889,16 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
*
* This is cancel_work_sync() for delayed works.
*
- * RETURNS:
+ * Return:
* %true if @dwork was pending, %false otherwise.
*/
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
- return __cancel_work_timer(&dwork->work, &dwork->timer);
+ return __cancel_work_timer(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
/**
- * schedule_work - put work task in global workqueue
- * @work: job to be done
- *
- * Returns zero if @work was already on the kernel-global workqueue and
- * non-zero otherwise.
- *
- * This puts a job in the kernel-global workqueue if it was not already
- * queued and leaves it in the same position on the kernel-global
- * workqueue otherwise.
- */
-int schedule_work(struct work_struct *work)
-{
- return queue_work(system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work);
-
-/*
- * schedule_work_on - put work task on a specific cpu
- * @cpu: cpu to put the work task on
- * @work: job to be done
- *
- * This puts a job on a specific cpu
- */
-int schedule_work_on(int cpu, struct work_struct *work)
-{
- return queue_work_on(cpu, system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work_on);
-
-/**
- * schedule_delayed_work - put work task in global workqueue after delay
- * @dwork: job to be done
- * @delay: number of jiffies to wait or 0 for immediate execution
- *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue.
- */
-int schedule_delayed_work(struct delayed_work *dwork,
- unsigned long delay)
-{
- return queue_delayed_work(system_wq, dwork, delay);
-}
-EXPORT_SYMBOL(schedule_delayed_work);
-
-/**
- * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
- * @cpu: cpu to use
- * @dwork: job to be done
- * @delay: number of jiffies to wait
- *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue on the specified CPU.
- */
-int schedule_delayed_work_on(int cpu,
- struct delayed_work *dwork, unsigned long delay)
-{
- return queue_delayed_work_on(cpu, system_wq, dwork, delay);
-}
-EXPORT_SYMBOL(schedule_delayed_work_on);
-
-/**
* schedule_on_each_cpu - execute a function synchronously on each online CPU
* @func: the function to call
*
@@ -2797,7 +2906,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
* system workqueue and blocks until all CPUs have completed.
* schedule_on_each_cpu() is very slow.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
int schedule_on_each_cpu(work_func_t func)
@@ -2865,7 +2974,7 @@ EXPORT_SYMBOL(flush_scheduled_work);
* Executes the function immediately if process context is available,
* otherwise schedules the function for delayed execution.
*
- * Returns: 0 - function was executed
+ * Return: 0 - function was executed
* 1 - function was scheduled for execution
*/
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
@@ -2882,62 +2991,1063 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
-int keventd_up(void)
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * id RO int : the associated pool ID
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+ struct workqueue_struct *wq;
+ struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int val;
+
+ if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+ return -EINVAL;
+
+ workqueue_set_max_active(wq, val);
+ return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return system_wq != NULL;
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ const char *delim = "";
+ int node, written = 0;
+
+ rcu_read_lock_sched();
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ rcu_read_unlock_sched();
+
+ return written;
}
-static int alloc_cwqs(struct workqueue_struct *wq)
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ mutex_lock(&wq->mutex);
+ copy_workqueue_attrs(attrs, wq->unbound_attrs);
+ mutex_unlock(&wq->mutex);
+ return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+ ret = apply_workqueue_attrs(wq, attrs);
+ else
+ ret = -EINVAL;
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
+ mutex_unlock(&wq->mutex);
+
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, attrs->cpumask);
+ if (!ret)
+ ret = apply_workqueue_attrs(wq, attrs);
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_groups = wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+ return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev;
+ int ret;
+
/*
- * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
- * Make sure that the alignment isn't lower than that of
- * unsigned long long.
+ * Adjusting max_active or creating new pwqs by applyting
+ * attributes breaks ordering guarantee. Disallow exposing ordered
+ * workqueues.
*/
- const size_t size = sizeof(struct cpu_workqueue_struct);
- const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
- __alignof__(unsigned long long));
-#ifdef CONFIG_SMP
- bool percpu = !(wq->flags & WQ_UNBOUND);
-#else
- bool percpu = false;
-#endif
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return -EINVAL;
- if (percpu)
- wq->cpu_wq.pcpu = __alloc_percpu(size, align);
- else {
- void *ptr;
+ wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+ if (!wq_dev)
+ return -ENOMEM;
+
+ wq_dev->wq = wq;
+ wq_dev->dev.bus = &wq_subsys;
+ wq_dev->dev.init_name = wq->name;
+ wq_dev->dev.release = wq_device_release;
+
+ /*
+ * unbound_attrs are created separately. Suppress uevent until
+ * everything is ready.
+ */
+ dev_set_uevent_suppress(&wq_dev->dev, true);
+
+ ret = device_register(&wq_dev->dev);
+ if (ret) {
+ kfree(wq_dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+
+ if (wq->flags & WQ_UNBOUND) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+ ret = device_create_file(&wq_dev->dev, attr);
+ if (ret) {
+ device_unregister(&wq_dev->dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+ }
+ }
+
+ dev_set_uevent_suppress(&wq_dev->dev, false);
+ kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+ return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev = wq->wq_dev;
+
+ if (!wq->wq_dev)
+ return;
+
+ wq->wq_dev = NULL;
+ device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
+/**
+ * free_workqueue_attrs - free a workqueue_attrs
+ * @attrs: workqueue_attrs to free
+ *
+ * Undo alloc_workqueue_attrs().
+ */
+void free_workqueue_attrs(struct workqueue_attrs *attrs)
+{
+ if (attrs) {
+ free_cpumask_var(attrs->cpumask);
+ kfree(attrs);
+ }
+}
+
+/**
+ * alloc_workqueue_attrs - allocate a workqueue_attrs
+ * @gfp_mask: allocation mask to use
+ *
+ * Allocate a new workqueue_attrs, initialize with default settings and
+ * return it.
+ *
+ * Return: The allocated new workqueue_attr on success. %NULL on failure.
+ */
+struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = kzalloc(sizeof(*attrs), gfp_mask);
+ if (!attrs)
+ goto fail;
+ if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+ goto fail;
+
+ cpumask_copy(attrs->cpumask, cpu_possible_mask);
+ return attrs;
+fail:
+ free_workqueue_attrs(attrs);
+ return NULL;
+}
+
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+ const struct workqueue_attrs *from)
+{
+ to->nice = from->nice;
+ cpumask_copy(to->cpumask, from->cpumask);
+ /*
+ * Unlike hash and equality test, this function doesn't ignore
+ * ->no_numa as it is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears ->no_numa after copying.
+ */
+ to->no_numa = from->no_numa;
+}
+
+/* hash value of the content of @attr */
+static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
+{
+ u32 hash = 0;
+
+ hash = jhash_1word(attrs->nice, hash);
+ hash = jhash(cpumask_bits(attrs->cpumask),
+ BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ return hash;
+}
+
+/* content equality test */
+static bool wqattrs_equal(const struct workqueue_attrs *a,
+ const struct workqueue_attrs *b)
+{
+ if (a->nice != b->nice)
+ return false;
+ if (!cpumask_equal(a->cpumask, b->cpumask))
+ return false;
+ return true;
+}
+
+/**
+ * init_worker_pool - initialize a newly zalloc'd worker_pool
+ * @pool: worker_pool to initialize
+ *
+ * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
+ *
+ * Return: 0 on success, -errno on failure. Even on failure, all fields
+ * inside @pool proper are initialized and put_unbound_pool() can be called
+ * on @pool safely to release it.
+ */
+static int init_worker_pool(struct worker_pool *pool)
+{
+ spin_lock_init(&pool->lock);
+ pool->id = -1;
+ pool->cpu = -1;
+ pool->node = NUMA_NO_NODE;
+ pool->flags |= POOL_DISASSOCIATED;
+ INIT_LIST_HEAD(&pool->worklist);
+ INIT_LIST_HEAD(&pool->idle_list);
+ hash_init(pool->busy_hash);
+
+ init_timer_deferrable(&pool->idle_timer);
+ pool->idle_timer.function = idle_worker_timeout;
+ pool->idle_timer.data = (unsigned long)pool;
+
+ setup_timer(&pool->mayday_timer, pool_mayday_timeout,
+ (unsigned long)pool);
+
+ mutex_init(&pool->manager_arb);
+ mutex_init(&pool->attach_mutex);
+ INIT_LIST_HEAD(&pool->workers);
+
+ ida_init(&pool->worker_ida);
+ INIT_HLIST_NODE(&pool->hash_node);
+ pool->refcnt = 1;
+
+ /* shouldn't fail above this point */
+ pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!pool->attrs)
+ return -ENOMEM;
+ return 0;
+}
+
+static void rcu_free_pool(struct rcu_head *rcu)
+{
+ struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
+
+ ida_destroy(&pool->worker_ida);
+ free_workqueue_attrs(pool->attrs);
+ kfree(pool);
+}
+
+/**
+ * put_unbound_pool - put a worker_pool
+ * @pool: worker_pool to put
+ *
+ * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
+ * safe manner. get_unbound_pool() calls this function on its failure path
+ * and this function should be able to release pools which went through,
+ * successfully or not, init_worker_pool().
+ *
+ * Should be called with wq_pool_mutex held.
+ */
+static void put_unbound_pool(struct worker_pool *pool)
+{
+ DECLARE_COMPLETION_ONSTACK(detach_completion);
+ struct worker *worker;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (--pool->refcnt)
+ return;
+
+ /* sanity checks */
+ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+ WARN_ON(!list_empty(&pool->worklist)))
+ return;
+
+ /* release id and unhash */
+ if (pool->id >= 0)
+ idr_remove(&worker_pool_idr, pool->id);
+ hash_del(&pool->hash_node);
+
+ /*
+ * Become the manager and destroy all workers. Grabbing
+ * manager_arb prevents @pool's workers from blocking on
+ * attach_mutex.
+ */
+ mutex_lock(&pool->manager_arb);
+
+ spin_lock_irq(&pool->lock);
+ while ((worker = first_idle_worker(pool)))
+ destroy_worker(worker);
+ WARN_ON(pool->nr_workers || pool->nr_idle);
+ spin_unlock_irq(&pool->lock);
+
+ mutex_lock(&pool->attach_mutex);
+ if (!list_empty(&pool->workers))
+ pool->detach_completion = &detach_completion;
+ mutex_unlock(&pool->attach_mutex);
+
+ if (pool->detach_completion)
+ wait_for_completion(pool->detach_completion);
+
+ mutex_unlock(&pool->manager_arb);
+
+ /* shut down the timers */
+ del_timer_sync(&pool->idle_timer);
+ del_timer_sync(&pool->mayday_timer);
+
+ /* sched-RCU protected to allow dereferences from get_work_pool() */
+ call_rcu_sched(&pool->rcu, rcu_free_pool);
+}
+
+/**
+ * get_unbound_pool - get a worker_pool with the specified attributes
+ * @attrs: the attributes of the worker_pool to get
+ *
+ * Obtain a worker_pool which has the same attributes as @attrs, bump the
+ * reference count and return it. If there already is a matching
+ * worker_pool, it will be used; otherwise, this function attempts to
+ * create a new one.
+ *
+ * Should be called with wq_pool_mutex held.
+ *
+ * Return: On success, a worker_pool with the same attributes as @attrs.
+ * On failure, %NULL.
+ */
+static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
+{
+ u32 hash = wqattrs_hash(attrs);
+ struct worker_pool *pool;
+ int node;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ /* do we already have a matching pool? */
+ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
+ if (wqattrs_equal(pool->attrs, attrs)) {
+ pool->refcnt++;
+ goto out_unlock;
+ }
+ }
+
+ /* nope, create a new one */
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool || init_worker_pool(pool) < 0)
+ goto fail;
+
+ lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
+ copy_workqueue_attrs(pool->attrs, attrs);
+
+ /*
+ * no_numa isn't a worker_pool attribute, always clear it. See
+ * 'struct workqueue_attrs' comments for detail.
+ */
+ pool->attrs->no_numa = false;
+
+ /* if cpumask is contained inside a NUMA node, we belong to that node */
+ if (wq_numa_enabled) {
+ for_each_node(node) {
+ if (cpumask_subset(pool->attrs->cpumask,
+ wq_numa_possible_cpumask[node])) {
+ pool->node = node;
+ break;
+ }
+ }
+ }
+
+ if (worker_pool_assign_id(pool) < 0)
+ goto fail;
+
+ /* create and start the initial worker */
+ if (create_and_start_worker(pool) < 0)
+ goto fail;
+
+ /* install */
+ hash_add(unbound_pool_hash, &pool->hash_node, hash);
+out_unlock:
+ return pool;
+fail:
+ if (pool)
+ put_unbound_pool(pool);
+ return NULL;
+}
+
+static void rcu_free_pwq(struct rcu_head *rcu)
+{
+ kmem_cache_free(pwq_cache,
+ container_of(rcu, struct pool_workqueue, rcu));
+}
+
+/*
+ * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
+ * and needs to be destroyed.
+ */
+static void pwq_unbound_release_workfn(struct work_struct *work)
+{
+ struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
+ unbound_release_work);
+ struct workqueue_struct *wq = pwq->wq;
+ struct worker_pool *pool = pwq->pool;
+ bool is_last;
+
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
+
+ /*
+ * Unlink @pwq. Synchronization against wq->mutex isn't strictly
+ * necessary on release but do it anyway. It's easier to verify
+ * and consistent with the linking path.
+ */
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+
+ mutex_lock(&wq_pool_mutex);
+ put_unbound_pool(pool);
+ mutex_unlock(&wq_pool_mutex);
+
+ call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+
+ /*
+ * If we're the last pwq going away, @wq is already dead and no one
+ * is gonna access it anymore. Free it.
+ */
+ if (is_last) {
+ free_workqueue_attrs(wq->unbound_attrs);
+ kfree(wq);
+ }
+}
+
+/**
+ * pwq_adjust_max_active - update a pwq's max_active to the current setting
+ * @pwq: target pool_workqueue
+ *
+ * If @pwq isn't freezing, set @pwq->max_active to the associated
+ * workqueue's saved_max_active and activate delayed work items
+ * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
+ */
+static void pwq_adjust_max_active(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ bool freezable = wq->flags & WQ_FREEZABLE;
+
+ /* for @wq->saved_max_active */
+ lockdep_assert_held(&wq->mutex);
+
+ /* fast exit for non-freezable wqs */
+ if (!freezable && pwq->max_active == wq->saved_max_active)
+ return;
+
+ spin_lock_irq(&pwq->pool->lock);
+
+ /*
+ * During [un]freezing, the caller is responsible for ensuring that
+ * this function is called at least once after @workqueue_freezing
+ * is updated and visible.
+ */
+ if (!freezable || !workqueue_freezing) {
+ pwq->max_active = wq->saved_max_active;
+
+ while (!list_empty(&pwq->delayed_works) &&
+ pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
/*
- * Allocate enough room to align cwq and put an extra
- * pointer at the end pointing back to the originally
- * allocated pointer which will be used for free.
+ * Need to kick a worker after thawed or an unbound wq's
+ * max_active is bumped. It's a slow path. Do it always.
*/
- ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
- if (ptr) {
- wq->cpu_wq.single = PTR_ALIGN(ptr, align);
- *(void **)(wq->cpu_wq.single + 1) = ptr;
+ wake_up_worker(pwq->pool);
+ } else {
+ pwq->max_active = 0;
+ }
+
+ spin_unlock_irq(&pwq->pool->lock);
+}
+
+/* initialize newly alloced @pwq which is associated with @wq and @pool */
+static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
+ struct worker_pool *pool)
+{
+ BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+
+ memset(pwq, 0, sizeof(*pwq));
+
+ pwq->pool = pool;
+ pwq->wq = wq;
+ pwq->flush_color = -1;
+ pwq->refcnt = 1;
+ INIT_LIST_HEAD(&pwq->delayed_works);
+ INIT_LIST_HEAD(&pwq->pwqs_node);
+ INIT_LIST_HEAD(&pwq->mayday_node);
+ INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
+}
+
+/* sync @pwq with the current state of its associated wq and link it */
+static void link_pwq(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+
+ lockdep_assert_held(&wq->mutex);
+
+ /* may be called multiple times, ignore if already linked */
+ if (!list_empty(&pwq->pwqs_node))
+ return;
+
+ /*
+ * Set the matching work_color. This is synchronized with
+ * wq->mutex to avoid confusing flush_workqueue().
+ */
+ pwq->work_color = wq->work_color;
+
+ /* sync max_active to the current setting */
+ pwq_adjust_max_active(pwq);
+
+ /* link in @pwq */
+ list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
+}
+
+/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
+static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ pool = get_unbound_pool(attrs);
+ if (!pool)
+ return NULL;
+
+ pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
+ if (!pwq) {
+ put_unbound_pool(pool);
+ return NULL;
+ }
+
+ init_pwq(pwq, wq, pool);
+ return pwq;
+}
+
+/* undo alloc_unbound_pwq(), used only in the error path */
+static void free_unbound_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (pwq) {
+ put_unbound_pool(pwq->pool);
+ kmem_cache_free(pwq_cache, pwq);
+ }
+}
+
+/**
+ * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of interest
+ * @node: the target NUMA node
+ * @cpu_going_down: if >= 0, the CPU to consider as offline
+ * @cpumask: outarg, the resulting cpumask
+ *
+ * Calculate the cpumask a workqueue with @attrs should use on @node. If
+ * @cpu_going_down is >= 0, that cpu is considered offline during
+ * calculation. The result is stored in @cpumask.
+ *
+ * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
+ * enabled and @node has online CPUs requested by @attrs, the returned
+ * cpumask is the intersection of the possible CPUs of @node and
+ * @attrs->cpumask.
+ *
+ * The caller is responsible for ensuring that the cpumask of @node stays
+ * stable.
+ *
+ * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
+ */
+static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
+ int cpu_going_down, cpumask_t *cpumask)
+{
+ if (!wq_numa_enabled || attrs->no_numa)
+ goto use_dfl;
+
+ /* does @node have any online CPUs @attrs wants? */
+ cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+ if (cpu_going_down >= 0)
+ cpumask_clear_cpu(cpu_going_down, cpumask);
+
+ if (cpumask_empty(cpumask))
+ goto use_dfl;
+
+ /* yeap, return possible CPUs in @node that @attrs wants */
+ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ return !cpumask_equal(cpumask, attrs->cpumask);
+
+use_dfl:
+ cpumask_copy(cpumask, attrs->cpumask);
+ return false;
+}
+
+/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
+static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
+ int node,
+ struct pool_workqueue *pwq)
+{
+ struct pool_workqueue *old_pwq;
+
+ lockdep_assert_held(&wq->mutex);
+
+ /* link_pwq() can handle duplicate calls */
+ link_pwq(pwq);
+
+ old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+ rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+ return old_pwq;
+}
+
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on. Older pwqs are released as in-flight work
+ * items finish. Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct workqueue_attrs *new_attrs, *tmp_attrs;
+ struct pool_workqueue **pwq_tbl, *dfl_pwq;
+ int node, ret;
+
+ /* only unbound workqueues can change attributes */
+ if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+ return -EINVAL;
+
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+ return -EINVAL;
+
+ pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!pwq_tbl || !new_attrs || !tmp_attrs)
+ goto enomem;
+
+ /* make a copy of @attrs and sanitize it */
+ copy_workqueue_attrs(new_attrs, attrs);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+
+ /*
+ * We may create multiple pwqs with differing cpumasks. Make a
+ * copy of @new_attrs which will be modified and used to obtain
+ * pools.
+ */
+ copy_workqueue_attrs(tmp_attrs, new_attrs);
+
+ /*
+ * CPUs should stay stable across pwq creations and installations.
+ * Pin CPUs, determine the target cpumask for each node and create
+ * pwqs accordingly.
+ */
+ get_online_cpus();
+
+ mutex_lock(&wq_pool_mutex);
+
+ /*
+ * If something goes wrong during CPU up/down, we'll fall back to
+ * the default pwq covering whole @attrs->cpumask. Always create
+ * it even if we don't use it immediately.
+ */
+ dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+ if (!dfl_pwq)
+ goto enomem_pwq;
+
+ for_each_node(node) {
+ if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+ pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+ if (!pwq_tbl[node])
+ goto enomem_pwq;
+ } else {
+ dfl_pwq->refcnt++;
+ pwq_tbl[node] = dfl_pwq;
}
}
- /* just in case, make sure it's actually aligned */
- BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
- return wq->cpu_wq.v ? 0 : -ENOMEM;
+ mutex_unlock(&wq_pool_mutex);
+
+ /* all pwqs have been created successfully, let's install'em */
+ mutex_lock(&wq->mutex);
+
+ copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+
+ /* save the previous pwq and install the new one */
+ for_each_node(node)
+ pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+
+ /* @dfl_pwq might not have been used, ensure it's linked */
+ link_pwq(dfl_pwq);
+ swap(wq->dfl_pwq, dfl_pwq);
+
+ mutex_unlock(&wq->mutex);
+
+ /* put the old pwqs */
+ for_each_node(node)
+ put_pwq_unlocked(pwq_tbl[node]);
+ put_pwq_unlocked(dfl_pwq);
+
+ put_online_cpus();
+ ret = 0;
+ /* fall through */
+out_free:
+ free_workqueue_attrs(tmp_attrs);
+ free_workqueue_attrs(new_attrs);
+ kfree(pwq_tbl);
+ return ret;
+
+enomem_pwq:
+ free_unbound_pwq(dfl_pwq);
+ for_each_node(node)
+ if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
+ free_unbound_pwq(pwq_tbl[node]);
+ mutex_unlock(&wq_pool_mutex);
+ put_online_cpus();
+enomem:
+ ret = -ENOMEM;
+ goto out_free;
+}
+
+/**
+ * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * @wq: the target workqueue
+ * @cpu: the CPU coming up or going down
+ * @online: whether @cpu is coming up or going down
+ *
+ * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
+ * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * @wq accordingly.
+ *
+ * If NUMA affinity can't be adjusted due to memory allocation failure, it
+ * falls back to @wq->dfl_pwq which may not be optimal but is always
+ * correct.
+ *
+ * Note that when the last allowed CPU of a NUMA node goes offline for a
+ * workqueue with a cpumask spanning multiple nodes, the workers which were
+ * already executing the work items for the workqueue will lose their CPU
+ * affinity and may execute on any CPU. This is similar to how per-cpu
+ * workqueues behave on CPU_DOWN. If a workqueue user wants strict
+ * affinity, it's the user's responsibility to flush the work item from
+ * CPU_DOWN_PREPARE.
+ */
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
+ bool online)
+{
+ int node = cpu_to_node(cpu);
+ int cpu_off = online ? -1 : cpu;
+ struct pool_workqueue *old_pwq = NULL, *pwq;
+ struct workqueue_attrs *target_attrs;
+ cpumask_t *cpumask;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+ return;
+
+ /*
+ * We don't wanna alloc/free wq_attrs for each wq for each CPU.
+ * Let's use a preallocated one. The following buf is protected by
+ * CPU hotplug exclusion.
+ */
+ target_attrs = wq_update_unbound_numa_attrs_buf;
+ cpumask = target_attrs->cpumask;
+
+ mutex_lock(&wq->mutex);
+ if (wq->unbound_attrs->no_numa)
+ goto out_unlock;
+
+ copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
+ pwq = unbound_pwq_by_node(wq, node);
+
+ /*
+ * Let's determine what needs to be done. If the target cpumask is
+ * different from wq's, we need to compare it to @pwq's and create
+ * a new one if they don't match. If the target cpumask equals
+ * wq's, the default pwq should be used.
+ */
+ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+ if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+ goto out_unlock;
+ } else {
+ goto use_dfl_pwq;
+ }
+
+ mutex_unlock(&wq->mutex);
+
+ /* create a new pwq */
+ pwq = alloc_unbound_pwq(wq, target_attrs);
+ if (!pwq) {
+ pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ wq->name);
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
+ }
+
+ /*
+ * Install the new pwq. As this function is called only from CPU
+ * hotplug callbacks and applying a new attrs is wrapped with
+ * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
+ * inbetween.
+ */
+ mutex_lock(&wq->mutex);
+ old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+ goto out_unlock;
+
+use_dfl_pwq:
+ spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ get_pwq(wq->dfl_pwq);
+ spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+out_unlock:
+ mutex_unlock(&wq->mutex);
+ put_pwq_unlocked(old_pwq);
}
-static void free_cwqs(struct workqueue_struct *wq)
+static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
-#ifdef CONFIG_SMP
- bool percpu = !(wq->flags & WQ_UNBOUND);
-#else
- bool percpu = false;
-#endif
+ bool highpri = wq->flags & WQ_HIGHPRI;
+ int cpu, ret;
+
+ if (!(wq->flags & WQ_UNBOUND)) {
+ wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
+ if (!wq->cpu_pwqs)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pool_workqueue *pwq =
+ per_cpu_ptr(wq->cpu_pwqs, cpu);
+ struct worker_pool *cpu_pools =
+ per_cpu(cpu_worker_pools, cpu);
+
+ init_pwq(pwq, wq, &cpu_pools[highpri]);
- if (percpu)
- free_percpu(wq->cpu_wq.pcpu);
- else if (wq->cpu_wq.single) {
- /* the pointer to free is stored right after the cwq */
- kfree(*(void **)(wq->cpu_wq.single + 1));
+ mutex_lock(&wq->mutex);
+ link_pwq(pwq);
+ mutex_unlock(&wq->mutex);
+ }
+ return 0;
+ } else if (wq->flags & __WQ_ORDERED) {
+ ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ /* there should only be single pwq for ordering guarantee */
+ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ "ordering guarantee broken for workqueue %s\n", wq->name);
+ return ret;
+ } else {
+ return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
}
@@ -2947,9 +4057,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
if (max_active < 1 || max_active > lim)
- printk(KERN_WARNING "workqueue: max_active %d requested for %s "
- "is out of range, clamping between %d and %d\n",
- max_active, name, 1, lim);
+ pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+ max_active, name, 1, lim);
return clamp_val(max_active, 1, lim);
}
@@ -2960,37 +4069,32 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct lock_class_key *key,
const char *lock_name, ...)
{
- va_list args, args1;
+ size_t tbl_size = 0;
+ va_list args;
struct workqueue_struct *wq;
- unsigned int cpu;
- size_t namelen;
+ struct pool_workqueue *pwq;
- /* determine namelen, allocate wq and format name */
- va_start(args, lock_name);
- va_copy(args1, args);
- namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+ /* see the comment above the definition of WQ_POWER_EFFICIENT */
+ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+ flags |= WQ_UNBOUND;
- wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
- if (!wq)
- goto err;
+ /* allocate wq and format name */
+ if (flags & WQ_UNBOUND)
+ tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
- vsnprintf(wq->name, namelen, fmt, args1);
- va_end(args);
- va_end(args1);
+ wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
+ if (!wq)
+ return NULL;
- /*
- * Workqueues which may be used during memory reclaim should
- * have a rescuer to guarantee forward progress.
- */
- if (flags & WQ_MEM_RECLAIM)
- flags |= WQ_RESCUER;
+ if (flags & WQ_UNBOUND) {
+ wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!wq->unbound_attrs)
+ goto err_free_wq;
+ }
- /*
- * Unbound workqueues aren't concurrency managed and should be
- * dispatched to workers immediately.
- */
- if (flags & WQ_UNBOUND)
- flags |= WQ_HIGHPRI;
+ va_start(args, lock_name);
+ vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ va_end(args);
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -2998,71 +4102,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
/* init wq */
wq->flags = flags;
wq->saved_max_active = max_active;
- mutex_init(&wq->flush_mutex);
- atomic_set(&wq->nr_cwqs_to_flush, 0);
+ mutex_init(&wq->mutex);
+ atomic_set(&wq->nr_pwqs_to_flush, 0);
+ INIT_LIST_HEAD(&wq->pwqs);
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
+ INIT_LIST_HEAD(&wq->maydays);
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
- if (alloc_cwqs(wq) < 0)
- goto err;
-
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = get_gcwq(cpu);
-
- BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
- cwq->gcwq = gcwq;
- cwq->wq = wq;
- cwq->flush_color = -1;
- cwq->max_active = max_active;
- INIT_LIST_HEAD(&cwq->delayed_works);
- }
+ if (alloc_and_link_pwqs(wq) < 0)
+ goto err_free_wq;
- if (flags & WQ_RESCUER) {
+ /*
+ * Workqueues which may be used during memory reclaim should
+ * have a rescuer to guarantee forward progress.
+ */
+ if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
- if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
- goto err;
-
- wq->rescuer = rescuer = alloc_worker();
+ rescuer = alloc_worker();
if (!rescuer)
- goto err;
+ goto err_destroy;
- rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
wq->name);
- if (IS_ERR(rescuer->task))
- goto err;
+ if (IS_ERR(rescuer->task)) {
+ kfree(rescuer);
+ goto err_destroy;
+ }
- rescuer->task->flags |= PF_THREAD_BOUND;
+ wq->rescuer = rescuer;
+ rescuer->task->flags |= PF_NO_SETAFFINITY;
wake_up_process(rescuer->task);
}
+ if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
+ goto err_destroy;
+
/*
- * workqueue_lock protects global freeze state and workqueues
- * list. Grab it, set max_active accordingly and add the new
- * workqueue to workqueues list.
+ * wq_pool_mutex protects global freeze state and workqueues list.
+ * Grab it, adjust max_active and add the new @wq to workqueues
+ * list.
*/
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
- if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
- for_each_cwq_cpu(cpu, wq)
- get_cwq(cpu, wq)->max_active = 0;
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
list_add(&wq->list, &workqueues);
- spin_unlock(&workqueue_lock);
+ mutex_unlock(&wq_pool_mutex);
return wq;
-err:
- if (wq) {
- free_cwqs(wq);
- free_mayday_mask(wq->mayday_mask);
- kfree(wq->rescuer);
- kfree(wq);
- }
+
+err_free_wq:
+ free_workqueue_attrs(wq->unbound_attrs);
+ kfree(wq);
+ return NULL;
+err_destroy:
+ destroy_workqueue(wq);
return NULL;
}
EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
@@ -3075,38 +4178,76 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
- unsigned int cpu;
+ struct pool_workqueue *pwq;
+ int node;
/* drain it before proceeding with destruction */
drain_workqueue(wq);
+ /* sanity checks */
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq) {
+ int i;
+
+ for (i = 0; i < WORK_NR_COLORS; i++) {
+ if (WARN_ON(pwq->nr_in_flight[i])) {
+ mutex_unlock(&wq->mutex);
+ return;
+ }
+ }
+
+ if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
+ WARN_ON(pwq->nr_active) ||
+ WARN_ON(!list_empty(&pwq->delayed_works))) {
+ mutex_unlock(&wq->mutex);
+ return;
+ }
+ }
+ mutex_unlock(&wq->mutex);
+
/*
* wq list is used to freeze wq, remove from list after
* flushing is complete in case freeze races us.
*/
- spin_lock(&workqueue_lock);
- list_del(&wq->list);
- spin_unlock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
+ list_del_init(&wq->list);
+ mutex_unlock(&wq_pool_mutex);
- /* sanity check */
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- int i;
-
- for (i = 0; i < WORK_NR_COLORS; i++)
- BUG_ON(cwq->nr_in_flight[i]);
- BUG_ON(cwq->nr_active);
- BUG_ON(!list_empty(&cwq->delayed_works));
- }
+ workqueue_sysfs_unregister(wq);
- if (wq->flags & WQ_RESCUER) {
+ if (wq->rescuer) {
kthread_stop(wq->rescuer->task);
- free_mayday_mask(wq->mayday_mask);
kfree(wq->rescuer);
+ wq->rescuer = NULL;
}
- free_cwqs(wq);
- kfree(wq);
+ if (!(wq->flags & WQ_UNBOUND)) {
+ /*
+ * The base ref is never dropped on per-cpu pwqs. Directly
+ * free the pwqs and wq.
+ */
+ free_percpu(wq->cpu_pwqs);
+ kfree(wq);
+ } else {
+ /*
+ * We're the sole accessor of @wq at this point. Directly
+ * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
+ * @wq will be freed when the last pwq is released.
+ */
+ for_each_node(node) {
+ pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+ RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
+ put_pwq_unlocked(pwq);
+ }
+
+ /*
+ * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
+ * put. Don't access it afterwards.
+ */
+ pwq = wq->dfl_pwq;
+ wq->dfl_pwq = NULL;
+ put_pwq_unlocked(pwq);
+ }
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
@@ -3122,29 +4263,39 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
- unsigned int cpu;
+ struct pool_workqueue *pwq;
+
+ /* disallow meddling with max_active for ordered workqueues */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq->mutex);
wq->saved_max_active = max_active;
- for_each_cwq_cpu(cpu, wq) {
- struct global_cwq *gcwq = get_gcwq(cpu);
-
- spin_lock_irq(&gcwq->lock);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
- if (!(wq->flags & WQ_FREEZABLE) ||
- !(gcwq->flags & GCWQ_FREEZING))
- get_cwq(gcwq->cpu, wq)->max_active = max_active;
+ mutex_unlock(&wq->mutex);
+}
+EXPORT_SYMBOL_GPL(workqueue_set_max_active);
- spin_unlock_irq(&gcwq->lock);
- }
+/**
+ * current_is_workqueue_rescuer - is %current workqueue rescuer?
+ *
+ * Determine whether %current is a workqueue rescuer. Can be used from
+ * work functions to determine whether it's being run off the rescuer task.
+ *
+ * Return: %true if %current is a workqueue rescuer. %false otherwise.
+ */
+bool current_is_workqueue_rescuer(void)
+{
+ struct worker *worker = current_wq_worker();
- spin_unlock(&workqueue_lock);
+ return worker && worker->rescue_wq;
}
-EXPORT_SYMBOL_GPL(workqueue_set_max_active);
/**
* workqueue_congested - test whether a workqueue is congested
@@ -3155,31 +4306,36 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
* no synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
*
- * RETURNS:
+ * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
+ * Note that both per-cpu and unbound workqueues may be associated with
+ * multiple pool_workqueues which have separate congested states. A
+ * workqueue being congested on one CPU doesn't mean the workqueue is also
+ * contested on other CPUs / NUMA nodes.
+ *
+ * Return:
* %true if congested, %false otherwise.
*/
-bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
+bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct pool_workqueue *pwq;
+ bool ret;
- return !list_empty(&cwq->delayed_works);
-}
-EXPORT_SYMBOL_GPL(workqueue_congested);
+ rcu_read_lock_sched();
-/**
- * work_cpu - return the last known associated cpu for @work
- * @work: the work of interest
- *
- * RETURNS:
- * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
- */
-unsigned int work_cpu(struct work_struct *work)
-{
- struct global_cwq *gcwq = get_work_gcwq(work);
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = smp_processor_id();
- return gcwq ? gcwq->cpu : WORK_CPU_NONE;
+ if (!(wq->flags & WQ_UNBOUND))
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ else
+ pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+
+ ret = !list_empty(&pwq->delayed_works);
+ rcu_read_unlock_sched();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(work_cpu);
+EXPORT_SYMBOL_GPL(workqueue_congested);
/**
* work_busy - test whether a work is currently pending or running
@@ -3188,424 +4344,375 @@ EXPORT_SYMBOL_GPL(work_cpu);
* Test whether @work is currently pending or running. There is no
* synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
- * Especially for reentrant wqs, the pending state might hide the
- * running state.
*
- * RETURNS:
+ * Return:
* OR'd bitmask of WORK_BUSY_* bits.
*/
unsigned int work_busy(struct work_struct *work)
{
- struct global_cwq *gcwq = get_work_gcwq(work);
+ struct worker_pool *pool;
unsigned long flags;
unsigned int ret = 0;
- if (!gcwq)
- return false;
-
- spin_lock_irqsave(&gcwq->lock, flags);
-
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
- if (find_worker_executing_work(gcwq, work))
- ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&gcwq->lock, flags);
+ local_irq_save(flags);
+ pool = get_work_pool(work);
+ if (pool) {
+ spin_lock(&pool->lock);
+ if (find_worker_executing_work(pool, work))
+ ret |= WORK_BUSY_RUNNING;
+ spin_unlock(&pool->lock);
+ }
+ local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL_GPL(work_busy);
-/*
- * CPU hotplug.
- *
- * There are two challenges in supporting CPU hotplug. Firstly, there
- * are a lot of assumptions on strong associations among work, cwq and
- * gcwq which make migrating pending and scheduled works very
- * difficult to implement without impacting hot paths. Secondly,
- * gcwqs serve mix of short, long and very long running works making
- * blocked draining impractical.
- *
- * This is solved by allowing a gcwq to be detached from CPU, running
- * it with unbound (rogue) workers and allowing it to be reattached
- * later if the cpu comes back online. A separate thread is created
- * to govern a gcwq in such state and is called the trustee of the
- * gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START Command state used on startup. On CPU_DOWN_PREPARE, a
- * new trustee is started with this state.
- *
- * IN_CHARGE Once started, trustee will enter this state after
- * assuming the manager role and making all existing
- * workers rogue. DOWN_PREPARE waits for trustee to
- * enter this state. After reaching IN_CHARGE, trustee
- * tries to execute the pending worklist until it's empty
- * and the state is set to BUTCHER, or the state is set
- * to RELEASE.
- *
- * BUTCHER Command state which is set by the cpu callback after
- * the cpu has went down. Once this state is set trustee
- * knows that there will be no new works on the worklist
- * and once the worklist is empty it can proceed to
- * killing idle workers.
- *
- * RELEASE Command state which is set by the cpu callback if the
- * cpu down has been canceled or it has come online
- * again. After recognizing this state, trustee stops
- * trying to drain or butcher and clears ROGUE, rebinds
- * all remaining workers back to the cpu and releases
- * manager role.
- *
- * DONE Trustee will enter this state after BUTCHER or RELEASE
- * is complete.
- *
- * trustee CPU draining
- * took over down complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- * | | ^
- * | CPU is back online v return workers |
- * ----------------> RELEASE --------------
- */
-
/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use. Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
+ * set_worker_desc - set description for the current work item
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * This function can be called by a running work function to describe what
+ * the work item is about. If the worker task gets dumped, this
+ * information will be printed out together to help debugging. The
+ * description can be at most WORKER_DESC_LEN including the trailing '\0'.
*/
-#define trustee_wait_event_timeout(cond, timeout) ({ \
- long __ret = (timeout); \
- while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
- __ret) { \
- spin_unlock_irq(&gcwq->lock); \
- __wait_event_timeout(gcwq->trustee_wait, (cond) || \
- (gcwq->trustee_state == TRUSTEE_RELEASE), \
- __ret); \
- spin_lock_irq(&gcwq->lock); \
- } \
- gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
-})
+void set_worker_desc(const char *fmt, ...)
+{
+ struct worker *worker = current_wq_worker();
+ va_list args;
+
+ if (worker) {
+ va_start(args, fmt);
+ vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
+ va_end(args);
+ worker->desc_valid = true;
+ }
+}
/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
+ * print_worker_info - print out worker information and description
+ * @log_lvl: the log level to use when printing
+ * @task: target task
*
- * wait_event() for trustee to use. Automatically handles locking and
- * checks for CANCEL request.
+ * If @task is a worker and currently executing a work item, print out the
+ * name of the workqueue being serviced and worker description set with
+ * set_worker_desc() by the currently executing work item.
*
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
+ * This function can be safely called on any task as long as the
+ * task_struct itself is accessible. While safe, this function isn't
+ * synchronized and may print out mixups or garbages of limited length.
*/
-#define trustee_wait_event(cond) ({ \
- long __ret1; \
- __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
- __ret1 < 0 ? -1 : 0; \
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+void print_worker_info(const char *log_lvl, struct task_struct *task)
{
- struct global_cwq *gcwq = __gcwq;
+ work_func_t *fn = NULL;
+ char name[WQ_NAME_LEN] = { };
+ char desc[WORKER_DESC_LEN] = { };
+ struct pool_workqueue *pwq = NULL;
+ struct workqueue_struct *wq = NULL;
+ bool desc_valid = false;
struct worker *worker;
- struct work_struct *work;
- struct hlist_node *pos;
- long rc;
- int i;
- BUG_ON(gcwq->cpu != smp_processor_id());
+ if (!(task->flags & PF_WQ_WORKER))
+ return;
- spin_lock_irq(&gcwq->lock);
/*
- * Claim the manager position and make all workers rogue.
- * Trustee must be bound to the target cpu and can't be
- * cancelled.
+ * This function is called without any synchronization and @task
+ * could be in any state. Be careful with dereferences.
*/
- BUG_ON(gcwq->cpu != smp_processor_id());
- rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
- BUG_ON(rc < 0);
-
- gcwq->flags |= GCWQ_MANAGING_WORKERS;
-
- list_for_each_entry(worker, &gcwq->idle_list, entry)
- worker->flags |= WORKER_ROGUE;
-
- for_each_busy_worker(worker, i, pos, gcwq)
- worker->flags |= WORKER_ROGUE;
+ worker = probe_kthread_data(task);
/*
- * Call schedule() so that we cross rq->lock and thus can
- * guarantee sched callbacks see the rogue flag. This is
- * necessary as scheduler callbacks may be invoked from other
- * cpus.
+ * Carefully copy the associated workqueue's workfn and name. Keep
+ * the original last '\0' in case the original contains garbage.
*/
- spin_unlock_irq(&gcwq->lock);
- schedule();
- spin_lock_irq(&gcwq->lock);
+ probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
+ probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
+ probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
+ probe_kernel_read(name, wq->name, sizeof(name) - 1);
+
+ /* copy worker description */
+ probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
+ if (desc_valid)
+ probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+
+ if (fn || name[0] || desc[0]) {
+ printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
+ if (desc[0])
+ pr_cont(" (%s)", desc);
+ pr_cont("\n");
+ }
+}
- /*
- * Sched callbacks are disabled now. Zap nr_running. After
- * this, nr_running stays zero and need_more_worker() and
- * keep_working() are always true as long as the worklist is
- * not empty.
- */
- atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+/*
+ * CPU hotplug.
+ *
+ * There are two challenges in supporting CPU hotplug. Firstly, there
+ * are a lot of assumptions on strong associations among work, pwq and
+ * pool which make migrating pending and scheduled works very
+ * difficult to implement without impacting hot paths. Secondly,
+ * worker pools serve mix of short, long and very long running works making
+ * blocked draining impractical.
+ *
+ * This is solved by allowing the pools to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
+ */
- spin_unlock_irq(&gcwq->lock);
- del_timer_sync(&gcwq->idle_timer);
- spin_lock_irq(&gcwq->lock);
+static void wq_unbind_fn(struct work_struct *work)
+{
+ int cpu = smp_processor_id();
+ struct worker_pool *pool;
+ struct worker *worker;
- /*
- * We're now in charge. Notify and proceed to drain. We need
- * to keep the gcwq running during the whole CPU down
- * procedure as other cpu hotunplug callbacks may need to
- * flush currently running tasks.
- */
- gcwq->trustee_state = TRUSTEE_IN_CHARGE;
- wake_up_all(&gcwq->trustee_wait);
+ for_each_cpu_worker_pool(pool, cpu) {
+ WARN_ON_ONCE(cpu != smp_processor_id());
- /*
- * The original cpu is in the process of dying and may go away
- * anytime now. When that happens, we and all workers would
- * be migrated to other cpus. Try draining any left work. We
- * want to get it over with ASAP - spam rescuers, wake up as
- * many idlers as necessary and create new ones till the
- * worklist is empty. Note that if the gcwq is frozen, there
- * may be frozen works in freezable cwqs. Don't declare
- * completion while frozen.
- */
- while (gcwq->nr_workers != gcwq->nr_idle ||
- gcwq->flags & GCWQ_FREEZING ||
- gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
- int nr_works = 0;
+ mutex_lock(&pool->attach_mutex);
+ spin_lock_irq(&pool->lock);
- list_for_each_entry(work, &gcwq->worklist, entry) {
- send_mayday(work);
- nr_works++;
- }
+ /*
+ * We've blocked all attach/detach operations. Make all workers
+ * unbound and set DISASSOCIATED. Before this, all workers
+ * except for the ones which are still executing works from
+ * before the last CPU down must be on the cpu. After
+ * this, they may become diasporas.
+ */
+ for_each_pool_worker(worker, pool)
+ worker->flags |= WORKER_UNBOUND;
- list_for_each_entry(worker, &gcwq->idle_list, entry) {
- if (!nr_works--)
- break;
- wake_up_process(worker->task);
- }
+ pool->flags |= POOL_DISASSOCIATED;
- if (need_to_create_worker(gcwq)) {
- spin_unlock_irq(&gcwq->lock);
- worker = create_worker(gcwq, false);
- spin_lock_irq(&gcwq->lock);
- if (worker) {
- worker->flags |= WORKER_ROGUE;
- start_worker(worker);
- }
- }
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->attach_mutex);
- /* give a breather */
- if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
- break;
+ /*
+ * Call schedule() so that we cross rq->lock and thus can
+ * guarantee sched callbacks see the %WORKER_UNBOUND flag.
+ * This is necessary as scheduler callbacks may be invoked
+ * from other cpus.
+ */
+ schedule();
+
+ /*
+ * Sched callbacks are disabled now. Zap nr_running.
+ * After this, nr_running stays zero and need_more_worker()
+ * and keep_working() are always true as long as the
+ * worklist is not empty. This pool now behaves as an
+ * unbound (in terms of concurrency management) pool which
+ * are served by workers tied to the pool.
+ */
+ atomic_set(&pool->nr_running, 0);
+
+ /*
+ * With concurrency management just turned off, a busy
+ * worker blocking could lead to lengthy stalls. Kick off
+ * unbound chain execution of currently pending work items.
+ */
+ spin_lock_irq(&pool->lock);
+ wake_up_worker(pool);
+ spin_unlock_irq(&pool->lock);
}
+}
- /*
- * Either all works have been scheduled and cpu is down, or
- * cpu down has already been canceled. Wait for and butcher
- * all workers till we're canceled.
- */
- do {
- rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
- while (!list_empty(&gcwq->idle_list))
- destroy_worker(list_first_entry(&gcwq->idle_list,
- struct worker, entry));
- } while (gcwq->nr_workers && rc >= 0);
+/**
+ * rebind_workers - rebind all workers of a pool to the associated CPU
+ * @pool: pool of interest
+ *
+ * @pool->cpu is coming online. Rebind all workers to the CPU.
+ */
+static void rebind_workers(struct worker_pool *pool)
+{
+ struct worker *worker;
+
+ lockdep_assert_held(&pool->attach_mutex);
/*
- * At this point, either draining has completed and no worker
- * is left, or cpu down has been canceled or the cpu is being
- * brought back up. There shouldn't be any idle one left.
- * Tell the remaining busy ones to rebind once it finishes the
- * currently scheduled works by scheduling the rebind_work.
+ * Restore CPU affinity of all workers. As all idle workers should
+ * be on the run-queue of the associated CPU before any local
+ * wake-ups for concurrency management happen, restore CPU affinty
+ * of all workers first and then clear UNBOUND. As we're called
+ * from CPU_ONLINE, the following shouldn't fail.
*/
- WARN_ON(!list_empty(&gcwq->idle_list));
+ for_each_pool_worker(worker, pool)
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
+
+ spin_lock_irq(&pool->lock);
- for_each_busy_worker(worker, i, pos, gcwq) {
- struct work_struct *rebind_work = &worker->rebind_work;
+ for_each_pool_worker(worker, pool) {
+ unsigned int worker_flags = worker->flags;
/*
- * Rebind_work may race with future cpu hotplug
- * operations. Use a separate flag to mark that
- * rebinding is scheduled.
+ * A bound idle worker should actually be on the runqueue
+ * of the associated CPU for local wake-ups targeting it to
+ * work. Kick all idle workers so that they migrate to the
+ * associated CPU. Doing this in the same loop as
+ * replacing UNBOUND with REBOUND is safe as no worker will
+ * be bound before @pool->lock is released.
*/
- worker->flags |= WORKER_REBIND;
- worker->flags &= ~WORKER_ROGUE;
-
- /* queue rebind_work, wq doesn't matter, use the default one */
- if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
- work_data_bits(rebind_work)))
- continue;
+ if (worker_flags & WORKER_IDLE)
+ wake_up_process(worker->task);
- debug_work_activate(rebind_work);
- insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
- worker->scheduled.next,
- work_color_to_flags(WORK_NO_COLOR));
+ /*
+ * We want to clear UNBOUND but can't directly call
+ * worker_clr_flags() or adjust nr_running. Atomically
+ * replace UNBOUND with another NOT_RUNNING flag REBOUND.
+ * @worker will clear REBOUND using worker_clr_flags() when
+ * it initiates the next execution cycle thus restoring
+ * concurrency management. Note that when or whether
+ * @worker clears REBOUND doesn't affect correctness.
+ *
+ * ACCESS_ONCE() is necessary because @worker->flags may be
+ * tested without holding any lock in
+ * wq_worker_waking_up(). Without it, NOT_RUNNING test may
+ * fail incorrectly leading to premature concurrency
+ * management operations.
+ */
+ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
+ worker_flags |= WORKER_REBOUND;
+ worker_flags &= ~WORKER_UNBOUND;
+ ACCESS_ONCE(worker->flags) = worker_flags;
}
- /* relinquish manager role */
- gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-
- /* notify completion */
- gcwq->trustee = NULL;
- gcwq->trustee_state = TRUSTEE_DONE;
- wake_up_all(&gcwq->trustee_wait);
- spin_unlock_irq(&gcwq->lock);
- return 0;
+ spin_unlock_irq(&pool->lock);
}
/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state. DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. To be used by cpu_callback.
+ * restore_unbound_workers_cpumask - restore cpumask of unbound workers
+ * @pool: unbound pool of interest
+ * @cpu: the CPU which is coming up
+ *
+ * An unbound pool may end up with a cpumask which doesn't have any online
+ * CPUs. When a worker of such pool get scheduled, the scheduler resets
+ * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
+ * online CPU before, cpus_allowed of all its workers should be restored.
*/
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
+static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
- if (!(gcwq->trustee_state == state ||
- gcwq->trustee_state == TRUSTEE_DONE)) {
- spin_unlock_irq(&gcwq->lock);
- __wait_event(gcwq->trustee_wait,
- gcwq->trustee_state == state ||
- gcwq->trustee_state == TRUSTEE_DONE);
- spin_lock_irq(&gcwq->lock);
- }
+ static cpumask_t cpumask;
+ struct worker *worker;
+
+ lockdep_assert_held(&pool->attach_mutex);
+
+ /* is @cpu allowed for @pool? */
+ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
+ return;
+
+ /* is @cpu the only online CPU? */
+ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+ if (cpumask_weight(&cpumask) != 1)
+ return;
+
+ /* as we're called from CPU_ONLINE, the following shouldn't fail */
+ for_each_pool_worker(worker, pool)
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
}
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered high priority CPU notifier.
+ */
+static int workqueue_cpu_up_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct task_struct *new_trustee = NULL;
- struct worker *uninitialized_var(new_worker);
- unsigned long flags;
-
- action &= ~CPU_TASKS_FROZEN;
+ int cpu = (unsigned long)hcpu;
+ struct worker_pool *pool;
+ struct workqueue_struct *wq;
+ int pi;
- switch (action) {
- case CPU_DOWN_PREPARE:
- new_trustee = kthread_create(trustee_thread, gcwq,
- "workqueue_trustee/%d\n", cpu);
- if (IS_ERR(new_trustee))
- return notifier_from_errno(PTR_ERR(new_trustee));
- kthread_bind(new_trustee, cpu);
- /* fall through */
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- BUG_ON(gcwq->first_idle);
- new_worker = create_worker(gcwq, false);
- if (!new_worker) {
- if (new_trustee)
- kthread_stop(new_trustee);
- return NOTIFY_BAD;
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (pool->nr_workers)
+ continue;
+ if (create_and_start_worker(pool) < 0)
+ return NOTIFY_BAD;
}
- }
+ break;
- /* some are called w/ irq disabled, don't disturb irq status */
- spin_lock_irqsave(&gcwq->lock, flags);
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ mutex_lock(&wq_pool_mutex);
- switch (action) {
- case CPU_DOWN_PREPARE:
- /* initialize trustee and tell it to acquire the gcwq */
- BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
- gcwq->trustee = new_trustee;
- gcwq->trustee_state = TRUSTEE_START;
- wake_up_process(gcwq->trustee);
- wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
- /* fall through */
- case CPU_UP_PREPARE:
- BUG_ON(gcwq->first_idle);
- gcwq->first_idle = new_worker;
- break;
+ for_each_pool(pool, pi) {
+ mutex_lock(&pool->attach_mutex);
- case CPU_DYING:
- /*
- * Before this, the trustee and all workers except for
- * the ones which are still executing works from
- * before the last CPU down must be on the cpu. After
- * this, they'll all be diasporas.
- */
- gcwq->flags |= GCWQ_DISASSOCIATED;
- break;
+ if (pool->cpu == cpu) {
+ spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
+ spin_unlock_irq(&pool->lock);
- case CPU_POST_DEAD:
- gcwq->trustee_state = TRUSTEE_BUTCHER;
- /* fall through */
- case CPU_UP_CANCELED:
- destroy_worker(gcwq->first_idle);
- gcwq->first_idle = NULL;
- break;
+ rebind_workers(pool);
+ } else if (pool->cpu < 0) {
+ restore_unbound_workers_cpumask(pool, cpu);
+ }
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- gcwq->flags &= ~GCWQ_DISASSOCIATED;
- if (gcwq->trustee_state != TRUSTEE_DONE) {
- gcwq->trustee_state = TRUSTEE_RELEASE;
- wake_up_process(gcwq->trustee);
- wait_trustee_state(gcwq, TRUSTEE_DONE);
+ mutex_unlock(&pool->attach_mutex);
}
- /*
- * Trustee is done and there might be no worker left.
- * Put the first_idle in and request a real manager to
- * take a look.
- */
- spin_unlock_irq(&gcwq->lock);
- kthread_bind(gcwq->first_idle->task, cpu);
- spin_lock_irq(&gcwq->lock);
- gcwq->flags |= GCWQ_MANAGE_WORKERS;
- start_worker(gcwq->first_idle);
- gcwq->first_idle = NULL;
+ /* update NUMA affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, true);
+
+ mutex_unlock(&wq_pool_mutex);
break;
}
+ return NOTIFY_OK;
+}
- spin_unlock_irqrestore(&gcwq->lock, flags);
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int workqueue_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ struct work_struct unbind_work;
+ struct workqueue_struct *wq;
- return notifier_from_errno(0);
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /* unbinding per-cpu workers should happen on the local CPU */
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+ queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+ /* update NUMA affinity of unbound workqueues */
+ mutex_lock(&wq_pool_mutex);
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, false);
+ mutex_unlock(&wq_pool_mutex);
+
+ /* wait for per-cpu unbinding to finish */
+ flush_work(&unbind_work);
+ destroy_work_on_stack(&unbind_work);
+ break;
+ }
+ return NOTIFY_OK;
}
#ifdef CONFIG_SMP
struct work_for_cpu {
- struct completion completion;
+ struct work_struct work;
long (*fn)(void *);
void *arg;
long ret;
};
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
{
- struct work_for_cpu *wfc = _wfc;
+ struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
wfc->ret = wfc->fn(wfc->arg);
- complete(&wfc->completion);
- return 0;
}
/**
@@ -3614,25 +4721,19 @@ static int do_work_for_cpu(void *_wfc)
* @fn: the function to run
* @arg: the function arg
*
- * This will return the value @fn returns.
* It is up to the caller to ensure that the cpu doesn't go offline.
* The caller must not hold any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
*/
-long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
- struct task_struct *sub_thread;
- struct work_for_cpu wfc = {
- .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
- .fn = fn,
- .arg = arg,
- };
+ struct work_for_cpu wfc = { .fn = fn, .arg = arg };
- sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
- if (IS_ERR(sub_thread))
- return PTR_ERR(sub_thread);
- kthread_bind(sub_thread, cpu);
- wake_up_process(sub_thread);
- wait_for_completion(&wfc.completion);
+ INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ destroy_work_on_stack(&wfc.work);
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -3644,41 +4745,30 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
* freeze_workqueues_begin - begin freezing workqueues
*
* Start freezing workqueues. After this function returns, all freezable
- * workqueues will queue new works to their frozen_works list instead of
- * gcwq->worklist.
+ * workqueues will queue new works to their delayed_works list instead of
+ * pool->worklist.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
*/
void freeze_workqueues_begin(void)
{
- unsigned int cpu;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
- BUG_ON(workqueue_freezing);
+ WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct workqueue_struct *wq;
-
- spin_lock_irq(&gcwq->lock);
-
- BUG_ON(gcwq->flags & GCWQ_FREEZING);
- gcwq->flags |= GCWQ_FREEZING;
-
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-
- if (cwq && wq->flags & WQ_FREEZABLE)
- cwq->max_active = 0;
- }
-
- spin_unlock_irq(&gcwq->lock);
+ list_for_each_entry(wq, &workqueues, list) {
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
}
- spin_unlock(&workqueue_lock);
+ mutex_unlock(&wq_pool_mutex);
}
/**
@@ -3688,42 +4778,42 @@ void freeze_workqueues_begin(void)
* between freeze_workqueues_begin() and thaw_workqueues().
*
* CONTEXT:
- * Grabs and releases workqueue_lock.
+ * Grabs and releases wq_pool_mutex.
*
- * RETURNS:
+ * Return:
* %true if some freezable workqueues are still busy. %false if freezing
* is complete.
*/
bool freeze_workqueues_busy(void)
{
- unsigned int cpu;
bool busy = false;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
- BUG_ON(!workqueue_freezing);
+ WARN_ON_ONCE(!workqueue_freezing);
- for_each_gcwq_cpu(cpu) {
- struct workqueue_struct *wq;
+ list_for_each_entry(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_FREEZABLE))
+ continue;
/*
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-
- if (!cwq || !(wq->flags & WQ_FREEZABLE))
- continue;
-
- BUG_ON(cwq->nr_active < 0);
- if (cwq->nr_active) {
+ rcu_read_lock_sched();
+ for_each_pwq(pwq, wq) {
+ WARN_ON_ONCE(pwq->nr_active < 0);
+ if (pwq->nr_active) {
busy = true;
+ rcu_read_unlock_sched();
goto out_unlock;
}
}
+ rcu_read_unlock_sched();
}
out_unlock:
- spin_unlock(&workqueue_lock);
+ mutex_unlock(&wq_pool_mutex);
return busy;
}
@@ -3731,110 +4821,160 @@ out_unlock:
* thaw_workqueues - thaw workqueues
*
* Thaw workqueues. Normal queueing is restored and all collected
- * frozen works are transferred to their respective gcwq worklists.
+ * frozen works are transferred to their respective pool worklists.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
*/
void thaw_workqueues(void)
{
- unsigned int cpu;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
if (!workqueue_freezing)
goto out_unlock;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct workqueue_struct *wq;
+ workqueue_freezing = false;
- spin_lock_irq(&gcwq->lock);
+ /* restore max_active and repopulate worklist */
+ list_for_each_entry(wq, &workqueues, list) {
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
+ }
- BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
- gcwq->flags &= ~GCWQ_FREEZING;
+out_unlock:
+ mutex_unlock(&wq_pool_mutex);
+}
+#endif /* CONFIG_FREEZER */
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+static void __init wq_numa_init(void)
+{
+ cpumask_var_t *tbl;
+ int node, cpu;
- if (!cwq || !(wq->flags & WQ_FREEZABLE))
- continue;
+ /* determine NUMA pwq table len - highest node id + 1 */
+ for_each_node(node)
+ wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
- /* restore max_active and repopulate worklist */
- cwq->max_active = wq->saved_max_active;
+ if (num_possible_nodes() <= 1)
+ return;
- while (!list_empty(&cwq->delayed_works) &&
- cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
- }
+ if (wq_disable_numa) {
+ pr_info("workqueue: NUMA affinity support disabled\n");
+ return;
+ }
- wake_up_worker(gcwq);
+ wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+ BUG_ON(!wq_update_unbound_numa_attrs_buf);
- spin_unlock_irq(&gcwq->lock);
+ /*
+ * We want masks of possible CPUs of each node which isn't readily
+ * available. Build one from cpu_to_node() which should have been
+ * fully initialized by now.
+ */
+ tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+ BUG_ON(!tbl);
+
+ for_each_node(node)
+ BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE));
+
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
+ if (WARN_ON(node == NUMA_NO_NODE)) {
+ pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
+ /* happens iff arch is bonkers, let's just proceed */
+ return;
+ }
+ cpumask_set_cpu(cpu, tbl[node]);
}
- workqueue_freezing = false;
-out_unlock:
- spin_unlock(&workqueue_lock);
+ wq_numa_possible_cpumask = tbl;
+ wq_numa_enabled = true;
}
-#endif /* CONFIG_FREEZER */
static int __init init_workqueues(void)
{
- unsigned int cpu;
- int i;
+ int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int i, cpu;
- cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+ WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
- /* initialize gcwqs */
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- spin_lock_init(&gcwq->lock);
- INIT_LIST_HEAD(&gcwq->worklist);
- gcwq->cpu = cpu;
- gcwq->flags |= GCWQ_DISASSOCIATED;
+ cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+ hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
- INIT_LIST_HEAD(&gcwq->idle_list);
- for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
+ wq_numa_init();
- init_timer_deferrable(&gcwq->idle_timer);
- gcwq->idle_timer.function = idle_worker_timeout;
- gcwq->idle_timer.data = (unsigned long)gcwq;
+ /* initialize CPU pools */
+ for_each_possible_cpu(cpu) {
+ struct worker_pool *pool;
- setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
- (unsigned long)gcwq);
+ i = 0;
+ for_each_cpu_worker_pool(pool, cpu) {
+ BUG_ON(init_worker_pool(pool));
+ pool->cpu = cpu;
+ cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ pool->attrs->nice = std_nice[i++];
+ pool->node = cpu_to_node(cpu);
- ida_init(&gcwq->worker_ida);
-
- gcwq->trustee_state = TRUSTEE_DONE;
- init_waitqueue_head(&gcwq->trustee_wait);
+ /* alloc pool ID */
+ mutex_lock(&wq_pool_mutex);
+ BUG_ON(worker_pool_assign_id(pool));
+ mutex_unlock(&wq_pool_mutex);
+ }
}
/* create the initial worker */
- for_each_online_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct worker *worker;
+ for_each_online_cpu(cpu) {
+ struct worker_pool *pool;
- if (cpu != WORK_CPU_UNBOUND)
- gcwq->flags &= ~GCWQ_DISASSOCIATED;
- worker = create_worker(gcwq, true);
- BUG_ON(!worker);
- spin_lock_irq(&gcwq->lock);
- start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
+ for_each_cpu_worker_pool(pool, cpu) {
+ pool->flags &= ~POOL_DISASSOCIATED;
+ BUG_ON(create_and_start_worker(pool) < 0);
+ }
+ }
+
+ /* create default unbound and ordered wq attrs */
+ for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+ struct workqueue_attrs *attrs;
+
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ unbound_std_wq_attrs[i] = attrs;
+
+ /*
+ * An ordered wq should have only one pwq as ordering is
+ * guaranteed by max_active which is enforced by pwqs.
+ * Turn off NUMA so that dfl_pwq is used for all nodes.
+ */
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ attrs->no_numa = true;
+ ordered_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
- system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
- BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
- !system_unbound_wq || !system_freezable_wq);
+ system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+ WQ_POWER_EFFICIENT, 0);
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+ 0);
+ BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq ||
+ !system_power_efficient_wq ||
+ !system_freezable_power_efficient_wq);
return 0;
}
early_initcall(init_workqueues);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 00000000000..45215870ac6
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,74 @@
+/*
+ * kernel/workqueue_internal.h
+ *
+ * Workqueue internal header file. Only to be included by workqueue and
+ * core kernel subsystems.
+ */
+#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
+#define _KERNEL_WORKQUEUE_INTERNAL_H
+
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+
+struct worker_pool;
+
+/*
+ * The poor guys doing the actual heavy lifting. All on-duty workers are
+ * either serving the manager role, on idle list or on busy hash. For
+ * details on the locking annotation (L, I, X...), refer to workqueue.c.
+ *
+ * Only to be used in workqueue and async.
+ */
+struct worker {
+ /* on idle list while idle, on busy hash table while busy */
+ union {
+ struct list_head entry; /* L: while idle */
+ struct hlist_node hentry; /* L: while busy */
+ };
+
+ struct work_struct *current_work; /* L: work being processed */
+ work_func_t current_func; /* L: current_work's fn */
+ struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ bool desc_valid; /* ->desc is valid */
+ struct list_head scheduled; /* L: scheduled works */
+
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+
+ struct task_struct *task; /* I: worker task */
+ struct worker_pool *pool; /* I: the associated pool */
+ /* L: for rescuers */
+ struct list_head node; /* A: anchored at pool->workers */
+ /* A: runs through worker->node */
+
+ unsigned long last_active; /* L: last active timestamp */
+ unsigned int flags; /* X: flags */
+ int id; /* I: worker id */
+
+ /*
+ * Opaque string set with work_set_desc(). Printed out with task
+ * dump for debugging - WARN, BUG, panic or sysrq.
+ */
+ char desc[WORKER_DESC_LEN];
+
+ /* used only by rescuers to point to the target workqueue */
+ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
+};
+
+/**
+ * current_wq_worker - return struct worker if %current is a workqueue worker
+ */
+static inline struct worker *current_wq_worker(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return kthread_data(current);
+ return NULL;
+}
+
+/*
+ * Scheduler hooks for concurrency managed workqueue. Only to be used from
+ * sched/core.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
+
+#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc7..00000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * kernel/workqueue_sched.h
- *
- * Scheduler hooks for concurrency managed workqueue. Only to be
- * included from sched.c and workqueue.c.
- */
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
- unsigned int cpu);