aboutsummaryrefslogtreecommitdiff
path: root/arch/blackfin/include/asm/pci.h
diff options
context:
space:
mode:
Diffstat (limited to 'arch/blackfin/include/asm/pci.h')
-rw-r--r--arch/blackfin/include/asm/pci.h13
1 files changed, 13 insertions, 0 deletions
diff --git a/arch/blackfin/include/asm/pci.h b/arch/blackfin/include/asm/pci.h
new file mode 100644
index 00000000000..c737909fba4
--- /dev/null
+++ b/arch/blackfin/include/asm/pci.h
@@ -0,0 +1,13 @@
+/* Changed from asm-m68k version, Lineo Inc. May 2001 */
+
+#ifndef _ASM_BFIN_PCI_H
+#define _ASM_BFIN_PCI_H
+
+#include <asm/scatterlist.h>
+#include <asm-generic/pci-dma-compat.h>
+#include <asm-generic/pci.h>
+
+#define PCIBIOS_MIN_IO 0x00001000
+#define PCIBIOS_MIN_MEM 0x10000000
+
+#endif /* _ASM_BFIN_PCI_H */
'upd'>kernel/async.c167
-rw-r--r--kernel/audit.c1214
-rw-r--r--kernel/audit.h193
-rw-r--r--kernel/audit_tree.c86
-rw-r--r--kernel/audit_watch.c60
-rw-r--r--kernel/auditfilter.c615
-rw-r--r--kernel/auditsc.c1202
-rw-r--r--kernel/backtracetest.c18
-rw-r--r--kernel/bounds.c6
-rw-r--r--kernel/capability.c75
-rw-r--r--kernel/cgroup.c7227
-rw-r--r--kernel/cgroup_freezer.c590
-rw-r--r--kernel/compat.c491
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/context_tracking.c216
-rw-r--r--kernel/cpu.c300
-rw-r--r--kernel/cpu_pm.c16
-rw-r--r--kernel/cpuset.c1759
-rw-r--r--kernel/cred.c216
-rw-r--r--kernel/debug/debug_core.c175
-rw-r--r--kernel/debug/debug_core.h3
-rw-r--r--kernel/debug/gdbstub.c14
-rw-r--r--kernel/debug/kdb/kdb_bp.c27
-rw-r--r--kernel/debug/kdb/kdb_bt.c5
-rw-r--r--kernel/debug/kdb/kdb_debugger.c32
-rw-r--r--kernel/debug/kdb/kdb_io.c50
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c95
-rw-r--r--kernel/debug/kdb/kdb_main.c282
-rw-r--r--kernel/debug/kdb/kdb_private.h12
-rw-r--r--kernel/debug/kdb/kdb_support.c4
-rw-r--r--kernel/delayacct.c14
-rw-r--r--kernel/dma.c1
-rw-r--r--kernel/elfcore.c10
-rw-r--r--kernel/events/Makefile3
-rw-r--r--kernel/events/callchain.c38
-rw-r--r--kernel/events/core.c2378
-rw-r--r--kernel/events/hw_breakpoint.c225
-rw-r--r--kernel/events/internal.h108
-rw-r--r--kernel/events/ring_buffer.c152
-rw-r--r--kernel/events/uprobes.c1993
-rw-r--r--kernel/exec_domain.c14
-rw-r--r--kernel/exit.c486
-rw-r--r--kernel/extable.c12
-rw-r--r--kernel/fork.c615
-rw-r--r--kernel/freezer.c37
-rw-r--r--kernel/futex.c732
-rw-r--r--kernel/futex_compat.c57
-rw-r--r--kernel/gcov/Kconfig32
-rw-r--r--kernel/gcov/Makefile32
-rw-r--r--kernel/gcov/base.c38
-rw-r--r--kernel/gcov/fs.c54
-rw-r--r--kernel/gcov/gcc_3_4.c115
-rw-r--r--kernel/gcov/gcc_4_7.c565
-rw-r--r--kernel/gcov/gcov.h65
-rw-r--r--kernel/groups.c66
-rw-r--r--kernel/hrtimer.c178
-rw-r--r--kernel/hung_task.c58
-rw-r--r--kernel/irq/Kconfig37
-rw-r--r--kernel/irq/autoprobe.c4
-rw-r--r--kernel/irq/chip.c154
-rw-r--r--kernel/irq/debug.h38
-rw-r--r--kernel/irq/devres.c45
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/generic-chip.c314
-rw-r--r--kernel/irq/handle.c40
-rw-r--r--kernel/irq/internals.h24
-rw-r--r--kernel/irq/irqdesc.c103
-rw-r--r--kernel/irq/irqdomain.c694
-rw-r--r--kernel/irq/manage.c486
-rw-r--r--kernel/irq/migration.c9
-rw-r--r--kernel/irq/pm.c9
-rw-r--r--kernel/irq/proc.c30
-rw-r--r--kernel/irq/resend.c15
-rw-r--r--kernel/irq/settings.h7
-rw-r--r--kernel/irq/spurious.c125
-rw-r--r--kernel/irq_work.c154
-rw-r--r--kernel/itimer.c8
-rw-r--r--kernel/jump_label.c142
-rw-r--r--kernel/kallsyms.c69
-rw-r--r--kernel/kcmp.c197
-rw-r--r--kernel/kexec.c359
-rw-r--r--kernel/kfifo.c608
-rw-r--r--kernel/kmod.c357
-rw-r--r--kernel/kprobes.c830
-rw-r--r--kernel/ksysfs.c33
-rw-r--r--kernel/kthread.c388
-rw-r--r--kernel/latencytop.c5
-rw-r--r--kernel/locking/Makefile28
-rw-r--r--kernel/locking/lglock.c89
-rw-r--r--kernel/locking/lockdep.c (renamed from kernel/lockdep.c)140
-rw-r--r--kernel/locking/lockdep_internals.h (renamed from kernel/lockdep_internals.h)6
-rw-r--r--kernel/locking/lockdep_proc.c (renamed from kernel/lockdep_proc.c)17
-rw-r--r--kernel/locking/lockdep_states.h (renamed from kernel/lockdep_states.h)0
-rw-r--r--kernel/locking/locktorture.c454
-rw-r--r--kernel/locking/mcs_spinlock.c210
-rw-r--r--kernel/locking/mcs_spinlock.h130
-rw-r--r--kernel/locking/mutex-debug.c (renamed from kernel/mutex-debug.c)22
-rw-r--r--kernel/locking/mutex-debug.h (renamed from kernel/mutex-debug.h)0
-rw-r--r--kernel/locking/mutex.c930
-rw-r--r--kernel/locking/mutex.h (renamed from kernel/mutex.h)0
-rw-r--r--kernel/locking/percpu-rwsem.c165
-rw-r--r--kernel/locking/qrwlock.c133
-rw-r--r--kernel/locking/rtmutex-debug.c (renamed from kernel/rtmutex-debug.c)9
-rw-r--r--kernel/locking/rtmutex-debug.h (renamed from kernel/rtmutex-debug.h)5
-rw-r--r--kernel/locking/rtmutex-tester.c (renamed from kernel/rtmutex-tester.c)6
-rw-r--r--kernel/locking/rtmutex.c (renamed from kernel/rtmutex.c)447
-rw-r--r--kernel/locking/rtmutex.h (renamed from kernel/rtmutex.h)5
-rw-r--r--kernel/locking/rtmutex_common.h (renamed from kernel/rtmutex_common.h)23
-rw-r--r--kernel/locking/rwsem-spinlock.c296
-rw-r--r--kernel/locking/rwsem-xadd.c513
-rw-r--r--kernel/locking/rwsem.c (renamed from kernel/rwsem.c)58
-rw-r--r--kernel/locking/semaphore.c (renamed from kernel/semaphore.c)10
-rw-r--r--kernel/locking/spinlock.c (renamed from kernel/spinlock.c)16
-rw-r--r--kernel/locking/spinlock_debug.c302
-rw-r--r--kernel/module-internal.h12
-rw-r--r--kernel/module.c1102
-rw-r--r--kernel/module_signing.c250
-rw-r--r--kernel/mutex.c500
-rw-r--r--kernel/notifier.c24
-rw-r--r--kernel/nsproxy.c100
-rw-r--r--kernel/padata.c93
-rw-r--r--kernel/panic.c108
-rw-r--r--kernel/params.c169
-rw-r--r--kernel/pid.c116
-rw-r--r--kernel/pid_namespace.c223
-rw-r--r--kernel/posix-cpu-timers.c804
-rw-r--r--kernel/posix-timers.c132
-rw-r--r--kernel/power/Kconfig75
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/autosleep.c128
-rw-r--r--kernel/power/block_io.c2
-rw-r--r--kernel/power/console.c118
-rw-r--r--kernel/power/hibernate.c257
-rw-r--r--kernel/power/main.c287
-rw-r--r--kernel/power/power.h51
-rw-r--r--kernel/power/poweroff.c4
-rw-r--r--kernel/power/process.c84
-rw-r--r--kernel/power/qos.c198
-rw-r--r--kernel/power/snapshot.c79
-rw-r--r--kernel/power/suspend.c272
-rw-r--r--kernel/power/suspend_test.c35
-rw-r--r--kernel/power/swap.c172
-rw-r--r--kernel/power/user.c70
-rw-r--r--kernel/power/wakelock.c268
-rw-r--r--kernel/printk.c1762
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/braille.c49
-rw-r--r--kernel/printk/braille.h48
-rw-r--r--kernel/printk/console_cmdline.h14
-rw-r--r--kernel/printk/printk.c3009
-rw-r--r--kernel/profile.c85
-rw-r--r--kernel/ptrace.c327
-rw-r--r--kernel/range.c22
-rw-r--r--kernel/rcu/Makefile6
-rw-r--r--kernel/rcu/rcu.h (renamed from kernel/rcu.h)60
-rw-r--r--kernel/rcu/rcutorture.c1707
-rw-r--r--kernel/rcu/srcu.c693
-rw-r--r--kernel/rcu/tiny.c (renamed from kernel/rcutiny.c)123
-rw-r--r--kernel/rcu/tiny_plugin.h174
-rw-r--r--kernel/rcu/tree.c3744
-rw-r--r--kernel/rcu/tree.h (renamed from kernel/rcutree.h)285
-rw-r--r--kernel/rcu/tree_plugin.h2854
-rw-r--r--kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c)385
-rw-r--r--kernel/rcu/update.c (renamed from kernel/rcupdate.c)268
-rw-r--r--kernel/rcutiny_plugin.h1073
-rw-r--r--kernel/rcutorture.c1833
-rw-r--r--kernel/rcutree.c2289
-rw-r--r--kernel/rcutree_plugin.h2199
-rw-r--r--kernel/reboot.c433
-rw-r--r--kernel/relay.c33
-rw-r--r--kernel/res_counter.c148
-rw-r--r--kernel/resource.c305
-rw-r--r--kernel/sched/Makefile9
-rw-r--r--kernel/sched/auto_group.c14
-rw-r--r--kernel/sched/clock.c136
-rw-r--r--kernel/sched/completion.c299
-rw-r--r--kernel/sched/core.c5003
-rw-r--r--kernel/sched/cpuacct.c283
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.c229
-rw-r--r--kernel/sched/cpudeadline.h33
-rw-r--r--kernel/sched/cpupri.c32
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c838
-rw-r--r--kernel/sched/deadline.c1676
-rw-r--r--kernel/sched/debug.c255
-rw-r--r--kernel/sched/fair.c5041
-rw-r--r--kernel/sched/features.h53
-rw-r--r--kernel/sched/idle.c273
-rw-r--r--kernel/sched/idle_task.c13
-rw-r--r--kernel/sched/proc.c591
-rw-r--r--kernel/sched/rt.c554
-rw-r--r--kernel/sched/sched.h725
-rw-r--r--kernel/sched/stats.c80
-rw-r--r--kernel/sched/stats.h90
-rw-r--r--kernel/sched/stop_task.c39
-rw-r--r--kernel/sched/wait.c (renamed from kernel/wait.c)219
-rw-r--r--kernel/seccomp.c473
-rw-r--r--kernel/signal.c796
-rw-r--r--kernel/smp.c591
-rw-r--r--kernel/smpboot.c313
-rw-r--r--kernel/smpboot.h20
-rw-r--r--kernel/softirq.c553
-rw-r--r--kernel/srcu.c315
-rw-r--r--kernel/stop_machine.c460
-rw-r--r--kernel/sys.c1333
-rw-r--r--kernel/sys_ni.c12
-rw-r--r--kernel/sysctl.c921
-rw-r--r--kernel/sysctl_binary.c58
-rw-r--r--kernel/sysctl_check.c160
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c105
-rw-r--r--kernel/task_work.c128
-rw-r--r--kernel/taskstats.c98
-rw-r--r--kernel/test_kprobes.c2
-rw-r--r--kernel/time.c31
-rw-r--r--kernel/time/Kconfig197
-rw-r--r--kernel/time/Makefile9
-rw-r--r--kernel/time/alarmtimer.c201
-rw-r--r--kernel/time/clockevents.c412
-rw-r--r--kernel/time/clocksource.c314
-rw-r--r--kernel/time/jiffies.c46
-rw-r--r--kernel/time/ntp.c311
-rw-r--r--kernel/time/ntp_internal.h12
-rw-r--r--kernel/time/sched_clock.c217
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c106
-rw-r--r--kernel/time/tick-broadcast.c509
-rw-r--r--kernel/time/tick-common.c244
-rw-r--r--kernel/time/tick-internal.h39
-rw-r--r--kernel/time/tick-sched.c709
-rw-r--r--kernel/time/timecompare.c193
-rw-r--r--kernel/time/timekeeping.c1277
-rw-r--r--kernel/time/timekeeping_debug.c74
-rw-r--r--kernel/time/timekeeping_internal.h14
-rw-r--r--kernel/time/timer_list.c113
-rw-r--r--kernel/time/timer_stats.c8
-rw-r--r--kernel/timeconst.bc108
-rw-r--r--kernel/timeconst.pl378
-rw-r--r--kernel/timer.c492
-rw-r--r--kernel/torture.c733
-rw-r--r--kernel/trace/Kconfig169
-rw-r--r--kernel/trace/Makefile15
-rw-r--r--kernel/trace/blktrace.c101
-rw-r--r--kernel/trace/ftrace.c1775
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c1472
-rw-r--r--kernel/trace/ring_buffer_benchmark.c6
-rw-r--r--kernel/trace/trace.c4102
-rw-r--r--kernel/trace/trace.h670
-rw-r--r--kernel/trace/trace_benchmark.c198
-rw-r--r--kernel/trace/trace_benchmark.h41
-rw-r--r--kernel/trace/trace_branch.c14
-rw-r--r--kernel/trace/trace_clock.c16
-rw-r--r--kernel/trace/trace_entries.h89
-rw-r--r--kernel/trace/trace_event_perf.c244
-rw-r--r--kernel/trace/trace_events.c1936
-rw-r--r--kernel/trace/trace_events_filter.c472
-rw-r--r--kernel/trace/trace_events_trigger.c1437
-rw-r--r--kernel/trace/trace_export.c84
-rw-r--r--kernel/trace/trace_functions.c530
-rw-r--r--kernel/trace/trace_functions_graph.c247
-rw-r--r--kernel/trace/trace_irqsoff.c176
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c1789
-rw-r--r--kernel/trace/trace_mmiotrace.c24
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c324
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_printk.c23
-rw-r--r--kernel/trace/trace_probe.c726
-rw-r--r--kernel/trace/trace_probe.h400
-rw-r--r--kernel/trace/trace_sched_switch.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c229
-rw-r--r--kernel/trace/trace_selftest.c459
-rw-r--r--kernel/trace/trace_stack.c127
-rw-r--r--kernel/trace/trace_stat.c43
-rw-r--r--kernel/trace/trace_syscalls.c301
-rw-r--r--kernel/trace/trace_uprobe.c1340
-rw-r--r--kernel/trace/trace_workqueue.c300
-rw-r--r--kernel/tracepoint.c726
-rw-r--r--kernel/tsacct.c56
-rw-r--r--kernel/uid16.c105
-rw-r--r--kernel/up.c69
-rw-r--r--kernel/user-return-notifier.c4
-rw-r--r--kernel/user.c74
-rw-r--r--kernel/user_namespace.c908
-rw-r--r--kernel/utsname.c38
-rw-r--r--kernel/utsname_sysctl.c13
-rw-r--r--kernel/watchdog.c455
-rw-r--r--kernel/workqueue.c5082
-rw-r--r--kernel/workqueue_internal.h74
-rw-r--r--kernel/workqueue_sched.h9
298 files changed, 78942 insertions, 40714 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f43..790d83c7d16 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,5 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b0..2a202a84675 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
default 1000 if HZ_1000
config SCHED_HRTICK
- def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+ def_bool HIGH_RES_TIMERS
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75..76768ee812b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
bool
+config UNINLINE_SPIN_UNLOCK
+ bool
+
#
# lock_* functions are inlined when:
# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -103,100 +106,134 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
+if !DEBUG_SPINLOCK
+
config INLINE_SPIN_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK
config INLINE_SPIN_TRYLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK_BH
config INLINE_SPIN_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
config INLINE_SPIN_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
config INLINE_SPIN_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
config INLINE_SPIN_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQSAVE
-
-config INLINE_SPIN_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
config INLINE_SPIN_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
config INLINE_READ_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_READ_TRYLOCK
config INLINE_READ_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
config INLINE_READ_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
config INLINE_READ_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
config INLINE_READ_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
config INLINE_WRITE_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_WRITE_TRYLOCK
config INLINE_WRITE_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
config INLINE_WRITE_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
config INLINE_WRITE_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
config INLINE_WRITE_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
+
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES
+ def_bool y
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26..3f9c97419f0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
select PREEMPT_COUNT
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e7..f2a8b6246ce 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,56 +2,50 @@
# Makefile for the linux kernel.
#
-obj-y = fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o cred.o \
- async.o range.o groups.o
+ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ extable.o params.o posix-timers.o \
+ kthread.o sys_ni.o posix-cpu-timers.o \
+ hrtimer.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+ async.o range.o groups.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
obj-y += sched/
+obj-y += locking/
obj-y += power/
+obj-y += printk/
+obj-y += irq/
+obj-y += rcu/
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
-obj-$(CONFIG_LOCKDEP) += lockdep.o
-ifeq ($(CONFIG_PROC_FS),y)
-obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
-endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -76,14 +70,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_TINY_RCU) += rcutiny.o
-obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -95,7 +82,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
-obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
@@ -107,6 +94,8 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
@@ -123,8 +112,112 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
+###############################################################################
+#
+# Roll all the X.509 certificates that we can find together and pull them into
+# the kernel so that they get loaded into the system trusted keyring during
+# boot.
+#
+# We look in the source root and the build root for all files whose name ends
+# in ".x509". Unfortunately, this will generate duplicate filenames, so we
+# have make canonicalise the pathnames and then sort them to discard the
+# duplicates.
+#
+###############################################################################
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+ $(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
+
+ifeq ($(X509_CERTIFICATES),)
+$(warning *** No X.509 certificates found ***)
+endif
+
+ifneq ($(wildcard $(obj)/.x509.list),)
+ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
+$(info X.509 certificate list changed)
+$(shell rm $(obj)/.x509.list)
+endif
+endif
+
+kernel/system_certificates.o: $(obj)/x509_certificate_list
+
+quiet_cmd_x509certs = CERTS $@
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+
+targets += $(obj)/x509_certificate_list
+$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
+ $(call if_changed,x509certs)
+
+targets += $(obj)/.x509.list
+$(obj)/.x509.list:
+ @echo $(X509_CERTIFICATES) >$@
+endif
+
+clean-files := x509_certificate_list .x509.list
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+ifndef CONFIG_MODULE_SIG_HASH
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and uses a hardware random"
+ @echo "### number generator if one is available."
+ @echo "###"
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv 2>&1
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "O = Magrathea"
+ @echo >>x509.genkey "CN = Glacier signing key"
+ @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b..808a86ff229 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
- res = act>0;
+ res = act > 0;
goto out;
}
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
}
}
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt;
@@ -201,11 +201,11 @@ static int acct_on(char *name)
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -260,9 +260,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- char *tmp = getname(name);
+ struct filename *tmp = getname(name);
if (IS_ERR(tmp))
- return (PTR_ERR(tmp));
+ return PTR_ERR(tmp);
error = acct_on(tmp);
putname(tmp);
} else {
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
/* we really need to bite the bullet and change layout */
- ac.ac_uid = orig_cred->uid;
- ac.ac_gid = orig_cred->gid;
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+ file_end_write(file);
out:
revert_creds(orig_cred);
}
@@ -566,6 +573,7 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -593,8 +601,9 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime += current->utime;
- pacct->ac_stime += current->stime;
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bb..61f023ce022 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,55 +57,48 @@ asynchronous and synchronous parts of the kernel.
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include "workqueue_internal.h"
+
static async_cookie_t next_cookie = 1;
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
struct async_entry {
- struct list_head list;
+ struct list_head domain_list;
+ struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
- async_func_ptr *func;
+ async_func_t func;
void *data;
- struct list_head *running;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct async_entry *entry;
-
- if (!list_empty(running)) {
- entry = list_first_entry(running,
- struct async_entry, list);
- return entry->cookie;
- }
+ struct list_head *pending;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
+ unsigned long flags;
- list_for_each_entry(entry, &async_pending, list)
- if (entry->running == running)
- return entry->cookie;
+ spin_lock_irqsave(&async_lock, flags);
- return next_cookie; /* "infinity" value */
-}
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
-static async_cookie_t lowest_in_progress(struct list_head *running)
-{
- unsigned long flags;
- async_cookie_t ret;
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
- spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
@@ -120,12 +113,7 @@ static void async_run_entry_fn(struct work_struct *work)
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
- /* 1) move self to the running queue */
- spin_lock_irqsave(&async_lock, flags);
- list_move_tail(&entry->list, entry->running);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 2) run (and print duration) */
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
(long long)entry->cookie,
@@ -142,21 +130,22 @@ static void async_run_entry_fn(struct work_struct *work)
(long long)ktime_to_ns(delta) >> 10);
}
- /* 3) remove self from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 4) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 5) wake up any waiters */
+ /* 4) wake up any waiters */
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -176,20 +165,31 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
- ptr(data, newcookie);
+ func(data, newcookie);
return newcookie;
}
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = ptr;
+ entry->func = func;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
/* schedule for execution */
queue_work(system_unbound_wq, &entry->work);
@@ -198,34 +198,34 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
/**
* async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -236,36 +236,51 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- do {
- async_synchronize_cookie(next_cookie);
- } while (!list_empty(&async_running) || !list_empty(&async_pending));
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+ spin_lock_irq(&async_lock);
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
+ domain->registered = 0;
+ spin_unlock_irq(&async_lock);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by @domain have been done.
*/
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, list);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie,
- struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t uninitialized_var(starttime), delta, endtime;
@@ -274,7 +289,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
@@ -296,6 +311,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->current_func == async_run_entry_fn;
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0..3ef2e0e797e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,14 +41,18 @@
* Example user-space utilities: http://people.redhat.com/sgrubb/audit/
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
-#include <asm/types.h>
+#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
#include <linux/audit.h>
@@ -58,9 +62,10 @@
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
-#include <linux/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
+#include <linux/pid_namespace.h>
+#include <net/netns/generic.h>
#include "audit.h"
@@ -74,37 +79,39 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-int audit_enabled;
-int audit_ever_enabled;
+u32 audit_enabled;
+u32 audit_ever_enabled;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static int audit_default;
+static u32 audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
-static int audit_failure = AUDIT_FAIL_PRINTK;
+static u32 audit_failure = AUDIT_FAIL_PRINTK;
/*
* If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
*/
int audit_pid;
-static int audit_nlk_pid;
+static __u32 audit_nlk_portid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
-static int audit_rate_limit;
+static u32 audit_rate_limit;
-/* Number of outstanding audit_buffers allowed. */
-static int audit_backlog_limit = 64;
-static int audit_backlog_wait_time = 60 * HZ;
-static int audit_backlog_wait_overflow = 0;
+/* Number of outstanding audit_buffers allowed.
+ * When set to zero, this means unlimited. */
+static u32 audit_backlog_limit = 64;
+#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
-uid_t audit_sig_uid = -1;
+kuid_t audit_sig_uid = INVALID_UID;
pid_t audit_sig_pid = -1;
u32 audit_sig_sid = 0;
@@ -119,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
+int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -137,6 +145,17 @@ static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
+ .mask = -1,
+ .features = 0,
+ .lock = 0,};
+
+static char *audit_feature_names[2] = {
+ "only_unset_loginuid",
+ "loginuid_immutable",
+};
+
+
/* Serialize requests from userspace. */
DEFINE_MUTEX(audit_cmd_mutex);
@@ -162,27 +181,27 @@ struct audit_buffer {
};
struct audit_reply {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff *skb;
};
-static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_pid = pid;
+ nlh->nlmsg_pid = portid;
}
}
void audit_panic(const char *message)
{
- switch (audit_failure)
- {
+ switch (audit_failure) {
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
if (printk_ratelimit())
- printk(KERN_ERR "audit: %s\n", message);
+ pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
/* test audit_pid since printk is always losey, why bother? */
@@ -253,9 +272,7 @@ void audit_log_lost(const char *message)
if (print) {
if (printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_lost=%d audit_rate_limit=%d "
- "audit_backlog_limit=%d\n",
+ pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
atomic_read(&audit_lost),
audit_rate_limit,
audit_backlog_limit);
@@ -263,39 +280,29 @@ void audit_log_lost(const char *message)
}
}
-static int audit_log_config_change(char *function_name, int new, int old,
- uid_t loginuid, u32 sessionid, u32 sid,
+static int audit_log_config_change(char *function_name, u32 new, u32 old,
int allow_changes)
{
struct audit_buffer *ab;
int rc = 0;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
- old, loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
-
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc) {
- audit_log_format(ab, " sid=%u", sid);
- allow_changes = 0; /* Something weird, deny request */
- } else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (unlikely(!ab))
+ return rc;
+ audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_session_info(ab);
+ rc = audit_log_task_context(ab);
+ if (rc)
+ allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
-static int audit_do_config_change(char *function_name, int *to_change,
- int new, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
- int allow_changes, rc = 0, old = *to_change;
+ int allow_changes, rc = 0;
+ u32 old = *to_change;
/* check if we are locked */
if (audit_enabled == AUDIT_LOCKED)
@@ -304,8 +311,7 @@ static int audit_do_config_change(char *function_name, int *to_change,
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
- rc = audit_log_config_change(function_name, new, old, loginuid,
- sessionid, sid, allow_changes);
+ rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
@@ -319,44 +325,43 @@ static int audit_do_config_change(char *function_name, int *to_change,
return rc;
}
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_rate_limit(u32 limit)
+{
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
+}
+
+static int audit_set_backlog_limit(u32 limit)
{
- return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_backlog_wait_time(u32 timeout)
{
- return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_wait_time",
+ &audit_backlog_wait_time, timeout);
}
-static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(u32 state)
{
int rc;
if (state < AUDIT_OFF || state > AUDIT_LOCKED)
return -EINVAL;
- rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
- loginuid, sessionid, sid);
-
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
-static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(u32 state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
- return audit_do_config_change("audit_failure", &audit_failure, state,
- loginuid, sessionid, sid);
+ return audit_do_config_change("audit_failure", &audit_failure, state);
}
/*
@@ -371,7 +376,8 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
static void audit_hold_skb(struct sk_buff *skb)
{
if (audit_default &&
- skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+ (!audit_backlog_limit ||
+ skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
skb_queue_tail(&audit_skb_hold_queue, skb);
else
kfree_skb(skb);
@@ -384,13 +390,13 @@ static void audit_hold_skb(struct sk_buff *skb)
static void audit_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- char *data = NLMSG_DATA(nlh);
+ char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE) {
if (printk_ratelimit())
- printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
- audit_log_lost("printk limit exceeded\n");
+ audit_log_lost("printk limit exceeded");
}
audit_hold_skb(skb);
@@ -401,12 +407,15 @@ static void kauditd_send_skb(struct sk_buff *skb)
int err;
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
- printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared\n");
- audit_pid = 0;
+ if (audit_pid) {
+ pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd disappeared");
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
} else
@@ -414,96 +423,132 @@ static void kauditd_send_skb(struct sk_buff *skb)
consume_skb(skb);
}
-static int kauditd_thread(void *dummy)
+/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, GFP_KERNEL);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+}
+
+/*
+ * flush_hold_queue - empty the hold queue if auditd appears
+ *
+ * If auditd just started, drain the queue of messages already
+ * sent to syslog/printk. Remember loss here is ok. We already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * If you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what i'm thinking today.
+ */
+static void flush_hold_queue(void)
{
struct sk_buff *skb;
+ if (!audit_default || !audit_pid)
+ return;
+
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (likely(!skb))
+ return;
+
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+
+ /*
+ * if auditd just disappeared but we
+ * dequeued an skb we need to drop ref
+ */
+ if (skb)
+ consume_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
set_freezable();
while (!kthread_should_stop()) {
- /*
- * if auditd just started drain the queue of messages already
- * sent to syslog/printk. remember loss here is ok. we already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
- *
- * if you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
- */
- if (audit_default && audit_pid) {
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (unlikely(skb)) {
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
- }
- }
+ struct sk_buff *skb;
+ DECLARE_WAITQUEUE(wait, current);
+
+ flush_hold_queue();
skb = skb_dequeue(&audit_skb_queue);
- wake_up(&audit_backlog_wait);
+
if (skb) {
+ if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
else
audit_printk_skb(skb);
- } else {
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
-
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ continue;
}
- }
- return 0;
-}
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
-{
- struct task_struct *tsk;
- int err;
+ if (!skb_queue_len(&audit_skb_queue)) {
+ try_to_freeze();
+ schedule();
+ }
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- return -ESRCH;
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
}
- get_task_struct(tsk);
- rcu_read_unlock();
- err = tty_audit_push_task(tsk, loginuid, sessionid);
- put_task_struct(tsk);
- return err;
+ return 0;
}
int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
- int pid = dest->pid;
struct sk_buff *skb;
+ struct net *net = dest->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(audit_sock, skb, pid, 0);
+ netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
}
-struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
int multi, const void *payload, int size)
{
struct sk_buff *skb;
@@ -516,33 +561,37 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
if (!skb)
return NULL;
- nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
- data = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ if (!nlh)
+ goto out_kfree_skb;
+ data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_NEW */
- if (skb)
- kfree_skb(skb);
+out_kfree_skb:
+ kfree_skb(skb);
return NULL;
}
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
+ struct net *net = reply->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
- * @pid: process id to send reply to
+ * @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
@@ -550,12 +599,14 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(int pid, int seq, int type, int done, int multi,
- const void *payload, int size)
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -564,11 +615,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi,
if (!reply)
return;
- skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
+ skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
if (!skb)
goto out;
- reply->pid = pid;
+ reply->net = get_net(net);
+ reply->portid = portid;
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -587,27 +639,49 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
+ /* Only support initial user namespace for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
+ if (current_user_ns() != &init_user_ns)
+ return -ECONNREFUSED;
+
switch (msg_type) {
- case AUDIT_GET:
case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- case AUDIT_SET:
case AUDIT_ADD:
- case AUDIT_ADD_RULE:
case AUDIT_DEL:
+ return -EOPNOTSUPP;
+ case AUDIT_GET:
+ case AUDIT_SET:
+ case AUDIT_GET_FEATURE:
+ case AUDIT_SET_FEATURE:
+ case AUDIT_LIST_RULES:
+ case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (!capable(CAP_AUDIT_CONTROL))
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if ((task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (!capable(CAP_AUDIT_WRITE))
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -617,45 +691,125 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
- u32 pid, u32 uid, uid_t auid, u32 ses,
- u32 sid)
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
- char *ctx = NULL;
- u32 len;
+ uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
- if (!audit_enabled) {
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return rc;
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
- audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
- pid, uid, auid, ses);
- if (sid) {
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc)
- audit_log_format(*ab, " ssid=%u", sid);
- else {
- audit_log_format(*ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
+ if (unlikely(!*ab))
+ return rc;
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_session_info(*ab);
+ audit_log_task_context(*ab);
+
+ return rc;
+}
+
+int is_audit_feature_set(int i)
+{
+ return af.features & AUDIT_FEATURE_TO_MASK(i);
+}
+
+
+static int audit_get_feature(struct sk_buff *skb)
+{
+ u32 seq;
+
+ seq = nlmsg_hdr(skb)->nlmsg_seq;
+
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
+
+ return 0;
+}
+
+static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
+ u32 old_lock, u32 new_lock, int res)
+{
+ struct audit_buffer *ab;
+
+ if (audit_enabled == AUDIT_OFF)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_feature_names[which], !!old_feature, !!new_feature,
+ !!old_lock, !!new_lock, res);
+ audit_log_end(ab);
+}
+
+static int audit_set_feature(struct sk_buff *skb)
+{
+ struct audit_features *uaf;
+ int i;
+
+ BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
+ uaf = nlmsg_data(nlmsg_hdr(skb));
+
+ /* if there is ever a version 2 we should handle that here */
+
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
+
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+ old_lock = af.lock & feature;
+
+ /* are we changing a locked feature? */
+ if (old_lock && (new_feature != old_feature)) {
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 0);
+ return -EPERM;
}
}
+ /* nothing invalid, do the changes */
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
- return rc;
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ old_lock = af.lock & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+
+ if (new_feature != old_feature)
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 1);
+
+ if (new_feature)
+ af.features |= feature;
+ else
+ af.features &= ~feature;
+ af.lock |= new_lock;
+ }
+
+ return 0;
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- u32 uid, pid, seq, sid;
+ u32 seq;
void *data;
- struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
- uid_t loginuid; /* loginuid of sender */
- u32 sessionid;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
@@ -666,70 +820,90 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
- pid = NETLINK_CREDS(skb)->pid;
- uid = NETLINK_CREDS(skb)->uid;
- loginuid = audit_get_loginuid(current);
- sessionid = audit_get_sessionid(current);
- security_task_getsecid(current, &sid);
seq = nlh->nlmsg_seq;
- data = NLMSG_DATA(nlh);
+ data = nlmsg_data(nlh);
switch (msg_type) {
- case AUDIT_GET:
- status_set.enabled = audit_enabled;
- status_set.failure = audit_failure;
- status_set.pid = audit_pid;
- status_set.rate_limit = audit_rate_limit;
- status_set.backlog_limit = audit_backlog_limit;
- status_set.lost = atomic_read(&audit_lost);
- status_set.backlog = skb_queue_len(&audit_skb_queue);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
- &status_set, sizeof(status_set));
+ case AUDIT_GET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
+ s.pid = audit_pid;
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_skb_queue);
+ s.version = AUDIT_VERSION_LATEST;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
- case AUDIT_SET:
- if (nlh->nlmsg_len < sizeof(struct audit_status))
- return -EINVAL;
- status_get = (struct audit_status *)data;
- if (status_get->mask & AUDIT_STATUS_ENABLED) {
- err = audit_set_enabled(status_get->enabled,
- loginuid, sessionid, sid);
+ }
+ case AUDIT_SET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ if (s.mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(s.enabled);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_FAILURE) {
- err = audit_set_failure(status_get->failure,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(s.failure);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_PID) {
- int new_pid = status_get->pid;
+ if (s.mask & AUDIT_STATUS_PID) {
+ int new_pid = s.pid;
+ if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+ return -EACCES;
if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid,
- audit_pid, loginuid,
- sessionid, sid, 1);
-
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_pid = new_pid;
- audit_nlk_pid = NETLINK_CB(skb).pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
+ audit_sock = skb->sk;
+ }
+ if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
+ err = audit_set_rate_limit(s.rate_limit);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
+ err = audit_set_backlog_limit(s.backlog_limit);
+ if (err < 0)
+ return err;
}
- if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
- err = audit_set_rate_limit(status_get->rate_limit,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
+ if (sizeof(s) > (size_t)nlh->nlmsg_len)
+ return -EINVAL;
+ if (s.backlog_wait_time < 0 ||
+ s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
+ return -EINVAL;
+ err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
- err = audit_set_backlog_limit(status_get->backlog_limit,
- loginuid, sessionid, sid);
+ break;
+ }
+ case AUDIT_GET_FEATURE:
+ err = audit_get_feature(skb);
+ if (err)
+ return err;
+ break;
+ case AUDIT_SET_FEATURE:
+ err = audit_set_feature(skb);
+ if (err)
+ return err;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -737,79 +911,54 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(&NETLINK_CB(skb));
- if (err == 1) {
+ err = audit_filter_user(msg_type);
+ if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = audit_prepare_user_tty(pid, loginuid,
- sessionid);
+ err = tty_audit_push_current();
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type, pid, uid,
- loginuid, sessionid, sid);
-
+ mutex_unlock(&audit_cmd_mutex);
+ audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
- audit_log_format(ab, " msg='%.1024s'",
+ audit_log_format(ab, " msg='%.*s'",
+ AUDIT_MESSAGE_TEXT_MAX,
(char *)data);
else {
int size;
- audit_log_format(ab, " msg=");
+ audit_log_format(ab, " data=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
size--;
audit_log_n_untrustedstring(ab, data, size);
}
- audit_set_pid(ab, pid);
+ audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
+ mutex_lock(&audit_cmd_mutex);
}
break;
- case AUDIT_ADD:
- case AUDIT_DEL:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule))
- return -EINVAL;
- if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
- audit_log_end(ab);
- return -EPERM;
- }
- /* fallthrough */
- case AUDIT_LIST:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
- break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
audit_log_end(ab);
return -EPERM;
}
- /* fallthrough */
+ err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh));
+ break;
case AUDIT_LIST_RULES:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
+ err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
-
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -839,8 +988,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
@@ -865,53 +1013,56 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
return -ENOMEM;
}
- sig_data->uid = audit_sig_uid;
+ sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk;
- unsigned long flags;
-
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk && lock_task_sighand(tsk, &flags)) {
- s.enabled = tsk->signal->audit_tty != 0;
- unlock_task_sighand(tsk, &flags);
- } else
- err = -ESRCH;
- rcu_read_unlock();
-
- if (!err)
- audit_send_reply(NETLINK_CB(skb).pid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ struct task_struct *tsk = current;
+
+ spin_lock(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty;
+ s.log_passwd = tsk->signal->audit_tty_log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
- struct audit_tty_status *s;
- struct task_struct *tsk;
- unsigned long flags;
+ struct audit_tty_status s, old;
+ struct task_struct *tsk = current;
+ struct audit_buffer *ab;
+
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ /* check if new data is valid */
+ if ((s.enabled != 0 && s.enabled != 1) ||
+ (s.log_passwd != 0 && s.log_passwd != 1))
+ err = -EINVAL;
+
+ spin_lock(&tsk->sighand->siglock);
+ old.enabled = tsk->signal->audit_tty;
+ old.log_passwd = tsk->signal->audit_tty_log_passwd;
+ if (!err) {
+ tsk->signal->audit_tty = s.enabled;
+ tsk->signal->audit_tty_log_passwd = s.log_passwd;
+ }
+ spin_unlock(&tsk->sighand->siglock);
- if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
- return -EINVAL;
- s = data;
- if (s->enabled != 0 && s->enabled != 1)
- return -EINVAL;
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk && lock_task_sighand(tsk, &flags)) {
- tsk->signal->audit_tty = s->enabled != 0;
- unlock_task_sighand(tsk, &flags);
- } else
- err = -ESRCH;
- rcu_read_unlock();
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
+ " old-log_passwd=%d new-log_passwd=%d res=%d",
+ old.enabled, s.enabled, old.log_passwd,
+ s.log_passwd, !err);
+ audit_log_end(ab);
break;
}
default:
@@ -930,7 +1081,7 @@ static void audit_receive_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
- * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
@@ -939,13 +1090,13 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- while (NLMSG_OK(nlh, len)) {
+ while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- nlh = NLMSG_NEXT(nlh, len);
+ nlh = nlmsg_next(nlh, &len);
}
}
@@ -957,6 +1108,56 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
+static int __net_init audit_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
+ };
+
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->nlsk == NULL) {
+ audit_panic("cannot initialize netlink socket in namespace");
+ return -ENOMEM;
+ }
+ aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ return 0;
+}
+
+static void __net_exit audit_net_exit(struct net *net)
+{
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+ if (sock == audit_sock) {
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
+
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
+ synchronize_net();
+ netlink_kernel_release(sock);
+}
+
+static struct pernet_operations audit_net_ops __net_initdata = {
+ .init = audit_net_init,
+ .exit = audit_net_exit,
+ .id = &audit_net_id,
+ .size = sizeof(struct audit_net),
+};
+
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
@@ -965,14 +1166,9 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
- audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
- audit_receive, NULL, THIS_MODULE);
- if (!audit_sock)
- audit_panic("cannot initialize netlink socket");
- else
- audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ pr_info("initializing netlink subsys (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ register_pernet_subsys(&audit_net_ops);
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
@@ -996,22 +1192,32 @@ static int __init audit_enable(char *str)
if (!audit_default)
audit_initialized = AUDIT_DISABLED;
- printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+ pr_info("%s\n", audit_default ?
+ "enabled (after initialization)" : "disabled (until reboot)");
- if (audit_initialized == AUDIT_INITIALIZED) {
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
- } else if (audit_initialized == AUDIT_UNINITIALIZED) {
- printk(" (after initialization)");
- } else {
- printk(" (until reboot)");
+ return 1;
+}
+__setup("audit=", audit_enable);
+
+/* Process kernel command-line parameter at boot time.
+ * audit_backlog_limit=<n> */
+static int __init audit_backlog_limit_set(char *str)
+{
+ u32 audit_backlog_limit_arg;
+
+ pr_info("audit_backlog_limit: ");
+ if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
+ pr_cont("using default of %u, unable to parse %s\n",
+ audit_backlog_limit, str);
+ return 1;
}
- printk("\n");
+
+ audit_backlog_limit = audit_backlog_limit_arg;
+ pr_cont("%d\n", audit_backlog_limit);
return 1;
}
-
-__setup("audit=", audit_enable);
+__setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
@@ -1060,13 +1266,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
- goto nlmsg_failure;
+ goto err;
- nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+ nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+ if (!nlh)
+ goto out_kfree_skb;
return ab;
-nlmsg_failure: /* Used by NLMSG_NEW */
+out_kfree_skb:
kfree_skb(ab->skb);
ab->skb = NULL;
err:
@@ -1117,12 +1325,24 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
-/* Obtain an audit buffer. This routine does locking to obtain the
- * audit buffer, but then no locking is required for calls to
- * audit_log_*format. If the tsk is a task that is currently in a
- * syscall, then the syscall is marked as auditable and an audit record
- * will be written at syscall exit. If there is no associated task, tsk
- * should be NULL. */
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static long wait_for_auditd(long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ sleep_time = schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+
+ return sleep_time;
+}
/**
* audit_log_start - obtain an audit buffer
@@ -1145,7 +1365,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
struct audit_buffer *ab = NULL;
struct timespec t;
unsigned int uninitialized_var(serial);
- int reserve;
+ int reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
unsigned long timeout_start = jiffies;
if (audit_initialized != AUDIT_INITIALIZED)
@@ -1154,42 +1375,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT)
- reserve = 0;
- else
- reserve = 5; /* Allow atomic callers to go up to five
- entries over the normal backlog limit */
+ if (gfp_mask & __GFP_WAIT) {
+ if (audit_pid && audit_pid == current->pid)
+ gfp_mask &= ~__GFP_WAIT;
+ else
+ reserve = 0;
+ }
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
- && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
-
- /* Wait for auditd to drain the queue a little */
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
- continue;
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ long sleep_time;
+
+ sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+ if (sleep_time > 0) {
+ sleep_time = wait_for_auditd(sleep_time);
+ if (sleep_time > 0)
+ continue;
+ }
}
if (audit_rate_check() && printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_backlog=%d > "
- "audit_backlog_limit=%d\n",
- skb_queue_len(&audit_skb_queue),
- audit_backlog_limit);
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_skb_queue),
+ audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
audit_backlog_wait_time = audit_backlog_wait_overflow;
wake_up(&audit_backlog_wait);
return NULL;
}
+ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
@@ -1307,7 +1523,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
int i, avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
- static const unsigned char *hex = "0123456789ABCDEF";
if (!ab)
return;
@@ -1325,10 +1540,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
}
ptr = skb_tail_pointer(skb);
- for (i=0; i<len; i++) {
- *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
- *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
- }
+ for (i = 0; i < len; i++)
+ ptr = hex_byte_pack_upper(ptr, buf[i]);
*ptr = 0;
skb_put(skb, len << 1); /* new string is twice the old string */
}
@@ -1418,7 +1631,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
- struct path *path)
+ const struct path *path)
{
char *p, *pathname;
@@ -1440,6 +1653,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
kfree(pathname);
}
+void audit_log_session_info(struct audit_buffer *ab)
+{
+ unsigned int sessionid = audit_get_sessionid(current);
+ uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+
+ audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+}
+
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
@@ -1449,14 +1670,285 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i) {
+ audit_log_format(ab, "%08x",
+ cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ }
+}
+
+void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ kernel_cap_t *perm = &name->fcap.permitted;
+ kernel_cap_t *inh = &name->fcap.inheritable;
+ int log = 0;
+
+ if (!cap_isclear(*perm)) {
+ audit_log_cap(ab, "cap_fp", perm);
+ log = 1;
+ }
+ if (!cap_isclear(*inh)) {
+ audit_log_cap(ab, "cap_fi", inh);
+ log = 1;
+ }
+
+ if (log)
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
+}
+
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+void audit_log_name(struct audit_context *context, struct audit_names *n,
+ struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ audit_log_format(ab, " nametype=");
+ switch(n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, "NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, "PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, "DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, "CREATE");
+ break;
+ default:
+ audit_log_format(ab, "UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return 0;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return 0;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return error;
+}
+EXPORT_SYMBOL(audit_log_task_context);
+
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ const struct cred *cred;
+ char name[sizeof(tsk->comm)];
+ struct mm_struct *mm = tsk->mm;
+ char *tty;
+
+ if (!ab)
+ return;
+
+ /* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
+ from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ tty, audit_get_sessionid(tsk));
+
+ get_task_comm(name, tsk);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
+ audit_log_task_context(ab);
+}
+EXPORT_SYMBOL(audit_log_task_info);
+
+/**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+ struct audit_buffer *ab;
+ struct audit_names *name;
+
+ name = kzalloc(sizeof(*name), GFP_NOFS);
+ if (!name)
+ return;
+
+ /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_ANOM_LINK);
+ if (!ab)
+ goto out;
+ audit_log_format(ab, "op=%s", operation);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " res=0");
+ audit_log_end(ab);
+
+ /* Generate AUDIT_PATH record with object. */
+ name->type = AUDIT_TYPE_NORMAL;
+ audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+ audit_log_name(current->audit_context, name, link, 0, NULL);
+out:
+ kfree(name);
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is placed on a queue and a tasklet is scheduled to
- * remove them from the queue outside the irq context. May be called in
- * any context.
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1466,7 +1958,19 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
+ kauditd_send_multicast_skb(ab->skb);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
+ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 81676680337..7bb65730c89 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
+#include <uapi/linux/mqueue.h>
/* 0 = no checking
1 = put_count checking
@@ -29,6 +30,11 @@
*/
#define AUDIT_DEBUG 0
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
+
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
@@ -59,10 +65,167 @@ struct audit_entry {
struct audit_krule rule;
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
-extern int audit_ever_enabled;
+struct audit_cap_data {
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+ union {
+ unsigned int fE; /* effective bit of file cap */
+ kernel_cap_t effective; /* effective set of process */
+ };
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
+struct audit_names {
+ struct list_head list; /* audit_context->names_list */
+
+ struct filename *name;
+ int name_len; /* number of chars to log */
+ bool hidden; /* don't log this record */
+ bool name_put; /* call __putname()? */
+
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ unsigned char type; /* record type */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit.
+ */
+ bool should_free;
+};
+
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state, current_state;
+ unsigned int serial; /* serial number for record */
+ int major; /* syscall number */
+ struct timespec ctime; /* time of syscall entry */
+ unsigned long argv[4]; /* syscall arguments */
+ long return_code;/* syscall return code */
+ u64 prio;
+ int return_valid; /* return code is valid */
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* struct audit_names->list anchor */
+ char *filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+ struct sockaddr_storage *sockaddr;
+ size_t sockaddr_len;
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ kuid_t target_auid;
+ kuid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ struct list_head killed_trees;
+ int tree_count;
+
+ int type;
+ union {
+ struct {
+ int nargs;
+ long args[6];
+ } socketcall;
+ struct {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ u32 osid;
+ int has_perm;
+ uid_t perm_uid;
+ gid_t perm_gid;
+ umode_t perm_mode;
+ unsigned long qbytes;
+ } ipc;
+ struct {
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+ } mq_getsetattr;
+ struct {
+ mqd_t mqdes;
+ int sigev_signo;
+ } mq_notify;
+ struct {
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+ } mq_sendrecv;
+ struct {
+ int oflag;
+ umode_t mode;
+ struct mq_attr attr;
+ } mq_open;
+ struct {
+ pid_t pid;
+ struct audit_cap_data cap;
+ } capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
+ struct {
+ int argc;
+ } execve;
+ };
+ int fds[2];
+ struct audit_proctitle proctitle;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
#endif
+};
+
+extern u32 audit_ever_enabled;
+
+extern void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ const struct inode *inode);
+extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap);
+extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
+extern void audit_log_name(struct audit_context *context,
+ struct audit_names *n, struct path *path,
+ int record_num, int *call_panic);
extern int audit_pid;
@@ -74,22 +237,32 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen);
-extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
- int done, int multi,
- const void *payload, int size);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
+ int done, int multi,
+ const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff_head q;
};
int audit_send_list(void *);
+struct audit_net {
+ struct sock *nlsk;
+};
+
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
@@ -144,7 +317,7 @@ extern void audit_kill_trees(struct list_head *);
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e..135944a7b28 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,8 +249,7 @@ static void untag_chunk(struct node *p)
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
fsnotify_duplicate_mark(&new->mark, entry);
if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
- free_chunk(new);
+ fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -292,8 +291,8 @@ static void untag_chunk(struct node *p)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
entry = &chunk->mark;
if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
- free_chunk(chunk);
+ fsnotify_put_mark(entry);
return -ENOSPC;
}
@@ -332,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(entry);
return 0;
}
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
insert_hash(chunk);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ fsnotify_put_mark(entry); /* drop initial reference */
return 0;
}
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
fsnotify_duplicate_mark(chunk_entry, old_entry);
if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
- free_chunk(chunk);
+ fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
}
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(chunk_entry);
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(old_entry);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
+ fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
- fsnotify_put_mark(old_entry); /* and kill it */
return 0;
}
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
- struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove rule");
- audit_log_format(ab, " dir=");
- audit_log_untrustedstring(ab, rule->tree->pathname);
- audit_log_key(ab, rule->filterkey);
- audit_log_format(ab, " list=%d res=1", rule->listnr);
- audit_log_end(ab);
+ audit_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
@@ -595,7 +603,7 @@ void audit_trim_trees(void)
root_mnt = collect_mounts(&path);
path_put(&path);
- if (!root_mnt)
+ if (IS_ERR(root_mnt))
goto skip_it;
spin_lock(&hash_lock);
@@ -609,9 +617,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
@@ -650,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
struct vfsmount *mnt;
int err;
+ rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
@@ -669,8 +678,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
- if (!mnt) {
- err = -ENOMEM;
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto Err;
}
@@ -719,8 +728,8 @@ int audit_tag_tree(char *old, char *new)
return err;
tagged = collect_mounts(&path2);
path_put(&path2);
- if (!tagged)
- return -ENOMEM;
+ if (IS_ERR(tagged))
+ return PTR_ERR(tagged);
err = kern_path(old, 0, &path1);
if (err) {
@@ -903,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
}
static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmonut_mark,
- struct fsnotify_event *event)
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
{
- BUG();
- return -EOPNOTSUPP;
+ return 0;
}
static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -916,22 +926,16 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
evict_chunk(chunk);
- fsnotify_put_mark(entry);
-}
-static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- return false;
+ /*
+ * We are guaranteed to have at least one reference to the mark from
+ * either the inode or the caller of fsnotify_destroy_mark().
+ */
+ BUG_ON(atomic_read(&entry->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
.handle_event = audit_tree_handle_event,
- .should_send_event = audit_tree_send_event,
- .free_group_priv = NULL,
- .free_event_priv = NULL,
.freeing_mark = audit_tree_freeing_mark,
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d..70b4554d2fb 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,8 +240,10 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab, "auid=%u ses=%u op=",
- audit_get_loginuid(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
audit_log_string(ab, op);
audit_log_format(ab, " path=");
@@ -265,7 +267,8 @@ static void audit_update_watch(struct audit_parent *parent,
/* Run all of the watches on this parent looking for the one that
* matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
@@ -349,40 +352,21 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
}
mutex_unlock(&audit_filter_mutex);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct nameidata nd;
- struct dentry *d;
- int err;
-
- err = kern_path_parent(watch->path, &nd);
- if (err)
- return err;
-
- if (nd.last_type != LAST_NORM) {
- path_put(&nd.path);
- return -EINVAL;
- }
-
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
- if (IS_ERR(d)) {
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
+ struct dentry *d = kern_path_locked(watch->path, parent);
+ if (IS_ERR(d))
return PTR_ERR(d);
- }
+ mutex_unlock(&parent->dentry->d_inode->i_mutex);
if (d->d_inode) {
/* update watch filter fields */
watch->dev = d->d_inode->i_sb->s_dev;
watch->ino = d->d_inode->i_ino;
}
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-
- *parent = nd.path;
dput(d);
return 0;
}
@@ -475,41 +459,33 @@ void audit_remove_watch_rule(struct audit_krule *krule)
if (list_empty(&parent->watches)) {
audit_get_parent(parent);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
}
}
-static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- return true;
-}
-
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname, u32 cookie)
{
struct inode *inode;
- __u32 mask = event->mask;
- const char *dname = event->file_name;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
BUG_ON(group != audit_watch_group);
- switch (event->data_type) {
+ switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = event->path.dentry->d_inode;
+ inode = ((struct path *)data)->dentry->d_inode;
break;
case (FSNOTIFY_EVENT_INODE):
- inode = event->inode;
+ inode = (struct inode *)data;
break;
default:
BUG();
@@ -528,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .should_send_event = audit_watch_should_send_event,
.handle_event = audit_watch_handle_event,
- .free_group_priv = NULL,
- .freeing_mark = NULL,
- .free_event_priv = NULL,
};
static int __init audit_watch_init(void)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd20..8e9bc9c3dbb 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -29,6 +31,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -224,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
#endif
/* Common user-space to kernel rule translation. */
-static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
@@ -247,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
- printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -310,103 +314,84 @@ static u32 audit_to_op(u32 op)
return n;
}
-
-/* Translate struct audit_rule to kernel's rule respresentation.
- * Exists for backward compatibility with userspace. */
-static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+/* check if an audit field is valid */
+static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- struct audit_entry *entry;
- int err = 0;
- int i;
-
- entry = audit_to_entry_common(rule);
- if (IS_ERR(entry))
- goto exit_nofree;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &entry->rule.fields[i];
- u32 n;
-
- n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
-
- /* Support for legacy operators where
- * AUDIT_NEGATE bit signifies != and otherwise assumes == */
- if (n & AUDIT_NEGATE)
- f->op = Audit_not_equal;
- else if (!n)
- f->op = Audit_equal;
- else
- f->op = audit_to_op(n);
-
- entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
-
- f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
- f->val = rule->values[i];
-
- err = -EINVAL;
- if (f->op == Audit_bad)
- goto exit_free;
-
- switch(f->type) {
- default:
- goto exit_free;
- case AUDIT_PID:
- case AUDIT_UID:
- case AUDIT_EUID:
- case AUDIT_SUID:
- case AUDIT_FSUID:
- case AUDIT_GID:
- case AUDIT_EGID:
- case AUDIT_SGID:
- case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- /* bit ops are only useful on syscall args */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
- break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- break;
- /* arch is only allowed to be = or != */
- case AUDIT_ARCH:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
- goto exit_free;
- entry->rule.arch_f = f;
- break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if (f->val & ~S_IFMT)
- goto exit_free;
- break;
- case AUDIT_INODE:
- err = audit_to_inode(&entry->rule, f);
- if (err)
- goto exit_free;
- break;
- }
- }
-
- if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
- entry->rule.inode_f = NULL;
-
-exit_nofree:
- return entry;
+ switch(f->type) {
+ case AUDIT_MSGTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ entry->rule.listnr != AUDIT_FILTER_USER)
+ return -EINVAL;
+ break;
+ };
-exit_free:
- audit_free_rule(entry);
- return ERR_PTR(err);
+ switch(f->type) {
+ default:
+ return -EINVAL;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ case AUDIT_PID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_INODE:
+ /* bit ops are only useful on syscall args */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ return -EINVAL;
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_WATCH:
+ case AUDIT_DIR:
+ case AUDIT_FILTERKEY:
+ break;
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ /* FALL THROUGH */
+ case AUDIT_ARCH:
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ return -EINVAL;
+ break;
+ case AUDIT_FILETYPE:
+ if (f->val & ~S_IFMT)
+ return -EINVAL;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
+ return -EINVAL;
+ break;
+ };
+ return 0;
}
/* Translate struct audit_rule_data to kernel's rule respresentation. */
@@ -420,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
int i;
char *str;
- entry = audit_to_entry_common((struct audit_rule *)data);
+ entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
@@ -437,32 +422,54 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->type = data->fields[i];
f->val = data->values[i];
+ f->uid = INVALID_UID;
+ f->gid = INVALID_GID;
f->lsm_str = NULL;
f->lsm_rule = NULL;
- switch(f->type) {
- case AUDIT_PID:
+
+ /* Support legacy tests for a valid loginuid */
+ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ f->type = AUDIT_LOGINUID_SET;
+ f->val = 0;
+ }
+
+ if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
+ struct pid *pid;
+ rcu_read_lock();
+ pid = find_vpid(f->val);
+ if (!pid) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto exit_free;
+ }
+ f->val = pid_nr(pid);
+ rcu_read_unlock();
+ }
+
+ err = audit_field_valid(entry, f);
+ if (err)
+ goto exit_free;
+
+ err = -EINVAL;
+ switch (f->type) {
+ case AUDIT_LOGINUID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
+ case AUDIT_OBJ_UID:
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- case AUDIT_OBJ_UID:
case AUDIT_OBJ_GID:
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -487,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM "
- "\'%s\' is invalid\n", str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
err = 0;
}
if (err) {
@@ -534,20 +541,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if (f->val & ~S_IFMT)
- goto exit_free;
- break;
- case AUDIT_FIELD_COMPARE:
- if (f->val > AUDIT_MAX_FIELD_COMPARE)
- goto exit_free;
- break;
- default:
- goto exit_free;
}
}
@@ -558,6 +551,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -573,36 +570,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule.
- * Exists for backward compatibility with userspace. */
-static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
-{
- struct audit_rule *rule;
- int i;
-
- rule = kzalloc(sizeof(*rule), GFP_KERNEL);
- if (unlikely(!rule))
- return NULL;
-
- rule->flags = krule->flags | krule->listnr;
- rule->action = krule->action;
- rule->field_count = krule->field_count;
- for (i = 0; i < rule->field_count; i++) {
- rule->values[i] = krule->fields[i].val;
- rule->fields[i] = krule->fields[i].type;
-
- if (krule->vers_ops == 1) {
- if (krule->fields[i].op == Audit_not_equal)
- rule->fields[i] |= AUDIT_NEGATE;
- } else {
- rule->fields[i] |= audit_ops[krule->fields[i].op];
- }
- }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
-
- return rule;
-}
-
/* Translate kernel rule respresentation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
@@ -707,6 +674,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+ return 1;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -740,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM \'%s\' is "
- "invalid\n", df->lsm_str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
ret = 0;
}
@@ -899,6 +883,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
+ /*
+ * normally audit_add_tree_rule() will free it
+ * on failure
+ */
+ if (tree)
+ audit_put_tree(tree);
goto error;
}
}
@@ -998,37 +988,8 @@ out:
return ret;
}
-/* List rules using struct audit_rule. Exists for backward
- * compatibility with userspace. */
-static void audit_list(int pid, int seq, struct sk_buff_head *q)
-{
- struct sk_buff *skb;
- struct audit_krule *r;
- int i;
-
- /* This is a blocking read, so use audit_filter_mutex instead of rcu
- * iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
- list_for_each_entry(r, &audit_rules_list[i], list) {
- struct audit_rule *rule;
-
- rule = audit_krule_to_rule(r);
- if (unlikely(!rule))
- break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
- rule, sizeof(*rule));
- if (skb)
- skb_queue_tail(q, skb);
- kfree(rule);
- }
- }
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
- if (skb)
- skb_queue_tail(q, skb);
-}
-
/* List rules using struct audit_rule_data. */
-static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
@@ -1043,24 +1004,25 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
- data, sizeof(*data) + data->buflen);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
+ 0, 1, data,
+ sizeof(*data) + data->buflen);
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
/* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
- char *action, struct audit_krule *rule,
- int res)
+static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
+ uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+ unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1068,17 +1030,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_task_context(ab);
audit_log_format(ab, " op=");
audit_log_string(ab, action);
audit_log_key(ab, rule->filterkey);
@@ -1087,83 +1040,37 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
}
/**
- * audit_receive_filter - apply all rules to the specified message type
+ * audit_rule_change - apply all rules to the specified message type
* @type: audit message type
- * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
+ * @portid: target port id for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
*/
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
- size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+int audit_rule_change(int type, __u32 portid, int seq, void *data,
+ size_t datasz)
{
- struct task_struct *tsk;
- struct audit_netlink_list *dest;
int err = 0;
struct audit_entry *entry;
switch (type) {
- case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- /* We can't just spew out the rules here because we might fill
- * the available socket buffer space and deadlock waiting for
- * auditctl to read from it... which isn't ever going to
- * happen if we're actually running in the context of auditctl
- * trying to _send_ the stuff */
-
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
- if (!dest)
- return -ENOMEM;
- dest->pid = pid;
- skb_queue_head_init(&dest->q);
-
- mutex_lock(&audit_filter_mutex);
- if (type == AUDIT_LIST)
- audit_list(pid, seq, &dest->q);
- else
- audit_list_rules(pid, seq, &dest->q);
- mutex_unlock(&audit_filter_mutex);
-
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
- if (IS_ERR(tsk)) {
- skb_queue_purge(&dest->q);
- kfree(dest);
- err = PTR_ERR(tsk);
- }
- break;
- case AUDIT_ADD:
case AUDIT_ADD_RULE:
- if (type == AUDIT_ADD)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("add rule", &entry->rule, !err);
if (err)
audit_free_rule(entry);
break;
- case AUDIT_DEL:
case AUDIT_DEL_RULE:
- if (type == AUDIT_DEL)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("remove rule", &entry->rule, !err);
audit_free_rule(entry);
break;
default:
@@ -1173,6 +1080,46 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return err;
}
+/**
+ * audit_list_rules_send - list the audit rules
+ * @request_skb: skb of request we are replying to (used to target the reply)
+ * @seq: netlink audit message sequence (serial) number
+ */
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
+{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct task_struct *tsk;
+ struct audit_netlink_list *dest;
+ int err = 0;
+
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest->net = get_net(net);
+ dest->portid = portid;
+ skb_queue_head_init(&dest->q);
+
+ mutex_lock(&audit_filter_mutex);
+ audit_list_rules(portid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+
+ return err;
+}
+
int audit_comparator(u32 left, u32 op, u32 right)
{
switch (op) {
@@ -1198,69 +1145,142 @@ int audit_comparator(u32 left, u32 op, u32 right)
}
}
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen)
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
- int dlen, plen;
- const char *p;
+ switch (op) {
+ case Audit_equal:
+ return uid_eq(left, right);
+ case Audit_not_equal:
+ return !uid_eq(left, right);
+ case Audit_lt:
+ return uid_lt(left, right);
+ case Audit_le:
+ return uid_lte(left, right);
+ case Audit_gt:
+ return uid_gt(left, right);
+ case Audit_ge:
+ return uid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
- if (!dname || !path)
- return 1;
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return gid_eq(left, right);
+ case Audit_not_equal:
+ return !gid_eq(left, right);
+ case Audit_lt:
+ return gid_lt(left, right);
+ case Audit_le:
+ return gid_lte(left, right);
+ case Audit_gt:
+ return gid_gt(left, right);
+ case Audit_ge:
+ return gid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+ int plen;
+ const char *p;
- dlen = strlen(dname);
plen = strlen(path);
- if (plen < dlen)
- return 1;
+
+ if (plen == 0)
+ return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
- /* find last path component */
- p = p - dlen + 1;
- if (p < path)
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
return 1;
- else if (p > path) {
- if (*--p != '/')
- return 1;
- else
- p++;
- }
- /* return length of path's directory component */
- if (dirlen)
- *dirlen = p - path;
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
- struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule, int type,
enum audit_state *state)
{
int i;
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ pid_t pid;
int result = 0;
u32 sid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(cb->creds.pid, f->op, f->val);
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
- result = audit_comparator(cb->creds.uid, f->op, f->val);
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cb->creds.gid, f->op, f->val);
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(audit_get_loginuid(current),
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
f->op, f->val);
break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(type, f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -1287,23 +1307,26 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
return 1;
}
-int audit_filter_user(struct netlink_skb_parms *cb)
+int audit_filter_user(int type)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
- int ret = 1;
+ int rc, ret;
+
+ ret = 1; /* Audit by default */
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(cb, &e->rule, &state)) {
- if (state == AUDIT_DISABLED)
+ rc = audit_filter_user_rules(&e->rule, type, &state);
+ if (rc) {
+ if (rc > 0 && state == AUDIT_DISABLED)
ret = 0;
break;
}
}
rcu_read_unlock();
- return ret; /* Audit by default */
+ return ret;
}
int audit_filter_type(int type)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34ea..21eae3c05ec 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,6 +42,8 @@
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <asm/types.h>
#include <linux/atomic.h>
@@ -67,6 +69,8 @@
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
#include "audit.h"
@@ -75,59 +79,18 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). If we get more names we will allocate
- * a name dynamically and also add those to the list anchored by names_list. */
-#define AUDIT_NAMES 5
-
-/* Indicates that audit should log the full pathname. */
-#define AUDIT_NAME_FULL -1
-
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
/* number of audit rules */
int audit_n_rules;
/* determines whether we collect data for signals sent */
int audit_signals;
-struct audit_cap_data {
- kernel_cap_t permitted;
- kernel_cap_t inheritable;
- union {
- unsigned int fE; /* effective bit of a file capability */
- kernel_cap_t effective; /* effective set of a process */
- };
-};
-
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
- *
- * Further, in fs/namei.c:path_lookup() we store the inode and device. */
-struct audit_names {
- struct list_head list; /* audit_context->names_list */
- const char *name;
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- uid_t uid;
- gid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
- int name_len; /* number of name's characters to log */
- bool name_put; /* call __putname() for this name */
- /*
- * This was an allocated audit_names and not from the array of
- * names allocated in the task audit context. Thus this name
- * should be freed on syscall exit
- */
- bool should_free;
-};
-
struct audit_aux_data {
struct audit_aux_data *next;
int type;
@@ -138,18 +101,11 @@ struct audit_aux_data {
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
-struct audit_aux_data_execve {
- struct audit_aux_data d;
- int argc;
- int envc;
- struct mm_struct *mm;
-};
-
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
- uid_t target_auid[AUDIT_AUX_PIDS];
- uid_t target_uid[AUDIT_AUX_PIDS];
+ kuid_t target_auid[AUDIT_AUX_PIDS];
+ kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -164,118 +120,11 @@ struct audit_aux_data_bprm_fcaps {
struct audit_cap_data new_pcap;
};
-struct audit_aux_data_capset {
- struct audit_aux_data d;
- pid_t pid;
- struct audit_cap_data cap;
-};
-
struct audit_tree_refs {
struct audit_tree_refs *next;
struct audit_chunk *c[31];
};
-/* The per-task audit context. */
-struct audit_context {
- int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
- enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
- int major; /* syscall number */
- struct timespec ctime; /* time of syscall entry */
- unsigned long argv[4]; /* syscall arguments */
- long return_code;/* syscall return code */
- u64 prio;
- int return_valid; /* return code is valid */
- /*
- * The names_list is the list of all audit_names collected during this
- * syscall. The first AUDIT_NAMES entries in the names_list will
- * actually be from the preallocated_names array for performance
- * reasons. Except during allocation they should never be referenced
- * through the preallocated_names array and should only be found/used
- * by running the names_list.
- */
- struct audit_names preallocated_names[AUDIT_NAMES];
- int name_count; /* total records in names_list */
- struct list_head names_list; /* anchor for struct audit_names->list */
- char * filterkey; /* key for rule that triggered record */
- struct path pwd;
- struct audit_context *previous; /* For nested syscalls */
- struct audit_aux_data *aux;
- struct audit_aux_data *aux_pids;
- struct sockaddr_storage *sockaddr;
- size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
- uid_t uid, euid, suid, fsuid;
- gid_t gid, egid, sgid, fsgid;
- unsigned long personality;
- int arch;
-
- pid_t target_pid;
- uid_t target_auid;
- uid_t target_uid;
- unsigned int target_sessionid;
- u32 target_sid;
- char target_comm[TASK_COMM_LEN];
-
- struct audit_tree_refs *trees, *first_trees;
- struct list_head killed_trees;
- int tree_count;
-
- int type;
- union {
- struct {
- int nargs;
- long args[6];
- } socketcall;
- struct {
- uid_t uid;
- gid_t gid;
- umode_t mode;
- u32 osid;
- int has_perm;
- uid_t perm_uid;
- gid_t perm_gid;
- umode_t perm_mode;
- unsigned long qbytes;
- } ipc;
- struct {
- mqd_t mqdes;
- struct mq_attr mqstat;
- } mq_getsetattr;
- struct {
- mqd_t mqdes;
- int sigev_signo;
- } mq_notify;
- struct {
- mqd_t mqdes;
- size_t msg_len;
- unsigned int msg_prio;
- struct timespec abs_timeout;
- } mq_sendrecv;
- struct {
- int oflag;
- umode_t mode;
- struct mq_attr attr;
- } mq_open;
- struct {
- pid_t pid;
- struct audit_cap_data cap;
- } capset;
- struct {
- int fd;
- int flags;
- } mmap;
- };
- int fds[2];
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
-};
-
static inline int open_arg(int flags, int mask)
{
int n = ACC_MODE(flags);
@@ -463,37 +312,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
-static int audit_compare_id(uid_t uid1,
- struct audit_names *name,
- unsigned long name_offset,
- struct audit_field *f,
- struct audit_context *ctx)
+static int audit_compare_uid(kuid_t uid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
{
struct audit_names *n;
- unsigned long addr;
- uid_t uid2;
int rc;
-
- BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
-
+
if (name) {
- addr = (unsigned long)name;
- addr += name_offset;
-
- uid2 = *(uid_t *)addr;
- rc = audit_comparator(uid1, f->op, uid2);
+ rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- addr = (unsigned long)n;
- addr += name_offset;
-
- uid2 = *(uid_t *)addr;
+ rc = audit_uid_comparator(uid, f->op, n->uid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
- rc = audit_comparator(uid1, f->op, uid2);
+static int audit_compare_gid(kgid_t gid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_gid_comparator(gid, f->op, name->gid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_gid_comparator(gid, f->op, n->gid);
if (rc)
return rc;
}
@@ -510,80 +369,62 @@ static int audit_field_compare(struct task_struct *tsk,
switch (f->val) {
/* process to file object comparisons */
case AUDIT_COMPARE_UID_TO_OBJ_UID:
- return audit_compare_id(cred->uid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->uid, name, f, ctx);
case AUDIT_COMPARE_GID_TO_OBJ_GID:
- return audit_compare_id(cred->gid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->gid, name, f, ctx);
case AUDIT_COMPARE_EUID_TO_OBJ_UID:
- return audit_compare_id(cred->euid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->euid, name, f, ctx);
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
- return audit_compare_id(cred->egid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
- return audit_compare_id(tsk->loginuid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(tsk->loginuid, name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
- return audit_compare_id(cred->suid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
- return audit_compare_id(cred->sgid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->sgid, name, f, ctx);
case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
- return audit_compare_id(cred->fsuid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->fsuid, name, f, ctx);
case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
- return audit_compare_id(cred->fsgid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
- return audit_comparator(cred->uid, f->op, tsk->loginuid);
+ return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
case AUDIT_COMPARE_UID_TO_EUID:
- return audit_comparator(cred->uid, f->op, cred->euid);
+ return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
- return audit_comparator(cred->uid, f->op, cred->suid);
+ return audit_uid_comparator(cred->uid, f->op, cred->suid);
case AUDIT_COMPARE_UID_TO_FSUID:
- return audit_comparator(cred->uid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
- return audit_comparator(tsk->loginuid, f->op, cred->euid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
- return audit_comparator(tsk->loginuid, f->op, cred->suid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
- return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
- return audit_comparator(cred->euid, f->op, cred->suid);
+ return audit_uid_comparator(cred->euid, f->op, cred->suid);
case AUDIT_COMPARE_EUID_TO_FSUID:
- return audit_comparator(cred->euid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
/* suid comparisons */
case AUDIT_COMPARE_SUID_TO_FSUID:
- return audit_comparator(cred->suid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
/* gid comparisons */
case AUDIT_COMPARE_GID_TO_EGID:
- return audit_comparator(cred->gid, f->op, cred->egid);
+ return audit_gid_comparator(cred->gid, f->op, cred->egid);
case AUDIT_COMPARE_GID_TO_SGID:
- return audit_comparator(cred->gid, f->op, cred->sgid);
+ return audit_gid_comparator(cred->gid, f->op, cred->sgid);
case AUDIT_COMPARE_GID_TO_FSGID:
- return audit_comparator(cred->gid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
/* egid comparisons */
case AUDIT_COMPARE_EGID_TO_SGID:
- return audit_comparator(cred->egid, f->op, cred->sgid);
+ return audit_gid_comparator(cred->egid, f->op, cred->sgid);
case AUDIT_COMPARE_EGID_TO_FSGID:
- return audit_comparator(cred->egid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
/* sgid comparison */
case AUDIT_COMPARE_SGID_TO_FSGID:
- return audit_comparator(cred->sgid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
default:
WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
return 0;
@@ -616,41 +457,57 @@ static int audit_filter_rules(struct task_struct *tsk,
struct audit_field *f = &rule->fields[i];
struct audit_names *n;
int result = 0;
+ pid_t pid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(tsk->pid, f->op, f->val);
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
- ctx->ppid = sys_getppid();
+ ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
case AUDIT_UID:
- result = audit_comparator(cred->uid, f->op, f->val);
+ result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
case AUDIT_EUID:
- result = audit_comparator(cred->euid, f->op, f->val);
+ result = audit_uid_comparator(cred->euid, f->op, f->uid);
break;
case AUDIT_SUID:
- result = audit_comparator(cred->suid, f->op, f->val);
+ result = audit_uid_comparator(cred->suid, f->op, f->uid);
break;
case AUDIT_FSUID:
- result = audit_comparator(cred->fsuid, f->op, f->val);
+ result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cred->gid, f->op, f->val);
+ result = audit_gid_comparator(cred->gid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_group_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_group_p(f->gid);
+ }
break;
case AUDIT_EGID:
- result = audit_comparator(cred->egid, f->op, f->val);
+ result = audit_gid_comparator(cred->egid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_egroup_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_egroup_p(f->gid);
+ }
break;
case AUDIT_SGID:
- result = audit_comparator(cred->sgid, f->op, f->val);
+ result = audit_gid_comparator(cred->sgid, f->op, f->gid);
break;
case AUDIT_FSGID:
- result = audit_comparator(cred->fsgid, f->op, f->val);
+ result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
@@ -704,7 +561,7 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_INODE:
if (name)
- result = (name->ino == f->val);
+ result = audit_comparator(name->ino, f->op, f->val);
else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_comparator(n->ino, f->op, f->val)) {
@@ -716,10 +573,10 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_OBJ_UID:
if (name) {
- result = audit_comparator(name->uid, f->op, f->val);
+ result = audit_uid_comparator(name->uid, f->op, f->uid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (audit_comparator(n->uid, f->op, f->val)) {
+ if (audit_uid_comparator(n->uid, f->op, f->uid)) {
++result;
break;
}
@@ -728,10 +585,10 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_OBJ_GID:
if (name) {
- result = audit_comparator(name->gid, f->op, f->val);
+ result = audit_gid_comparator(name->gid, f->op, f->gid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (audit_comparator(n->gid, f->op, f->val)) {
+ if (audit_gid_comparator(n->gid, f->op, f->gid)) {
++result;
break;
}
@@ -749,7 +606,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID:
result = 0;
if (ctx)
- result = audit_comparator(tsk->loginuid, f->op, f->val);
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
@@ -868,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
return AUDIT_BUILD_CONTEXT;
}
+static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+{
+ int word, bit;
+
+ if (val > 0xffffffff)
+ return false;
+
+ word = AUDIT_WORD(val);
+ if (word >= AUDIT_BITMASK_SIZE)
+ return false;
+
+ bit = AUDIT_BIT(val);
+
+ return rule->mask[word] & bit;
+}
+
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
@@ -885,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
-
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
rcu_read_unlock();
@@ -909,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
struct audit_context *ctx) {
- int word, bit;
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
struct audit_entry *e;
enum audit_state state;
- word = AUDIT_WORD(ctx->major);
- bit = AUDIT_BIT(ctx->major);
-
if (list_empty(list))
return 0;
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
ctx->current_state = state;
return 1;
@@ -953,7 +822,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
int return_valid,
long return_code)
{
@@ -990,22 +860,30 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
return context;
}
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d"
- " ino_count=%d [NOT freeing]\n",
- __FILE__, __LINE__,
+ int i = 0;
+
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d ino_count=%d"
+ " [NOT freeing]\n", __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ pr_err("names[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -1019,7 +897,7 @@ static inline void audit_free_names(struct audit_context *context)
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
if (n->name && n->name_put)
- __putname(n->name);
+ final_putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1043,21 +921,15 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
-static inline void audit_zero_context(struct audit_context *context,
- enum audit_state state)
-{
- memset(context, 0, sizeof(*context));
- context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
- if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
return NULL;
- audit_zero_context(context, state);
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
return context;
@@ -1082,8 +954,10 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED)
+ if (state == AUDIT_DISABLED) {
+ clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
return 0;
+ }
if (!(context = audit_alloc_context(state))) {
kfree(key);
@@ -1099,91 +973,18 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
- struct audit_context *previous;
- int count = 0;
-
- do {
- previous = context->previous;
- if (previous || (count && count < 10)) {
- ++count;
- printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
- " freeing multiple contexts (%d)\n",
- context->serial, context->major,
- context->name_count, count);
- }
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- free_tree_refs(context);
- audit_free_aux(context);
- kfree(context->filterkey);
- kfree(context->sockaddr);
- kfree(context);
- context = previous;
- } while (context);
- if (count >= 10)
- printk(KERN_ERR "audit: freed %d contexts\n", count);
-}
-
-void audit_log_task_context(struct audit_buffer *ab)
-{
- char *ctx = NULL;
- unsigned len;
- int error;
- u32 sid;
-
- security_task_getsecid(current, &sid);
- if (!sid)
- return;
-
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
- return;
- }
-
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- return;
-
-error_path:
- audit_panic("error in audit_log_task_context");
- return;
-}
-
-EXPORT_SYMBOL(audit_log_task_context);
-
-static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
-{
- char name[sizeof(tsk->comm)];
- struct mm_struct *mm = tsk->mm;
- struct vm_area_struct *vma;
-
- /* tsk == current */
-
- get_task_comm(name, tsk);
- audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, name);
-
- if (mm) {
- down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) &&
- vma->vm_file) {
- audit_log_d_path(ab, " exe=",
- &vma->vm_file->f_path);
- break;
- }
- vma = vma->vm_next;
- }
- up_read(&mm->mmap_sem);
- }
- audit_log_task_context(ab);
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ free_tree_refs(context);
+ audit_free_aux(context);
+ kfree(context->filterkey);
+ kfree(context->sockaddr);
+ audit_proctitle_free(context);
+ kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- uid_t auid, uid_t uid, unsigned int sessionid,
+ kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
{
struct audit_buffer *ab;
@@ -1195,14 +996,17 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
if (!ab)
return rc;
- audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
- uid, sessionid);
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid), sessionid);
+ if (sid) {
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
}
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
@@ -1359,20 +1163,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,
}
static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab,
- struct audit_aux_data_execve *axi)
+ struct audit_buffer **ab)
{
int i, len;
size_t len_sent = 0;
const char __user *p;
char *buf;
- if (axi->mm != current->mm)
- return; /* execve failed, no additional info */
+ p = (const char __user *)current->mm->arg_start;
- p = (const char __user *)axi->mm->arg_start;
-
- audit_log_format(*ab, "argc=%d", axi->argc);
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1382,11 +1182,11 @@ static void audit_log_execve_info(struct audit_context *context,
*/
buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf) {
- audit_panic("out of memory for argv string\n");
+ audit_panic("out of memory for argv string");
return;
}
- for (i = 0; i < axi->argc; i++) {
+ for (i = 0; i < context->execve.argc; i++) {
len = audit_log_single_execve_arg(context, ab, i,
&len_sent, p, buf);
if (len <= 0)
@@ -1396,35 +1196,6 @@ static void audit_log_execve_info(struct audit_context *context,
kfree(buf);
}
-static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
-}
-
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1446,7 +1217,9 @@ static void show_special(struct audit_context *context, int *call_panic)
u32 osid = context->ipc.osid;
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
- context->ipc.uid, context->ipc.gid, context->ipc.mode);
+ from_kuid(&init_user_ns, context->ipc.uid),
+ from_kgid(&init_user_ns, context->ipc.gid),
+ context->ipc.mode);
if (osid) {
char *ctx = NULL;
u32 len;
@@ -1462,14 +1235,14 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab,
"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
- if (!ab)
- return;
}
break; }
case AUDIT_MQ_OPEN: {
@@ -1516,94 +1289,74 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break; }
+ case AUDIT_EXECVE: {
+ audit_log_execve_info(context, &ab);
+ break; }
}
audit_log_end(ab);
}
-static void audit_log_name(struct audit_context *context, struct audit_names *n,
- int record_num, int *call_panic)
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
- return; /* audit_panic has been called */
+ return; /* audit_panic or being filtered */
- audit_log_format(ab, "item=%d", record_num);
+ audit_log_format(ab, "proctitle=");
- if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
- n->name_len);
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
}
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- n->uid,
- n->gid,
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
}
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
}
-
- audit_log_fcaps(ab, n);
-
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
audit_log_end(ab);
}
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
- const struct cred *cred;
int i, call_panic = 0;
struct audit_buffer *ab;
struct audit_aux_data *aux;
- const char *tty;
struct audit_names *n;
/* tsk == current */
- context->pid = tsk->pid;
- if (!context->ppid)
- context->ppid = sys_getppid();
- cred = current_cred();
- context->uid = cred->uid;
- context->gid = cred->gid;
- context->euid = cred->euid;
- context->suid = cred->suid;
- context->fsuid = cred->fsuid;
- context->egid = cred->egid;
- context->sgid = cred->sgid;
- context->fsgid = cred->fsgid;
context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1618,32 +1371,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " ppid=%d pid=%d auid=%u uid=%u gid=%u"
- " euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count,
- context->ppid,
- context->pid,
- tsk->loginuid,
- context->uid,
- context->gid,
- context->euid, context->suid, context->fsuid,
- context->egid, context->sgid, context->fsgid, tty,
- tsk->sessionid);
-
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
audit_log_task_info(ab, tsk);
audit_log_key(ab, context->filterkey);
@@ -1657,11 +1391,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
switch (aux->type) {
- case AUDIT_EXECVE: {
- struct audit_aux_data_execve *axi = (void *)aux;
- audit_log_execve_info(context, &ab, axi);
- break; }
-
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1731,8 +1460,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
i = 0;
- list_for_each_entry(n, &context->names_list, list)
- audit_log_name(context, n, i++, &call_panic);
+ list_for_each_entry(n, &context->names_list, list) {
+ if (n->hidden)
+ continue;
+ audit_log_name(context, n, NULL, i++, &call_panic);
+ }
+
+ audit_log_proctitle(tsk, context);
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1752,7 +1486,7 @@ void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- context = audit_get_context(tsk, 0, 0);
+ context = audit_take_context(tsk, 0, 0);
if (!context)
return;
@@ -1797,42 +1531,6 @@ void __audit_syscall_entry(int arch, int major,
if (!context)
return;
- /*
- * This happens only on certain architectures that make system
- * calls in kernel_thread via the entry.S interface, instead of
- * with direct calls. (If you are porting to a new
- * architecture, hitting this condition can indicate that you
- * got the _exit/_leave calls backward in entry.S.)
- *
- * i386 no
- * x86_64 no
- * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
- *
- * This also happens with vm86 emulation in a non-nested manner
- * (entries without exits), so this case must be caught.
- */
- if (context->in_syscall) {
- struct audit_context *newctx;
-
-#if AUDIT_DEBUG
- printk(KERN_ERR
- "audit(:%d) pid=%d in syscall=%d;"
- " entering syscall=%d\n",
- context->serial, tsk->pid, context->major, major);
-#endif
- newctx = audit_alloc_context(context->state);
- if (newctx) {
- newctx->previous = context;
- context = newctx;
- tsk->audit_context = newctx;
- } else {
- /* If we can't alloc a new context, the best we
- * can do is to leak memory (any pending putname
- * will be lost). The only other alternative is
- * to abandon auditing. */
- audit_zero_context(context, context->state);
- }
- }
BUG_ON(context->in_syscall || context->name_count);
if (!audit_enabled)
@@ -1882,7 +1580,7 @@ void __audit_syscall_exit(int success, long return_code)
else
success = AUDITSC_FAILURE;
- context = audit_get_context(tsk, success, return_code);
+ context = audit_take_context(tsk, success, return_code);
if (!context)
return;
@@ -1895,28 +1593,21 @@ void __audit_syscall_exit(int success, long return_code)
if (!list_empty(&context->killed_trees))
audit_kill_trees(&context->killed_trees);
- if (context->previous) {
- struct audit_context *new_context = context->previous;
- context->previous = NULL;
- audit_free_context(context);
- tsk->audit_context = new_context;
- } else {
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
- tsk->audit_context = context;
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ audit_free_aux(context);
+ context->aux = NULL;
+ context->aux_pids = NULL;
+ context->target_pid = 0;
+ context->target_sid = 0;
+ context->sockaddr_len = 0;
+ context->type = 0;
+ context->fds[0] = -1;
+ if (context->state != AUDIT_RECORD_CONTEXT) {
+ kfree(context->filterkey);
+ context->filterkey = NULL;
}
+ tsk->audit_context = context;
}
static inline void handle_one(const struct inode *inode)
@@ -1939,7 +1630,7 @@ static inline void handle_one(const struct inode *inode)
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
- printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
@@ -1998,8 +1689,7 @@ retry:
goto retry;
}
/* too bad */
- printk(KERN_WARNING
- "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
@@ -2008,7 +1698,8 @@ retry:
#endif
}
-static struct audit_names *audit_alloc_name(struct audit_context *context)
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
{
struct audit_names *aname;
@@ -2023,6 +1714,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
aname->ino = (unsigned long)-1;
+ aname->type = type;
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
@@ -2033,33 +1725,62 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr)
+ return n->name;
+ }
+ return NULL;
+}
+
+/**
* audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void __audit_getname(const char *name)
+void __audit_getname(struct filename *name)
{
struct audit_context *context = current->audit_context;
struct audit_names *n;
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ pr_err("%s:%d(:%d): ignoring getname(%p)\n",
__FILE__, __LINE__, context->serial, name);
dump_stack();
#endif
return;
}
- n = audit_alloc_name(context);
+#if AUDIT_DEBUG
+ /* The filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
n->name = name;
n->name_len = AUDIT_NAME_FULL;
n->name_put = true;
+ name->aname = n;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
@@ -2072,112 +1793,124 @@ void __audit_getname(const char *name)
* then we delay the putname until syscall exit.
* Called from include/linux/fs.h:putname().
*/
-void audit_putname(const char *name)
+void audit_putname(struct filename *name)
{
struct audit_context *context = current->audit_context;
BUG_ON(!context);
- if (!context->in_syscall) {
+ if (!name->aname || !context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ pr_err("%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
struct audit_names *n;
- int i;
+ int i = 0;
list_for_each_entry(n, &context->names_list, list)
- printk(KERN_ERR "name[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ pr_err("name[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
#endif
- __putname(name);
+ final_putname(name);
}
#if AUDIT_DEBUG
else {
++context->put_count;
if (context->put_count > context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d"
- " in_syscall=%d putname(%p) name_count=%d"
- " put_count=%d\n",
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
+ " name_count=%d put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
- context->in_syscall, name, context->name_count,
- context->put_count);
+ context->in_syscall, name->name,
+ context->name_count, context->put_count);
dump_stack();
}
}
#endif
}
-static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-
-/* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
/**
- * audit_inode - store the inode and device from a lookup
+ * __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- *
- * Called from fs/namei.c:path_lookup().
+ * @flags: attributes for this particular entry
*/
-void __audit_inode(const char *name, const struct dentry *dentry)
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int flags)
{
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
struct audit_names *n;
+ bool parent = flags & AUDIT_INODE_PARENT;
if (!context->in_syscall)
return;
+ if (!name)
+ goto out_alloc;
+
+#if AUDIT_DEBUG
+ /* The struct filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (n->name && (n->name == name))
- goto out;
+ /* does the name pointer match? */
+ if (!n->name || n->name->name != name->name)
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
}
- /* unable to find the name from a previous getname() */
- n = audit_alloc_name(context);
+out_alloc:
+ /* unable to find the name from a previous getname(). Allocate a new
+ * anonymous entry.
+ */
+ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
if (!n)
return;
out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ if (flags & AUDIT_INODE_HIDDEN)
+ n->hidden = true;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
+ }
handle_path(dentry);
audit_copy_inode(n, dentry, inode);
}
/**
- * audit_inode_child - collect inode info for created/removed objects
- * @dentry: dentry being audited
+ * __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -2187,15 +1920,14 @@ out:
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct dentry *dentry,
- const struct inode *parent)
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
const char *dname = dentry->d_name.name;
- struct audit_names *n;
- int dirlen = 0;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
if (!context->in_syscall)
return;
@@ -2203,62 +1935,65 @@ void __audit_inode_child(const struct dentry *dentry,
if (inode)
handle_one(inode);
- /* parent is more likely, look for it first */
+ /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ if (!n->name || n->type != AUDIT_TYPE_PARENT)
continue;
if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- n->name_len = dirlen; /* update parent data in place */
- found_parent = n->name;
- goto add_names;
+ !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ found_parent = n;
+ break;
}
}
- /* no matching parent, look for matching child */
+ /* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ /* can only match entries that have a name */
+ if (!n->name || n->type != type)
continue;
- /* strcmp() is the more likely scenario */
- if (!strcmp(dname, n->name) ||
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- if (inode)
- audit_copy_inode(n, NULL, inode);
- else
- n->ino = (unsigned long)-1;
- found_child = n->name;
- goto add_names;
+ /* if we found a parent, make sure this one is a child of it */
+ if (found_parent && (n->name != found_parent->name))
+ continue;
+
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ found_child = n;
+ break;
}
}
-add_names:
if (!found_parent) {
- n = audit_alloc_name(context);
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- n = audit_alloc_name(context);
- if (!n)
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
return;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- n->name = found_parent;
- n->name_len = AUDIT_NAME_FULL;
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- n->name_put = false;
+ found_child->name_put = false;
}
-
- if (inode)
- audit_copy_inode(n, NULL, inode);
}
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2290,6 +2025,47 @@ int auditsc_get_stamp(struct audit_context *ctx,
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid, unsigned int sessionid,
+ int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+
+ if (!audit_enabled)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+ audit_log_end(ab);
+}
+
/**
* audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
@@ -2298,38 +2074,29 @@ static atomic_t session_id = ATOMIC_INIT(0);
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(uid_t loginuid)
+int audit_set_loginuid(kuid_t loginuid)
{
struct task_struct *task = current;
- struct audit_context *context = task->audit_context;
- unsigned int sessionid;
+ unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ kuid_t oldloginuid;
+ int rc;
-#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
- if (task->loginuid != -1)
- return -EPERM;
-#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
- sessionid = atomic_inc_return(&session_id);
- if (context && context->in_syscall) {
- struct audit_buffer *ab;
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (ab) {
- audit_log_format(ab, "login pid=%d uid=%u "
- "old auid=%u new auid=%u"
- " old ses=%u new ses=%u",
- task->pid, task_uid(task),
- task->loginuid, loginuid,
- task->sessionid, sessionid);
- audit_log_end(ab);
- }
- }
task->sessionid = sessionid;
task->loginuid = loginuid;
- return 0;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
}
/**
@@ -2450,38 +2217,31 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
context->ipc.has_perm = 1;
}
-int __audit_bprm(struct linux_binprm *bprm)
+void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- ax = kmalloc(sizeof(*ax), GFP_KERNEL);
- if (!ax)
- return -ENOMEM;
-
- ax->argc = bprm->argc;
- ax->envc = bprm->envc;
- ax->mm = bprm->mm;
- ax->d.type = AUDIT_EXECVE;
- ax->d.next = context->aux;
- context->aux = (void *)ax;
- return 0;
+ context->type = AUDIT_EXECVE;
+ context->execve.argc = bprm->argc;
}
/**
* audit_socketcall - record audit data for sys_socketcall
- * @nargs: number of args
+ * @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
-void __audit_socketcall(int nargs, unsigned long *args)
+int __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
+ if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
+ return -EINVAL;
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
+ return 0;
}
/**
@@ -2524,7 +2284,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = t->pid;
+ context->target_pid = task_pid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2545,12 +2305,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
- uid_t uid = current_uid(), t_uid = task_uid(t);
+ kuid_t uid = current_uid(), t_uid = task_uid(t);
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = tsk->pid;
- if (tsk->loginuid != -1)
+ audit_sig_pid = task_pid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
@@ -2563,7 +2323,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
- ctx->target_pid = t->tgid;
+ ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
@@ -2584,7 +2344,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
- axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2643,18 +2403,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
/**
* __audit_log_capset - store information about the arguments to the capset syscall
- * @pid: target pid of the capset call
* @new: the new credentials
* @old: the old (current) credentials
*
* Record the aguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
-void __audit_log_capset(pid_t pid,
- const struct cred *new, const struct cred *old)
+void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = pid;
+ context->capset.pid = task_pid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2669,25 +2427,34 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
-static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+static void audit_log_task(struct audit_buffer *ab)
{
- uid_t auid, uid;
- gid_t gid;
+ kuid_t auid, uid;
+ kgid_t gid;
unsigned int sessionid;
+ struct mm_struct *mm = current->mm;
auid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
current_uid_gid(&uid, &gid);
audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
audit_log_untrustedstring(ab, current->comm);
- audit_log_format(ab, " reason=");
- audit_log_string(ab, reason);
- audit_log_format(ab, " sig=%ld", signr);
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
}
+
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
@@ -2706,17 +2473,26 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "memory violation", signr);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", SIGKILL);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c..1323360d90e 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
static void backtrace_test_normal(void)
{
- printk("Testing a backtrace from process context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from process context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
- printk("Testing a backtrace from irq context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from irq context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
struct stack_trace trace;
unsigned long entries[8];
- printk("Testing a saved backtrace.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a saved backtrace.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
#else
static void backtrace_test_saved(void)
{
- printk("Saved backtrace test skipped.\n");
+ pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
- printk("====[ backtrace testing ]===========\n");
+ pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
- printk("====[ end of backtrace testing ]====\n");
+ pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b..9fd4246b04b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,8 @@
#include <linux/mmzone.h>
#include <linux/kbuild.h>
#include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
void foo(void)
{
@@ -17,5 +19,9 @@ void foo(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+ DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+ DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c647..a5cf13c018c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,8 @@
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
@@ -22,7 +24,6 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-
EXPORT_SYMBOL(__cap_empty_set);
int file_caps_enabled = 1;
@@ -42,15 +43,10 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- static int warned;
- if (!warned) {
- char name[sizeof(current->comm)];
-
- printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
- " (legacy support in use)\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
+ get_task_comm(name, current));
}
/*
@@ -71,16 +67,10 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- static int warned;
-
- if (!warned) {
- char name[sizeof(current->comm)];
+ char name[sizeof(current->comm)];
- printk(KERN_INFO "warning: `%s' uses deprecated v2"
- " capabilities in a way that may be insecure.\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
+ get_task_comm(name, current));
}
/*
@@ -198,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
- * unexpectidly fail; the capget/modify/capset aborts
+ * unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
@@ -277,7 +267,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (ret < 0)
goto error;
- audit_log_capset(pid, new, current_cred());
+ audit_log_capset(new, current_cred());
return commit_creds(new);
@@ -380,7 +370,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
- printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+ pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
@@ -393,6 +383,31 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns,
+ int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -409,13 +424,19 @@ bool capable(int cap)
EXPORT_SYMBOL(capable);
/**
- * nsown_capable - Check superior capability to one's own user_ns
+ * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @inode: The inode in question
* @cap: The capability in question
*
- * Return true if the current task has the given superior capability
- * targeted at its own user namespace.
+ * Return true if the current task has the given capability targeted at
+ * its own user namespace and that the given inode's uid and gid are
+ * mapped into the current user namespace.
*/
-bool nsown_capable(int cap)
+bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
- return ns_capable(current_user_ns(), cap);
+ struct user_namespace *ns = current_user_ns();
+
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
}
+EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f7..70776aec256 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,14 +26,16 @@
* distribution for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/fs.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
@@ -41,182 +43,135 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
-#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
-#include <linux/namei.h>
+#include <linux/hashtable.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/atomic.h>
/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on. Modifying requires both cgroup_mutex and
- * cgroup_root_mutex. Readers can acquire either of the two. This is to
- * break the following locking order cycle.
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
*
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
- *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
*/
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
+#else
static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_MUTEX(cgroup_root_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
+#endif
/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
*/
-#define SUBSYS(_x) &_x ## _subsys,
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
-#include <linux/cgroup_subsys.h>
-};
-
-#define MAX_CGROUP_ROOT_NAMELEN 64
+static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
*/
-struct cgroupfs_root {
- struct super_block *sb;
-
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
- unsigned long subsys_bits;
-
- /* Unique id for this hierarchy. */
- int hierarchy_id;
-
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_bits;
+static DEFINE_SPINLOCK(release_agent_path_lock);
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
+#define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
- /* The root cgroup for this hierarchy */
- struct cgroup top_cgroup;
-
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
-
- /* A list running through the active hierarchies */
- struct list_head root_list;
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
- /* Hierarchy-specific flags */
- unsigned long flags;
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
- /* The path to use for release notifications. */
- char release_agent_path[PATH_MAX];
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
- /* The name for this hierarchy - may be empty */
- char name[MAX_CGROUP_ROOT_NAMELEN];
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
/*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
*/
-static struct cgroupfs_root rootnode;
+struct cgroup_root cgrp_dfl_root;
/*
- * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- * cgroup_subsys->use_id != 0.
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
*/
-#define CSS_ID_MAX (65535)
-struct css_id {
- /*
- * The css to which this ID points. This pointer is set to valid value
- * after cgroup is populated. If cgroup is removed, this will be NULL.
- * This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
- */
- struct cgroup_subsys_state __rcu *css;
- /*
- * ID of this css.
- */
- unsigned short id;
- /*
- * Depth in hierarchy which this ID belongs to.
- */
- unsigned short depth;
- /*
- * ID is freed by RCU. (and lookup routine is RCU safe.)
- */
- struct rcu_head rcu_head;
- /*
- * Hierarchy of CSS ID belongs to.
- */
- unsigned short stack[0]; /* Array of Length (depth+1) */
-};
+static bool cgrp_dfl_root_visible;
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
- /*
- * Cgroup which the event belongs to.
- */
- struct cgroup *cgrp;
- /*
- * Control file which the event associated.
- */
- struct cftype *cft;
- /*
- * eventfd to signal userspace about the event.
- */
- struct eventfd_ctx *eventfd;
- /*
- * Each of these stored in a list by the cgroup.
- */
- struct list_head list;
- /*
- * All fields below needed to unregister event when
- * userspace closes eventfd.
- */
- poll_table pt;
- wait_queue_head_t *wqh;
- wait_queue_t wait;
- struct work_struct remove;
-};
+/* some controllers are not supported in the default hierarchy */
+static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
+#ifdef CONFIG_CGROUP_DEBUG
+ | (1 << debug_cgrp_id)
+#endif
+ ;
/* The list of hierarchy roots */
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
@@ -225,30 +180,152 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
*/
static int need_forkexit_callback __read_mostly;
-#ifdef CONFIG_PROVE_LOCKING
-int cgroup_lock_is_held(void)
+static struct cftype cgroup_base_files[];
+
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned int ss_mask);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
{
- return lockdep_is_held(&cgroup_mutex);
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
}
-#else /* #ifdef CONFIG_PROVE_LOCKING */
-int cgroup_lock_is_held(void)
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
- return mutex_is_locked(&cgroup_mutex);
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
}
-#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
-EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgroup_parent(cgrp);
+
+ return cgroup_css(cgrp, ss);
+}
/* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
- return test_bit(CGRP_REMOVED, &cgrp->flags);
+ return !(cgrp->self.flags & CSS_ONLINE);
}
-/* bits in struct cgroupfs_root flags field */
-enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
-};
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgroup_parent(cgrp);
+ }
+ return false;
+}
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -263,21 +340,55 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
@@ -287,40 +398,80 @@ static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
- /*
- * List running through cg_cgroup_links associated with a
- * cgroup, anchored on cgroup->css_sets
- */
- struct list_head cgrp_link_list;
- struct cgroup *cgrp;
- /*
- * List running through cg_cgroup_links pointing at a
- * single css_set object, anchored on css_set->cg_links
- */
- struct list_head cg_link_list;
- struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
};
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
-static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static int css_set_count = 1; /* 1 for init_css_set */
-static int cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *css);
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
/*
* hash table for cgroup groups. This improves the performance to find
@@ -328,141 +479,136 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
int i;
- int index;
- unsigned long tmp = 0UL;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- return &css_set_table[index];
+ return key;
}
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cg->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cg->refcount)) {
- write_unlock(&css_set_lock);
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (!atomic_dec_and_test(&cset->refcount))
return;
- }
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
+ hash_del(&cset->hlist);
css_set_count--;
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
struct cgroup *cgrp = link->cgrp;
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- if (atomic_dec_and_test(&cgrp->count) &&
- notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+
+ /* @cgrp can't go away while we're holding css_set_rwsem */
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ if (notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
}
kfree(link);
}
- write_unlock(&css_set_lock);
- kfree_rcu(cg, rcu_head);
+ kfree_rcu(cset, rcu_head);
}
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
+static void put_css_set(struct css_set *cset, bool taskexit)
{
- atomic_inc(&cg->refcount);
-}
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
-static inline void put_css_set(struct css_set *cg)
-{
- __put_css_set(cg, 0);
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
}
-static inline void put_css_set_taskexit(struct css_set *cg)
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
{
- __put_css_set(cg, 1);
+ atomic_inc(&cset->refcount);
}
-/*
+/**
* compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
-static bool compare_css_sets(struct css_set *cg,
- struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
- if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
- /* Not all subsystems matched */
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
- }
/*
* Compare cgroup pointers in order to distinguish between
- * different cgroups in heirarchies with no subsystems. We
- * could get by with just this check alone (and skip the
- * memcmp above) but on most setups the memcmp check will
- * avoid the need for this more expensive check on almost all
- * candidates.
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
*/
-
- l1 = &cg->cg_links;
- l2 = &old_cg->cg_links;
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
while (1) {
- struct cg_cgroup_link *cgl1, *cgl2;
- struct cgroup *cg1, *cg2;
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
- if (l1 == &cg->cg_links) {
- BUG_ON(l2 != &old_cg->cg_links);
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
- BUG_ON(l2 == &old_cg->cg_links);
+ BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
- cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
- cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
- cg1 = cgl1->cgrp;
- cg2 = cgl2->cgrp;
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
- BUG_ON(cg1->root != cg2->root);
+ BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
@@ -471,239 +617,340 @@ static bool compare_css_sets(struct css_set *cg,
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
- if (cg1->root == new_cgrp->root) {
- if (cg1 != new_cgrp)
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
return false;
} else {
- if (cg1 != cg2)
+ if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
*/
-static struct css_set *find_existing_css_set(
- struct css_set *oldcg,
- struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
int i;
- struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
- struct css_set *cg;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- if (root->subsys_bits & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgrp->subsys[i];
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
- template[i] = oldcg->subsys[i];
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
- if (!compare_css_sets(cg, oldcg, cgrp, template))
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
- return cg;
+ return cset;
}
/* No existing cgroup group matched */
return NULL;
}
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+ struct cgrp_cset_link *link, *tmp_link;
- list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
- list_del(&link->cgrp_link_list);
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
kfree(link);
}
}
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
*/
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
int i;
- INIT_LIST_HEAD(tmp);
+
+ INIT_LIST_HEAD(tmp_links);
+
for (i = 0; i < count; i++) {
- link = kmalloc(sizeof(*link), GFP_KERNEL);
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
- free_cg_links(tmp);
+ free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
- list_add(&link->cgrp_link_list, tmp);
+ list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
-static void link_css_set(struct list_head *tmp_cg_links,
- struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
- BUG_ON(list_empty(tmp_cg_links));
- link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
- cgrp_link_list);
- link->cg = cg;
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
link->cgrp = cgrp;
- atomic_inc(&cgrp->count);
- list_move(&link->cgrp_link_list, &cgrp->css_sets);
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
+ list_move(&link->cset_link, &cgrp->cset_links);
+
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
*/
- list_add_tail(&link->cg_link_list, &cg->cg_links);
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
}
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
*/
-static struct css_set *find_css_set(
- struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp)
{
- struct css_set *res;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
- struct list_head tmp_cg_links;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct css_set *cset;
+ struct list_head tmp_links;
+ struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid;
- struct hlist_head *hhead;
- struct cg_cgroup_link *link;
+ lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
- res = find_existing_css_set(oldcg, cgrp, template);
- if (res)
- get_css_set(res);
- read_unlock(&css_set_lock);
+ down_read(&css_set_rwsem);
+ cset = find_existing_css_set(old_cset, cgrp, template);
+ if (cset)
+ get_css_set(cset);
+ up_read(&css_set_rwsem);
- if (res)
- return res;
+ if (cset)
+ return cset;
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
+ cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ if (!cset)
return NULL;
- /* Allocate all the cg_cgroup_link objects that we'll need */
- if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
- kfree(res);
+ /* Allocate all the cgrp_cset_link objects that we'll need */
+ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+ kfree(cset);
return NULL;
}
- atomic_set(&res->refcount, 1);
- INIT_LIST_HEAD(&res->cg_links);
- INIT_LIST_HEAD(&res->tasks);
- INIT_HLIST_NODE(&res->hlist);
+ atomic_set(&cset->refcount, 1);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
+ INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
- memcpy(res->subsys, template, sizeof(res->subsys));
+ memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
- list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
+
if (c->root == cgrp->root)
c = cgrp;
- link_css_set(&tmp_cg_links, res, c);
+ link_css_set(&tmp_links, cset, c);
}
- BUG_ON(!list_empty(&tmp_cg_links));
+ BUG_ON(!list_empty(&tmp_links));
css_set_count++;
- /* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ /* Add @cset to the hash table */
+ key = css_set_hash(cset->subsys);
+ hash_add(css_set_table, &cset->hlist, key);
- write_unlock(&css_set_lock);
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
- return res;
+ up_write(&css_set_rwsem);
+
+ return cset;
}
-/*
- * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
- */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroupfs_root *root)
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct css_set *css;
- struct cgroup *res = NULL;
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+
+ return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- read_lock(&css_set_lock);
/*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
*/
- css = task->cgroups;
- if (css == &init_css_set) {
- res = &root->top_cgroup;
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
} else {
- struct cg_cgroup_link *link;
- list_for_each_entry(link, &css->cg_links, cg_link_list) {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
+
if (c->root == root) {
res = c;
break;
}
}
}
- read_unlock(&css_set_lock);
+
BUG_ON(!res);
return res;
}
/*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
@@ -729,384 +976,302 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
- * need a special hack to ensure that top_cgroup cannot be deleted.
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
- * another. It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
+ * need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+static const struct file_operations proc_cgroupstats_operations;
+
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- mutex_lock(&cgroup_mutex);
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
-EXPORT_SYMBOL_GPL(cgroup_lock);
/**
- * cgroup_unlock - release lock on cgroup changes
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
*
- * Undo the lock taken in a previous cgroup_lock() call.
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
*/
-void cgroup_unlock(void)
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unlock);
+ umode_t mode = 0;
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
+ if (cft->mode)
+ return cft->mode;
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp);
-static const struct inode_operations cgroup_dir_inode_operations;
-static const struct file_operations proc_cgroupstats_operations;
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .name = "cgroup",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
+ if (cft->write_u64 || cft->write_s64 || cft->write)
+ mode |= S_IWUSR;
-static int alloc_css_id(struct cgroup_subsys *ss,
- struct cgroup *parent, struct cgroup *child);
+ return mode;
+}
-static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
+static void cgroup_get(struct cgroup *cgrp)
{
- struct inode *inode = new_inode(sb);
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
- if (inode) {
- inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
+static void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
*/
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
+static void cgroup_kn_unlock(struct kernfs_node *kn)
{
- struct cgroup_subsys *ss;
- int ret = 0;
+ struct cgroup *cgrp;
- for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy) {
- ret = ss->pre_destroy(ss, cgrp);
- if (ret)
- break;
- }
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- return ret;
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
-
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->destroy(ss, cgrp);
+ struct cgroup *cgrp;
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(kn);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ mutex_lock(&cgroup_mutex);
- kfree_rcu(cgrp, rcu_head);
- }
- iput(inode);
-}
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
-static int cgroup_delete(const struct dentry *d)
-{
- return 1;
+ cgroup_kn_unlock(kn);
+ return NULL;
}
-static void remove_dir(struct dentry *d)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
- struct dentry *parent = dget(d->d_parent);
+ char name[CGROUP_FILE_NAME_MAX];
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ lockdep_assert_held(&cgroup_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
-static void cgroup_clear_directory(struct dentry *dentry)
+/**
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
+ */
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- struct list_head *node;
+ struct cgroup_subsys *ss;
+ int i;
- BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dentry->d_lock);
- node = dentry->d_subdirs.next;
- while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(node);
- if (d->d_inode) {
- /* This should never be called on a cgroup
- * directory with child cgroups */
- BUG_ON(d->d_inode->i_mode & S_IFDIR);
- dget_dlock(d);
- spin_unlock(&d->d_lock);
- spin_unlock(&dentry->d_lock);
- d_delete(d);
- simple_unlink(dentry->d_inode, d);
- dput(d);
- spin_lock(&dentry->d_lock);
- } else
- spin_unlock(&d->d_lock);
- node = dentry->d_subdirs.next;
+ if (!(subsys_mask & (1 << i)))
+ continue;
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
}
- spin_unlock(&dentry->d_lock);
}
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
- struct dentry *parent;
-
- cgroup_clear_directory(dentry);
-
- parent = dentry->d_parent;
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
- remove_dir(dentry);
-}
+ struct cgroup_subsys *ss;
+ unsigned int tmp_ss_mask;
+ int ssid, i, ret;
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+ lockdep_assert_held(&cgroup_mutex);
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
+ for_each_subsys(ss, ssid) {
+ if (!(ss_mask & (1 << ssid)))
+ continue;
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ return -EBUSY;
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
+ return -EBUSY;
+ }
-/*
- * Call with cgroup_mutex held. Drops reference counts on modules, including
- * any duplicate ones that parse_cgroupfs_options took. If this function
- * returns an error, no reference counts are touched.
- */
-static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long final_bits)
-{
- unsigned long added_bits, removed_bits;
- struct cgroup *cgrp = &root->top_cgroup;
- int i;
+ /* skip creating root files on dfl_root for inhibited subsystems */
+ tmp_ss_mask = ss_mask;
+ if (dst_root == &cgrp_dfl_root)
+ tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
+ ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+ if (ret) {
+ if (dst_root != &cgrp_dfl_root)
+ return ret;
- removed_bits = root->actual_subsys_bits & ~final_bits;
- added_bits = final_bits & ~root->actual_subsys_bits;
- /* Check that any added subsystems are currently free */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
- struct cgroup_subsys *ss = subsys[i];
- if (!(bit & added_bits))
- continue;
/*
- * Nobody should tell us to do a subsys that doesn't exist:
- * parse_cgroupfs_options should catch that case and refcounts
- * ensure that subsystems won't disappear once selected.
+ * Rebinding back to the default root is not allowed to
+ * fail. Using both default and non-default roots should
+ * be rare. Moving subsystems back and forth even more so.
+ * Just warn about it and continue.
*/
- BUG_ON(ss == NULL);
- if (ss->root != &rootnode) {
- /* Subsystem isn't free */
- return -EBUSY;
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
}
- /* Currently we don't handle adding/removing subsystems when
- * any child cgroups exist. This is theoretically supportable
- * but involves complex error handling, so it's being left until
- * later */
- if (root->number_of_cgroups > 1)
- return -EBUSY;
+ /*
+ * Nothing can fail from this point on. Remove files for the
+ * removed subsystems and rebind each subsystem.
+ */
+ for_each_subsys(ss, ssid)
+ if (ss_mask & (1 << ssid))
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- /* Process each subsystem */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- unsigned long bit = 1UL << i;
- if (bit & added_bits) {
- /* We're binding this subsystem to this hierarchy */
- BUG_ON(ss == NULL);
- BUG_ON(cgrp->subsys[i]);
- BUG_ON(!dummytop->subsys[i]);
- BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
- mutex_lock(&ss->hierarchy_mutex);
- cgrp->subsys[i] = dummytop->subsys[i];
- cgrp->subsys[i]->cgroup = cgrp;
- list_move(&ss->sibling, &root->subsys_list);
- ss->root = root;
- if (ss->bind)
- ss->bind(ss, cgrp);
- mutex_unlock(&ss->hierarchy_mutex);
- /* refcount was already taken, and we're keeping it */
- } else if (bit & removed_bits) {
- /* We're removing this subsystem */
- BUG_ON(ss == NULL);
- BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
- mutex_lock(&ss->hierarchy_mutex);
- if (ss->bind)
- ss->bind(ss, dummytop);
- dummytop->subsys[i]->cgroup = dummytop;
- cgrp->subsys[i] = NULL;
- subsys[i]->root = &rootnode;
- list_move(&ss->sibling, &rootnode.subsys_list);
- mutex_unlock(&ss->hierarchy_mutex);
- /* subsystem is now free - drop reference on module */
- module_put(ss->module);
- } else if (bit & final_bits) {
- /* Subsystem state should already exist */
- BUG_ON(ss == NULL);
- BUG_ON(!cgrp->subsys[i]);
- /*
- * a refcount was taken, but we already had one, so
- * drop the extra reference.
- */
- module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
- BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
- } else {
- /* Subsystem state shouldn't exist */
- BUG_ON(cgrp->subsys[i]);
- }
+ for_each_subsys(ss, ssid) {
+ struct cgroup_root *src_root;
+ struct cgroup_subsys_state *css;
+ struct css_set *cset;
+
+ if (!(ss_mask & (1 << ssid)))
+ continue;
+
+ src_root = ss->root;
+ css = cgroup_css(&src_root->cgrp, ss);
+
+ WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+
+ RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+ rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ ss->root = dst_root;
+ css->cgroup = &dst_root->cgrp;
+
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root)
+ dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+
+ if (ss->bind)
+ ss->bind(css);
}
- root->subsys_bits = root->actual_subsys_bits = final_bits;
- synchronize_rcu();
+ kernfs_activate(dst_root->cgrp.kn);
return 0;
}
-static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
-
- mutex_lock(&cgroup_root_mutex);
- for_each_subsys(root, ss)
- seq_printf(seq, ",%s", ss->name);
- if (test_bit(ROOT_NOPREFIX, &root->flags))
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->name);
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+ seq_puts(seq, ",sane_behavior");
+ if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- if (clone_children(&root->top_cgroup))
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_root_mutex);
return 0;
}
struct cgroup_sb_opts {
- unsigned long subsys_bits;
- unsigned long flags;
+ unsigned int subsys_mask;
+ unsigned int flags;
char *release_agent;
- bool clone_children;
+ bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
-
- struct cgroupfs_root *new_root;
-
};
-/*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
- */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = (unsigned long)-1;
+ unsigned int mask = -1U;
+ struct cgroup_subsys *ss;
int i;
- bool module_pin_failed = false;
-
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_subsys_id);
+ mask = ~(1U << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1126,12 +1291,20 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
if (!strcmp(token, "noprefix")) {
- set_bit(ROOT_NOPREFIX, &opts->flags);
+ opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
- opts->clone_children = true;
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
continue;
}
if (!strncmp(token, "release_agent=", 14)) {
@@ -1170,10 +1343,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
}
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
+ for_each_subsys(ss, i) {
if (strcmp(token, ss->name))
continue;
if (ss->disabled)
@@ -1182,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
- set_bit(i, &opts->subsys_bits);
+ opts->subsys_mask |= (1 << i);
one_ss = true;
break;
@@ -1191,501 +1361,479 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- /*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options
- * were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name)) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->disabled)
- continue;
- set_bit(i, &opts->subsys_bits);
+ /* Consistency checks */
+
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ return -EINVAL;
}
- }
+ } else {
+ /*
+ * If the 'all' option was specified select all the
+ * subsystems, otherwise if 'none', 'name=' and a subsystem
+ * name options were not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
- /* Consistency checks */
+ /*
+ * We either have to specify by name or by subsystems. (So
+ * all empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+ }
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
- (opts->subsys_bits & mask))
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
/* Can't specify "none" and some subsystems */
- if (opts->subsys_bits && opts->none)
- return -EINVAL;
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_bits && !opts->name)
+ if (opts->subsys_mask && opts->none)
return -EINVAL;
- /*
- * Grab references on all the modules we'll need, so the subsystems
- * don't dance around before rebind_subsystems attaches them. This may
- * take duplicate reference counts on a subsystem that's already used,
- * but rebind_subsystems handles this case.
- */
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
-
- if (!(bit & opts->subsys_bits))
- continue;
- if (!try_module_get(subsys[i]->module)) {
- module_pin_failed = true;
- break;
- }
- }
- if (module_pin_failed) {
- /*
- * oops, one of the modules was going away. this means that we
- * raced with a module_delete call, and to the user this is
- * essentially a "subsystem doesn't exist" case.
- */
- for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
- /* drop refcounts only on the ones we took */
- unsigned long bit = 1UL << i;
-
- if (!(bit & opts->subsys_bits))
- continue;
- module_put(subsys[i]->module);
- }
- return -ENOENT;
- }
-
return 0;
}
-static void drop_parsed_module_refcounts(unsigned long subsys_bits)
-{
- int i;
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
-
- if (!(bit & subsys_bits))
- continue;
- module_put(subsys[i]->module);
- }
-}
-
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
+ unsigned int added_mask, removed_mask;
+
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: remount is not allowed\n");
+ return -EINVAL;
+ }
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
/* Don't allow flags or name to change at remount */
- if (opts.flags != root->flags ||
+ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
(opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+ root->flags & CGRP_ROOT_OPTION_MASK, root->name);
ret = -EINVAL;
- drop_parsed_module_refcounts(opts.subsys_bits);
goto out_unlock;
}
- ret = rebind_subsystems(root, opts.subsys_bits);
- if (ret) {
- drop_parsed_module_refcounts(opts.subsys_bits);
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
goto out_unlock;
}
- /* (re)populate subsystem files */
- cgroup_populate_dir(cgrp);
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
- if (opts.release_agent)
+ rebind_subsystems(&cgrp_dfl_root, removed_mask);
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
}
-static const struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
-static void init_cgroup_housekeeping(struct cgroup *cgrp)
+static void cgroup_enable_task_cg_lists(void)
{
- INIT_LIST_HEAD(&cgrp->sibling);
- INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->css_sets);
- INIT_LIST_HEAD(&cgrp->release_list);
- INIT_LIST_HEAD(&cgrp->pidlists);
- mutex_init(&cgrp->pidlist_mutex);
- INIT_LIST_HEAD(&cgrp->event_list);
- spin_lock_init(&cgrp->event_list_lock);
-}
+ struct task_struct *p, *g;
-static void init_cgroup_root(struct cgroupfs_root *root)
-{
- struct cgroup *cgrp = &root->top_cgroup;
- INIT_LIST_HEAD(&root->subsys_list);
- INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
- cgrp->root = root;
- cgrp->top_cgroup = cgrp;
- init_cgroup_housekeeping(cgrp);
-}
+ down_write(&css_set_rwsem);
-static bool init_root_id(struct cgroupfs_root *root)
-{
- int ret = 0;
+ if (use_task_css_set_links)
+ goto out_unlock;
- do {
- if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
- return false;
- spin_lock(&hierarchy_id_lock);
- /* Try to allocate the next unused ID */
- ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
- &root->hierarchy_id);
- if (ret == -ENOSPC)
- /* Try again starting from 0 */
- ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
- if (!ret) {
- next_hierarchy_id = root->hierarchy_id + 1;
- } else if (ret != -EAGAIN) {
- /* Can only get here if the 31-bit IDR is full ... */
- BUG_ON(ret);
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING)) {
+ struct css_set *cset = task_css_set(p);
+
+ list_add(&p->cg_list, &cset->tasks);
+ get_css_set(cset);
}
- spin_unlock(&hierarchy_id_lock);
- } while (ret);
- return true;
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
}
-static int cgroup_test_super(struct super_block *sb, void *data)
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
- struct cgroup_sb_opts *opts = data;
- struct cgroupfs_root *root = sb->s_fs_info;
+ struct cgroup_subsys *ss;
+ int ssid;
- /* If we asked for a name then it must match */
- if (opts->name && strcmp(opts->name, root->name))
- return 0;
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
+ INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match
- */
- if ((opts->subsys_bits || opts->none)
- && (opts->subsys_bits != root->subsys_bits))
- return 0;
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
- return 1;
+ init_waitqueue_head(&cgrp->offline_waitq);
}
-static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+static void init_cgroup_root(struct cgroup_root *root,
+ struct cgroup_sb_opts *opts)
{
- struct cgroupfs_root *root;
-
- if (!opts->subsys_bits && !opts->none)
- return NULL;
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root)
- return ERR_PTR(-ENOMEM);
+ struct cgroup *cgrp = &root->cgrp;
- if (!init_root_id(root)) {
- kfree(root);
- return ERR_PTR(-ENOMEM);
- }
- init_cgroup_root(root);
+ INIT_LIST_HEAD(&root->root_list);
+ atomic_set(&root->nr_cgrps, 1);
+ cgrp->root = root;
+ init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
- root->subsys_bits = opts->subsys_bits;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
- if (opts->clone_children)
- set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
- return root;
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static void cgroup_drop_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
- if (!root)
- return;
-
- BUG_ON(!root->hierarchy_id);
- spin_lock(&hierarchy_id_lock);
- ida_remove(&hierarchy_ida, root->hierarchy_id);
- spin_unlock(&hierarchy_id_lock);
- kfree(root);
-}
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct css_set *cset;
+ int i, ret;
-static int cgroup_set_super(struct super_block *sb, void *data)
-{
- int ret;
- struct cgroup_sb_opts *opts = data;
+ lockdep_assert_held(&cgroup_mutex);
- /* If we don't have a new root, we can't set up a new sb */
- if (!opts->new_root)
- return -EINVAL;
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
- BUG_ON(!opts->subsys_bits && !opts->none);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out;
- ret = set_anon_super(sb, NULL);
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
if (ret)
- return ret;
+ goto cancel_ref;
- sb->s_fs_info = opts->new_root;
- opts->new_root->sb = sb;
+ ret = cgroup_init_root_id(root);
+ if (ret)
+ goto cancel_ref;
+
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CGROUP_SUPER_MAGIC;
- sb->s_op = &cgroup_ops;
+ ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (ret)
+ goto destroy_root;
- return 0;
-}
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto destroy_root;
-static int cgroup_get_rootdir(struct super_block *sb)
-{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- .d_delete = cgroup_delete,
- };
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
- struct inode *inode =
- cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
- struct dentry *dentry;
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
- if (!inode)
- return -ENOMEM;
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &cgroup_dir_inode_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- dentry = d_alloc_root(inode);
- if (!dentry) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = dentry;
- /* for everything else we want ->d_op set */
- sb->s_d_op = &cgroup_dops;
- return 0;
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
+
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
}
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
+ struct cgroup_root *root;
struct cgroup_sb_opts opts;
- struct cgroupfs_root *root;
- int ret = 0;
- struct super_block *sb;
- struct cgroupfs_root *new_root;
- struct inode *inode;
+ struct dentry *dentry;
+ int ret;
+ int i;
+ bool new_sb;
+
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
- /* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
+
+ /* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- mutex_unlock(&cgroup_mutex);
if (ret)
- goto out_err;
+ goto out_unlock;
- /*
- * Allocate a new cgroup root. We may not need it if we're
- * reusing an existing hierarchy.
- */
- new_root = cgroup_root_from_opts(&opts);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
- goto drop_modules;
- }
- opts.new_root = new_root;
-
- /* Locate an existing or new sb for this hierarchy */
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- cgroup_drop_root(opts.new_root);
- goto drop_modules;
+ /* look for a matching existing root */
+ if (!opts.subsys_mask && !opts.none && !opts.name) {
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ ret = 0;
+ goto out_unlock;
}
- root = sb->s_fs_info;
- BUG_ON(!root);
- if (root == opts.new_root) {
- /* We used the new root structure, so this is a new hierarchy */
- struct list_head tmp_cg_links;
- struct cgroup *root_cgrp = &root->top_cgroup;
- struct cgroupfs_root *existing_root;
- const struct cred *cred;
- int i;
-
- BUG_ON(sb->s_root != NULL);
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
- ret = cgroup_get_rootdir(sb);
- if (ret)
- goto drop_new_super;
- inode = sb->s_root->d_inode;
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
+ for_each_root(root) {
+ bool name_match = false;
- /* Check for name clashes with existing mounts */
- ret = -EBUSY;
- if (strlen(root->name))
- for_each_active_root(existing_root)
- if (!strcmp(existing_root->name, root->name))
- goto unlock_drop;
+ if (root == &cgrp_dfl_root)
+ continue;
/*
- * We're accessing css_set_count without locking
- * css_set_lock here, but that's OK - it can only be
- * increased by someone holding cgroup_lock, and
- * that's us. The worst that can happen is that we
- * have some link structures left over
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
*/
- ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret)
- goto unlock_drop;
-
- ret = rebind_subsystems(root, root->subsys_bits);
- if (ret == -EBUSY) {
- free_cg_links(&tmp_cg_links);
- goto unlock_drop;
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
}
+
/*
- * There must be no failure case after here, since rebinding
- * takes care of subsystems' refcounts, which are explicitly
- * dropped in the failure exit path.
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
*/
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
- /* EBUSY should be the only error here */
- BUG_ON(ret);
-
- list_add(&root->root_list, &roots);
- root_count++;
-
- sb->s_root->d_fsdata = root_cgrp;
- root->top_cgroup.dentry = sb->s_root;
-
- /* Link the top cgroup in this hierarchy into all
- * the css_set objects */
- write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
+ if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
+ if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ } else {
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+ }
+ }
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
}
- write_unlock(&css_set_lock);
- free_cg_links(&tmp_cg_links);
+ ret = 0;
+ goto out_unlock;
+ }
- BUG_ON(!list_empty(&root_cgrp->sibling));
- BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- cred = override_creds(&init_cred);
- cgroup_populate_dir(root_cgrp);
- revert_creds(cred);
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- /*
- * We re-used an existing hierarchy - the new root (if
- * any) is not needed
- */
- cgroup_drop_root(opts.new_root);
- /* no subsys rebinding, so refcounts don't change */
- drop_parsed_module_refcounts(opts.subsys_bits);
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- kfree(opts.release_agent);
- kfree(opts.name);
- return dget(sb->s_root);
+ init_cgroup_root(root, &opts);
+
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
- unlock_drop:
- mutex_unlock(&cgroup_root_mutex);
+out_unlock:
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- drop_new_super:
- deactivate_locked_super(sb);
- drop_modules:
- drop_parsed_module_refcounts(opts.subsys_bits);
- out_err:
+out_free:
kfree(opts.release_agent);
kfree(opts.name);
- return ERR_PTR(ret);
-}
-
-static void cgroup_kill_sb(struct super_block *sb) {
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
- int ret;
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
- BUG_ON(!root);
-
- BUG_ON(root->number_of_cgroups != 1);
- BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
-
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
+ if (ret)
+ return ERR_PTR(ret);
- /* Rebind all subsystems back to the default hierarchy */
- ret = rebind_subsystems(root, 0);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
+ cgroup_put(&root->cgrp);
/*
- * Release all the links from css_sets to this hierarchy's
- * root cgroup
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
*/
- write_lock(&css_set_lock);
-
- list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
- cgrp_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- kfree(link);
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
}
- write_unlock(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- root_count--;
- }
+ return dentry;
+}
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+
+ /*
+ * If @root doesn't have any mounts or children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
+ */
+ if (css_has_online_children(&root->cgrp.self) ||
+ root == &cgrp_dfl_root)
+ cgroup_put(&root->cgrp);
+ else
+ percpu_ref_kill(&root->cgrp.self.refcnt);
- kill_litter_super(sb);
- cgroup_drop_root(root);
+ kernfs_kill_sb(sb);
}
static struct file_system_type cgroup_fs_type = {
@@ -1696,81 +1844,66 @@ static struct file_system_type cgroup_fs_type = {
static struct kobject *cgroup_kobj;
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
+ * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
+ * @task: target task
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Determine @task's cgroup on the first (the one with the lowest non-zero
+ * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
+ * function grabs cgroup_mutex and shouldn't be used inside locks used by
+ * cgroup controller callbacks.
+ *
+ * Return value is the same as kernfs_path().
*/
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
- char *start;
- struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
-
- if (!dentry || cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
- return 0;
- }
-
- start = buf + buflen;
+ struct cgroup_root *root;
+ struct cgroup *cgrp;
+ int hierarchy_id = 1;
+ char *path = NULL;
- *--start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
+ root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
- dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
- if (!cgrp->parent)
- continue;
- if (--start < buf)
- return -ENAMETOOLONG;
- *start = '/';
+ if (root) {
+ cgrp = task_cgroup_from_root(task, root);
+ path = cgroup_path(cgrp, buf, buflen);
+ } else {
+ /* if no hierarchy exists, everyone is in "/" */
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
}
- memmove(buf, start, buf + buflen - start);
- return 0;
-}
-EXPORT_SYMBOL_GPL(cgroup_path);
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
- struct task_struct *task;
- struct cgroup *cgrp;
-};
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return path;
+}
+EXPORT_SYMBOL_GPL(task_cgroup_path);
+/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
- struct task_and_cgroup single;
- struct flex_array *tc_array;
- int tc_array_len;
- int idx;
- struct cgroup *cur_cgrp;
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
/**
@@ -1781,15 +1914,11 @@ struct cgroup_taskset {
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- if (tset->tc_array) {
- tset->idx = 0;
- return cgroup_taskset_next(tset);
- } else {
- tset->cur_cgrp = tset->single.cgrp;
- return tset->single.task;
- }
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_first);
/**
* cgroup_taskset_next - iterate to the next task in taskset
@@ -1800,54 +1929,45 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- struct task_and_cgroup *tc;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
- if (!tset->tc_array || tset->idx >= tset->tc_array_len)
- return NULL;
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
- tc = flex_array_get(tset->tc_array, tset->idx++);
- tset->cur_cgrp = tc->cgrp;
- return tc->task;
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
-/**
- * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
- * @tset: taskset of interest
- *
- * Return the cgroup for the current (last returned) task of @tset. This
- * function must be preceded by either cgroup_taskset_first() or
- * cgroup_taskset_next().
- */
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
-{
- return tset->cur_cgrp;
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
-/**
- * cgroup_taskset_size - return the number of tasks in taskset
- * @tset: taskset of interest
- */
-int cgroup_taskset_size(struct cgroup_taskset *tset)
-{
- return tset->tc_array ? tset->tc_array_len : 1;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_size);
-
-/*
+/**
* cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
*
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
*/
-static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
- struct task_struct *tsk, bool guarantee)
+static void cgroup_task_migrate(struct cgroup *old_cgrp,
+ struct task_struct *tsk,
+ struct css_set *new_cset)
{
- struct css_set *oldcg;
- struct css_set *newcg;
+ struct css_set *old_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
/*
* We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1855,1182 +1975,1584 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
* css_set to init_css_set and dropping the old one.
*/
WARN_ON_ONCE(tsk->flags & PF_EXITING);
- oldcg = tsk->cgroups;
-
- /* locate or allocate a new css_set for this task. */
- if (guarantee) {
- /* we know the css_set we want already exists. */
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
- read_lock(&css_set_lock);
- newcg = find_existing_css_set(oldcg, cgrp, template);
- BUG_ON(!newcg);
- get_css_set(newcg);
- read_unlock(&css_set_lock);
- } else {
- might_sleep();
- /* find_css_set will give us newcg already referenced. */
- newcg = find_css_set(oldcg, cgrp);
- if (!newcg)
- return -ENOMEM;
- }
-
- task_lock(tsk);
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ old_cset = task_css_set(tsk);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &newcg->tasks);
- write_unlock(&css_set_lock);
+ get_css_set(new_cset);
+ rcu_assign_pointer(tsk->cgroups, new_cset);
/*
- * We just gained a reference on oldcg by taking it from the task. As
- * trading it for newcg is protected by cgroup_mutex, we're safe to drop
- * it here; it will be freed under RCU.
+ * Use move_tail so that cgroup_taskset_first() still returns the
+ * leader after migration. This works because cgroup_migrate()
+ * ensures that the dst_cset of the leader is the first on the
+ * tset's dst_csets list.
*/
- put_css_set(oldcg);
+ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- return 0;
+ /*
+ * We just gained a reference on old_cset by taking it from the
+ * task. As trading it for new_cset is protected by cgroup_mutex,
+ * we're safe to drop it here; it will be freed under RCU.
+ */
+ set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
+ put_css_set_locked(old_cset, false);
}
/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
+ * cgroup_migrate_finish - cleanup after attach
+ * @preloaded_csets: list of preloaded css_sets
*
- * Call with cgroup_mutex and threadgroup locked. May take task_lock of
- * @tsk during call.
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
*/
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
- int retval;
- struct cgroup_subsys *ss, *failed_ss = NULL;
- struct cgroup *oldcgrp;
- struct cgroupfs_root *root = cgrp->root;
- struct cgroup_taskset tset = { };
-
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
- return -ESRCH;
-
- /* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- return 0;
-
- tset.single.task = tsk;
- tset.single.cgrp = oldcgrp;
-
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, &tset);
- if (retval) {
- /*
- * Remember on which subsystem the can_attach()
- * failed, so that we only call cancel_attach()
- * against the subsystems whose can_attach()
- * succeeded. (See below)
- */
- failed_ss = ss;
- goto out;
- }
- }
- }
+ struct css_set *cset, *tmp_cset;
- retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
- if (retval)
- goto out;
+ lockdep_assert_held(&cgroup_mutex);
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(ss, cgrp, &tset);
+ down_write(&css_set_rwsem);
+ list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_preload_node);
+ put_css_set_locked(cset, false);
}
-
- synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
-out:
- if (retval) {
- for_each_subsys(root, ss) {
- if (ss == failed_ss)
- /*
- * This subsystem was the one that failed the
- * can_attach() check earlier, so we don't need
- * to call cancel_attach() against it or any
- * remaining subsystems.
- */
- break;
- if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, &tset);
- }
- }
- return retval;
+ up_write(&css_set_rwsem);
}
/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
*/
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- struct cgroupfs_root *root;
- int retval = 0;
+ struct cgroup *src_cgrp;
- cgroup_lock();
- for_each_active_root(root) {
- struct cgroup *from_cg = task_cgroup_from_root(from, root);
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
- retval = cgroup_attach_task(from_cg, tsk);
- if (retval)
- break;
- }
- cgroup_unlock();
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
- return retval;
+ if (!list_empty(&src_cset->mg_preload_node))
+ return;
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
+
+ src_cset->mg_src_cgrp = src_cgrp;
+ get_css_set(src_cset);
+ list_add(&src_cset->mg_preload_node, preloaded_csets);
}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-/*
- * cgroup_attach_proc works in two stages, the first of which prefetches all
- * new css_sets needed (to make sure we have enough memory before committing
- * to the move) and stores them in a list of entries of the following type.
- * TODO: possible optimization: use css_set->rcu_head for chaining instead
+/**
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @dst_cgrp: the destination cgroup (may be %NULL)
+ * @preloaded_csets: list of preloaded source css_sets
+ *
+ * Tasks are about to be moved to @dst_cgrp and all the source css_sets
+ * have been preloaded to @preloaded_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @preloaded_csets.
*/
-struct cg_list_entry {
- struct css_set *cg;
- struct list_head links;
-};
-
-static bool css_set_check_fetched(struct cgroup *cgrp,
- struct task_struct *tsk, struct css_set *cg,
- struct list_head *newcg_list)
+static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- struct css_set *newcg;
- struct cg_list_entry *cg_entry;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ LIST_HEAD(csets);
+ struct css_set *src_cset, *tmp_cset;
- read_lock(&css_set_lock);
- newcg = find_existing_css_set(cg, cgrp, template);
- read_unlock(&css_set_lock);
+ lockdep_assert_held(&cgroup_mutex);
- /* doesn't exist at all? */
- if (!newcg)
- return false;
- /* see if it's already in the list */
- list_for_each_entry(cg_entry, newcg_list, links)
- if (cg_entry->cg == newcg)
- return true;
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
- /* not found */
- return false;
-}
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ struct css_set *dst_cset;
-/*
- * Find the new css_set and store it in the list in preparation for moving the
- * given task to the given cgroup. Returns 0 or -ENOMEM.
- */
-static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
- struct list_head *newcg_list)
-{
- struct css_set *newcg;
- struct cg_list_entry *cg_entry;
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
+ if (!dst_cset)
+ goto err;
- /* ensure a new css_set will exist for this thread */
- newcg = find_css_set(cg, cgrp);
- if (!newcg)
- return -ENOMEM;
- /* add it to the list */
- cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
- if (!cg_entry) {
- put_css_set(newcg);
- return -ENOMEM;
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
+
+ src_cset->mg_dst_cset = dst_cset;
+
+ if (list_empty(&dst_cset->mg_preload_node))
+ list_add(&dst_cset->mg_preload_node, &csets);
+ else
+ put_css_set(dst_cset, false);
}
- cg_entry->cg = newcg;
- list_add(&cg_entry->links, newcg_list);
+
+ list_splice_tail(&csets, preloaded_csets);
return 0;
+err:
+ cgroup_migrate_finish(&csets);
+ return -ENOMEM;
}
/**
- * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
- * @cgrp: the cgroup to attach to
- * @leader: the threadgroup leader task_struct of the group to be attached
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @cgrp: the destination cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
+ *
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
*
- * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of each thread in leader's threadgroup individually in turn.
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
*/
-static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
-{
- int retval, i, group_size;
- struct cgroup_subsys *ss, *failed_ss = NULL;
- /* guaranteed to be initialized later, but the compiler needs this */
- struct css_set *oldcg;
- struct cgroupfs_root *root = cgrp->root;
- /* threadgroup list cursor and array */
- struct task_struct *tsk;
- struct task_and_cgroup *tc;
- struct flex_array *group;
- struct cgroup_taskset tset = { };
- /*
- * we need to make sure we have css_sets for all the tasks we're
- * going to move -before- we actually start moving them, so that in
- * case we get an ENOMEM we can bail out before making any changes.
- */
- struct list_head newcg_list;
- struct cg_list_entry *cg_entry, *temp_nobe;
+static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
+ bool threadgroup)
+{
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
/*
- * step 0: in order to do expensive, possibly blocking operations for
- * every thread, we cannot iterate the thread group list, since it needs
- * rcu or tasklist locked. instead, build an array of all threads in the
- * group - group_rwsem prevents new threads from appearing, and if
- * threads exit, this will just be an over-estimate.
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
*/
- group_size = get_nr_threads(leader);
- /* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
- if (!group)
- return -ENOMEM;
- /* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
- if (retval)
- goto out_free_group_list;
-
- /* prevent changes to the threadgroup list while we take a snapshot. */
- read_lock(&tasklist_lock);
- if (!thread_group_leader(leader)) {
- /*
- * a race with de_thread from another thread's exec() may strip
- * us of our leadership, making while_each_thread unsafe to use
- * on this task. if this happens, there is no choice but to
- * throw this task away and try again (from cgroup_procs_write);
- * this is "double-double-toil-and-trouble-check locking".
- */
- read_unlock(&tasklist_lock);
- retval = -EAGAIN;
- goto out_free_group_list;
- }
-
- tsk = leader;
- i = 0;
+ down_write(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
do {
- struct task_and_cgroup ent;
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ goto next;
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
- continue;
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ goto next;
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ goto next;
- /* as per above, nr_threads may decrease, but not increase. */
- BUG_ON(i >= group_size);
/*
- * saying GFP_ATOMIC has no effect here because we did prealloc
- * earlier, but it's good form to communicate our expectations.
+ * cgroup_taskset_first() must always return the leader.
+ * Take care to avoid disturbing the ordering.
*/
- ent.task = tsk;
- ent.cgrp = task_cgroup_from_root(tsk, root);
- /* nothing to do if this task is already in the cgroup */
- if (ent.cgrp == cgrp)
- continue;
- retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(retval != 0);
- i++;
- } while_each_thread(leader, tsk);
- /* remember the number of threads in the array for later. */
- group_size = i;
- tset.tc_array = group;
- tset.tc_array_len = group_size;
- read_unlock(&tasklist_lock);
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset.dst_csets);
+ next:
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_write(&css_set_rwsem);
/* methods shouldn't be called if no task is actually migrating */
- retval = 0;
- if (!group_size)
- goto out_free_group_list;
+ if (list_empty(&tset.src_csets))
+ return 0;
- /*
- * step 1: check that we can legitimately attach to the cgroup.
- */
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, &tset);
- if (retval) {
- failed_ss = ss;
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
+ failed_css = css;
goto out_cancel_attach;
}
}
}
/*
- * step 2: make sure css_sets exist for all threads to be migrated.
- * we use find_css_set, which allocates a new one if necessary.
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- INIT_LIST_HEAD(&newcg_list);
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- oldcg = tc->task->cgroups;
-
- /* if we don't already have it in the list get a new one */
- if (!css_set_check_fetched(cgrp, tc->task, oldcg,
- &newcg_list)) {
- retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
- if (retval)
- goto out_list_teardown;
- }
+ down_write(&css_set_rwsem);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
+ up_write(&css_set_rwsem);
/*
- * step 3: now that we're guaranteed success wrt the css_sets,
- * proceed to move all tasks to the new cgroup. There are no
- * failure cases after here, so this is the commit point.
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
*/
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
- BUG_ON(retval);
- }
- /* nothing is sensitive to fork() after this point. */
+ tset.csets = &tset.dst_csets;
- /*
- * step 4: do subsystem attach callbacks.
- */
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(ss, cgrp, &tset);
- }
+ for_each_e_css(css, i, cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, &tset);
+
+ ret = 0;
+ goto out_release_tset;
- /*
- * step 5: success! and cleanup
- */
- synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
- retval = 0;
-out_list_teardown:
- /* clean up the list of prefetched css_sets. */
- list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
- list_del(&cg_entry->links);
- put_css_set(cg_entry->cg);
- kfree(cg_entry);
- }
out_cancel_attach:
- /* same deal as in cgroup_attach_task */
- if (retval) {
- for_each_subsys(root, ss) {
- if (ss == failed_ss)
- break;
- if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, &tset);
- }
+ for_each_e_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
-out_free_group_list:
- flex_array_free(group);
- return retval;
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ up_write(&css_set_rwsem);
+ return ret;
+}
+
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+{
+ LIST_HEAD(preloaded_csets);
+ struct task_struct *task;
+ int ret;
+
+ /* look up all src csets */
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+ &preloaded_csets);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ if (!ret)
+ ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup; may take task_lock of task.
+ * cgroup_mutex and threadgroup.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
+ struct cgroup *cgrp;
+ pid_t pid;
int ret;
- if (!cgroup_lock_live_group(cgrp))
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
+retry_find_task:
+ rcu_read_lock();
if (pid) {
- rcu_read_lock();
tsk = find_task_by_vpid(pid);
if (!tsk) {
rcu_read_unlock();
- cgroup_unlock();
- return -ESRCH;
- }
- if (threadgroup) {
- /*
- * RCU protects this access, since tsk was found in the
- * tid map. a race with de_thread may cause group_leader
- * to stop being the leader, but cgroup_attach_proc will
- * detect it later.
- */
- tsk = tsk->group_leader;
+ ret = -ESRCH;
+ goto out_unlock_cgroup;
}
/*
* even if we're attaching all tasks in the thread group, we
* only need to check permissions on one of them.
*/
tcred = __task_cred(tsk);
- if (cred->euid &&
- cred->euid != tcred->uid &&
- cred->euid != tcred->suid) {
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
rcu_read_unlock();
- cgroup_unlock();
- return -EACCES;
+ ret = -EACCES;
+ goto out_unlock_cgroup;
}
- get_task_struct(tsk);
+ } else
+ tsk = current;
+
+ if (threadgroup)
+ tsk = tsk->group_leader;
+
+ /*
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
+ * trapped in a cpuset, or RT worker may be born in a cgroup
+ * with no rt_runtime allocated. Just say no.
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
rcu_read_unlock();
- } else {
- if (threadgroup)
- tsk = current->group_leader;
- else
- tsk = current;
- get_task_struct(tsk);
+ goto out_unlock_cgroup;
}
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
- if (threadgroup)
- ret = cgroup_attach_proc(cgrp, tsk);
- else
- ret = cgroup_attach_task(cgrp, tsk);
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
threadgroup_unlock(tsk);
put_task_struct(tsk);
- cgroup_unlock();
- return ret;
+out_unlock_cgroup:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
}
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
- return attach_task_by_pid(cgrp, pid, false);
+ struct cgroup_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int ret;
- do {
- /*
- * attach_proc fails with -EAGAIN if threadgroup leadership
- * changes in the middle of the operation, in which case we need
- * to find the task_struct for the new leader and start over.
- */
- ret = attach_task_by_pid(cgrp, tgid, true);
- } while (ret == -EAGAIN);
- return ret;
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
}
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
- */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- mutex_lock(&cgroup_mutex);
- if (cgroup_is_removed(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
- }
- return true;
+ return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup *cgrp;
+
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- if (strlen(buffer) >= PATH_MAX)
- return -EINVAL;
- if (!cgroup_lock_live_group(cgrp))
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- mutex_lock(&cgroup_root_mutex);
- strcpy(cgrp->root->release_agent_path, buffer);
- mutex_unlock(&cgroup_root_mutex);
- cgroup_unlock();
- return 0;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
}
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
seq_putc(seq, '\n');
- cgroup_unlock();
return 0;
}
-/* A buffer size big enough for numbers or short strings */
-#define CGROUP_LOCAL_BUFFER_SIZE 64
-
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- char buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- char *end;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (!nbytes)
- return -EINVAL;
- if (nbytes >= sizeof(buffer))
- return -E2BIG;
- if (copy_from_user(buffer, userbuf, nbytes))
- return -EFAULT;
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ return 0;
+}
- buffer[nbytes] = 0; /* nul-terminate */
- if (cft->write_u64) {
- u64 val = simple_strtoull(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_u64(cgrp, cft, val);
- } else {
- s64 val = simple_strtoll(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_s64(cgrp, cft, val);
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
}
- if (!retval)
- retval = nbytes;
- return retval;
+ if (printed)
+ seq_putc(seq, '\n');
}
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
{
- char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- size_t max_bytes = cft->max_write_len;
- char *buffer = local_buffer;
-
- if (!max_bytes)
- max_bytes = sizeof(local_buffer) - 1;
- if (nbytes >= max_bytes)
- return -E2BIG;
- /* Allocate a dynamic buffer if we need one */
- if (nbytes >= sizeof(local_buffer)) {
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
- }
- if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out;
- }
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- buffer[nbytes] = 0; /* nul-terminate */
- retval = cft->write_string(cgrp, cft, strstrip(buffer));
- if (!retval)
- retval = nbytes;
-out:
- if (buffer != local_buffer)
- kfree(buffer);
- return retval;
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
+ ~cgrp_dfl_root_inhibit_ss_mask);
+ return 0;
}
-static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (cgroup_is_removed(cgrp))
- return -ENODEV;
- if (cft->write)
- return cft->write(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_u64 || cft->write_s64)
- return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_string)
- return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->trigger) {
- int ret = cft->trigger(cgrp, (unsigned int)cft->private);
- return ret ? ret : nbytes;
- }
- return -EINVAL;
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ return 0;
}
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- u64 val = cft->read_u64(cgrp, cft);
- int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ return 0;
}
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- s64 val = cft->read_s64(cgrp, cft);
- int len = sprintf(tmp, "%lld\n", (long long) val);
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
+ }
+ up_read(&css_set_rwsem);
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
+ }
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
-static ssize_t cgroup_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ unsigned int enable = 0, disable = 0;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
+
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name) ||
+ ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
- if (cgroup_is_removed(cgrp))
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- if (cft->read)
- return cft->read(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_u64)
- return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_s64)
- return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
- return -EINVAL;
-}
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->child_subsys_mask & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
-struct cgroup_seqfile_state {
- struct cftype *cft;
- struct cgroup *cgroup;
-};
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->child_subsys_mask & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
-static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+ /*
+ * Create csses for enables and update child_subsys_mask. This
+ * changes cgroup_e_css() results which in turn makes the
+ * subsequent cgroup_update_dfl_csses() associate all tasks in the
+ * subtree to the updated csses.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ ret = create_css(child, ss);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ cgrp->child_subsys_mask |= enable;
+ cgrp->child_subsys_mask &= ~disable;
+
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /* all tasks are now migrated away from the old csses, kill them */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp)
+ kill_css(cgroup_css(child, ss));
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+
+err_undo_css:
+ cgrp->child_subsys_mask &= ~enable;
+ cgrp->child_subsys_mask |= disable;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+ if (css)
+ kill_css(css);
+ }
+ }
+ goto out_unlock;
+}
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
{
- struct seq_file *sf = cb->state;
- return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
}
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- struct cgroup_seqfile_state *state = m->private;
- struct cftype *cft = state->cft;
- if (cft->read_map) {
- struct cgroup_map_cb cb = {
- .fill = cgroup_map_add,
- .state = m,
- };
- return cft->read_map(state->cgroup, cft, &cb);
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
}
- return cft->read_seq_string(state->cgroup, cft, m);
+
+ return ret ?: nbytes;
}
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- struct seq_file *seq = file->private_data;
- kfree(seq->private);
- return single_release(inode, file);
+ return seq_cft(seq)->seq_start(seq, ppos);
}
-static const struct file_operations cgroup_seqfile_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = seq_lseek,
- .release = cgroup_seqfile_release,
-};
-
-static int cgroup_file_open(struct inode *inode, struct file *file)
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- int err;
- struct cftype *cft;
-
- err = generic_file_open(inode, file);
- if (err)
- return err;
- cft = __d_cft(file->f_dentry);
-
- if (cft->read_map || cft->read_seq_string) {
- struct cgroup_seqfile_state *state =
- kzalloc(sizeof(*state), GFP_USER);
- if (!state)
- return -ENOMEM;
- state->cft = cft;
- state->cgroup = __d_cgrp(file->f_dentry->d_parent);
- file->f_op = &cgroup_seqfile_operations;
- err = single_open(file, cgroup_seqfile_show, state);
- if (err < 0)
- kfree(state);
- } else if (cft->open)
- err = cft->open(inode, file);
- else
- err = 0;
+ return seq_cft(seq)->seq_next(seq, v, ppos);
+}
- return err;
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
+{
+ seq_cft(seq)->seq_stop(seq, v);
}
-static int cgroup_file_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- if (cft->release)
- return cft->release(inode, file);
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
+
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
+
+ if (cft->read_u64)
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
return 0;
}
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
+
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
{
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
+ if (kn->parent != new_parent)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ /*
+ * This isn't a proper migration and its usefulness is very
+ * limited. Disallow if sane_behavior.
+ */
+ if (cgroup_sane_behavior(cgrp))
+ return -EPERM;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
}
-static const struct file_operations cgroup_file_operations = {
- .read = cgroup_file_read,
- .write = cgroup_file_write,
- .llseek = generic_file_llseek,
- .open = cgroup_file_open,
- .release = cgroup_file_release,
-};
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
-static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = cgroup_lookup,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
-};
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
- d_add(dentry, NULL);
- return NULL;
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+ int ret;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, false, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
}
-/*
- * Check if a file is a control file
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails. If addition fails, this
+ * function doesn't remove files already added. The caller is responsible
+ * for cleaning up.
*/
-static inline struct cftype *__file_cft(struct file *file)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
- return ERR_PTR(-EINVAL);
- return __d_cft(file->f_dentry);
+ struct cftype *cft;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
+ continue;
+
+ if (is_add) {
+ ret = cgroup_add_file(cgrp, cft);
+ if (ret) {
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
+ return ret;
+ }
+ } else {
+ cgroup_rm_file(cgrp, cft);
+ }
+ }
+ return 0;
}
-static int cgroup_create_file(struct dentry *dentry, umode_t mode,
- struct super_block *sb)
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- struct inode *inode;
+ LIST_HEAD(pending);
+ struct cgroup_subsys *ss = cfts[0].ss;
+ struct cgroup *root = &ss->root->cgrp;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
+ lockdep_assert_held(&cgroup_mutex);
- inode = cgroup_new_inode(mode, sb);
- if (!inode)
- return -ENOMEM;
+ /* add/rm files for all cgroups created before */
+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_is_dead(cgrp))
+ continue;
+
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ if (ret)
+ break;
+ }
- if (S_ISDIR(mode)) {
- inode->i_op = &cgroup_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
+}
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+static void cgroup_exit_cftypes(struct cftype *cfts)
+{
+ struct cftype *cft;
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cgroup_file_operations;
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
}
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
+ }
+
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
+ return 0;
+}
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
*/
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- umode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
- }
- dput(dentry);
+int cgroup_rm_cftypes(struct cftype *cfts)
+{
+ int ret;
- return error;
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
*
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
*/
-static umode_t cgroup_file_mode(const struct cftype *cft)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
- umode_t mode = 0;
+ int ret;
- if (cft->mode)
- return cft->mode;
+ if (ss->disabled)
+ return 0;
- if (cft->read || cft->read_u64 || cft->read_s64 ||
- cft->read_map || cft->read_seq_string)
- mode |= S_IRUGO;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
- if (cft->write || cft->write_u64 || cft->write_s64 ||
- cft->write_string || cft->trigger)
- mode |= S_IWUSR;
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
- return mode;
+ mutex_lock(&cgroup_mutex);
+
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
+
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
-int cgroup_add_file(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype *cft)
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
{
- struct dentry *dir = cgrp->dentry;
- struct dentry *dentry;
- int error;
- umode_t mode;
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ up_read(&css_set_rwsem);
+ return count;
+}
- char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
- if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
- strcpy(name, subsys->name);
- strcat(name, ".");
+/**
+ * css_next_child - find the next child of a given css
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
+ *
+ * This function returns the next child of @parent and should be called
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /*
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() inbetween iterations.
+ *
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
+ */
+ if (!pos) {
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
+ } else {
+ list_for_each_entry_rcu(next, &parent->children, sibling)
+ if (next->serial_nr > pos->serial_nr)
+ break;
}
- strcat(name, cft->name);
- BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
- dentry = lookup_one_len(name, dir, strlen(name));
- if (!IS_ERR(dentry)) {
- mode = cgroup_file_mode(cft);
- error = cgroup_create_file(dentry, mode | S_IFREG,
- cgrp->root->sb);
- if (!error)
- dentry->d_fsdata = (void *)cft;
- dput(dentry);
- } else
- error = PTR_ERR(dentry);
- return error;
+
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_add_file);
-int cgroup_add_files(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype cft[],
- int count)
+/**
+ * css_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_pre(). Find the next descendant
+ * to visit for pre-order traversal of @root's descendants. @root is
+ * included in the iteration and the first node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
{
- int i, err;
- for (i = 0; i < count; i++) {
- err = cgroup_add_file(cgrp, subsys, &cft[i]);
- if (err)
- return err;
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
}
- return 0;
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_add_files);
/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
*
- * Return the number of tasks in the cgroup.
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
*/
-int cgroup_task_count(const struct cgroup *cgrp)
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
- int count = 0;
- struct cg_cgroup_link *link;
+ struct cgroup_subsys_state *last, *tmp;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
- count += atomic_read(&link->cg->refcount);
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
}
- read_unlock(&css_set_lock);
- return count;
+ rcu_read_unlock();
+ return ret;
}
-/*
- * Advance a list_head iterator. The iterator should be positioned at
- * the start of a css_set
+/**
+ * css_advance_task_iter - advance a task itererator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
*/
-static void cgroup_advance_iter(struct cgroup *cgrp,
- struct cgroup_iter *it)
+static void css_advance_task_iter(struct css_task_iter *it)
{
- struct list_head *l = it->cg_link;
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct list_head *l = it->cset_pos;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &cgrp->css_sets) {
- it->cg_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
- cg = link->cg;
- } while (list_empty(&cg->tasks));
- it->cg_link = l;
- it->task = cg->tasks.next;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
-/*
- * To reduce the fork() overhead for systems that are not actually
- * using their cgroups capability, we don't maintain the lists running
- * through each css_set to its tasks until we see the list actually
- * used - in other words after the first call to cgroup_iter_start().
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @it: the task iterator to use
*
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
+ *
+ * Note that this function acquires a lock which is released when the
+ * iteration finishes. The caller can't sleep while iteration is in
+ * progress.
*/
-static void cgroup_enable_task_cg_lists(void)
+void css_task_iter_start(struct cgroup_subsys_state *css,
+ struct css_task_iter *it)
+ __acquires(css_set_rwsem)
{
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- do_each_thread(g, p) {
- task_lock(p);
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- */
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
- list_add(&p->cg_list, &p->cgroups->tasks);
- task_unlock(p);
- } while_each_thread(g, p);
- write_unlock(&css_set_lock);
-}
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
-void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
- __acquires(css_set_lock)
-{
- /*
- * The first time anyone tries to iterate across a cgroup,
- * we need to enable the list linking each css_set to its
- * tasks, and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ down_read(&css_set_rwsem);
+
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
- read_lock(&css_set_lock);
- it->cg_link = &cgrp->css_sets;
- cgroup_advance_iter(cgrp, it);
+ it->cset_head = it->cset_pos;
+
+ css_advance_task_iter(it);
}
-struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
- struct cgroup_iter *it)
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cg_cgroup_link *link;
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cg_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
- /* Advance iterator to find next entry */
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
l = l->next;
- link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
- if (l == &link->cg->tasks) {
- /* We reached the end of this task list - move on to
- * the next cg_cgroup_link */
- cgroup_advance_iter(cgrp, it);
- } else {
- it->task = l;
- }
- return res;
-}
-void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
- __releases(css_set_lock)
-{
- read_unlock(&css_set_lock);
-}
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
-{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively) simultaneously.
- */
- return t1 > t2;
- }
+ if (l == it->mg_tasks_head)
+ css_advance_task_iter(it);
+ else
+ it->task_pos = l;
+
+ return res;
}
-/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
*/
-static inline int started_after(void *p1, void *p2)
+void css_task_iter_end(struct css_task_iter *it)
+ __releases(css_set_rwsem)
{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
+ up_read(&css_set_rwsem);
}
/**
- * cgroup_scan_tasks - iterate though all the tasks in a cgroup
- * @scan: struct cgroup_scanner containing arguments for the scan
- *
- * Arguments include pointers to callback functions test_task() and
- * process_task().
- * Iterate through all the tasks in a cgroup, calling test_task() for each,
- * and if it returns true, call process_task() for it also.
- * The test_task pointer may be NULL, meaning always true (select all tasks).
- * Effectively duplicates cgroup_iter_{start,next,end}()
- * but does not lock css_set_lock for the call to process_task().
- * The struct cgroup_scanner may be embedded in any structure of the caller's
- * creation.
- * It is guaranteed that process_task() will act on every task that
- * is a member of the cgroup for the duration of this call. This
- * function may or may not call process_task() for tasks that exit
- * or move to a different cgroup during the call, or are forked or
- * move into the cgroup during the call.
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
*
- * Note that test_task() may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should
- * be cheap.
- * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
- * pre-allocated and will be used for heap operations (and its "gt" member will
- * be overwritten), else a temporary heap will be used (allocation of which
- * may cause this function to fail).
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
*/
-int cgroup_scan_tasks(struct cgroup_scanner *scan)
-{
- int retval, i;
- struct cgroup_iter it;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
- struct ptr_heap tmp_heap;
- struct ptr_heap *heap;
- struct timespec latest_time = { 0, 0 };
-
- if (scan->heap) {
- /* The caller supplied our heap and pre-allocated its memory */
- heap = scan->heap;
- heap->gt = &started_after;
- } else {
- /* We need to allocate our own heap memory */
- heap = &tmp_heap;
- retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- /* cannot allocate the heap */
- return retval;
- }
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* all tasks in @from are being moved, all csets are source */
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ up_read(&css_set_rwsem);
+
+ ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ if (ret)
+ goto out_err;
- again:
/*
- * Scan tasks in the cgroup, using the scanner's "test_task" callback
- * to determine which are of interest, and using the scanner's
- * "process_task" callback to process any of them that need an update.
- * Since we don't want to hold any locks during the task updates,
- * gather tasks to be processed in a heap structure.
- * The heap is sorted by descending task start time.
- * If the statically-sized heap fills up, we overflow tasks that
- * started later, and in future iterations only consider tasks that
- * started after the latest task in the previous pass. This
- * guarantees forward progress and that we don't miss any tasks.
+ * Migrate tasks one-by-one until @form is empty. This fails iff
+ * ->can_attach() fails.
*/
- heap->size = 0;
- cgroup_iter_start(scan->cg, &it);
- while ((p = cgroup_iter_next(scan->cg, &it))) {
- /*
- * Only affect tasks that qualify per the caller's callback,
- * if he provided one
- */
- if (scan->test_task && !scan->test_task(p, scan))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(heap, p);
- if (dropped == NULL) {
- /*
- * The new task was inserted; the heap wasn't
- * previously full
- */
- get_task_struct(p);
- } else if (dropped != p) {
- /*
- * The new task was inserted, and pushed out a
- * different task
- */
- get_task_struct(p);
- put_task_struct(dropped);
- }
- /*
- * Else the new task was newer than anything already in
- * the heap and wasn't inserted
- */
- }
- cgroup_iter_end(scan->cg, &it);
-
- if (heap->size) {
- for (i = 0; i < heap->size; i++) {
- struct task_struct *q = heap->ptrs[i];
- if (i == 0) {
- latest_time = q->start_time;
- latest_task = q;
- }
- /* Process the task per the caller's callback */
- scan->process_task(q, scan);
- put_task_struct(q);
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(to, task, false);
+ put_task_struct(task);
}
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't get processed.
- * Not the most efficient way to do it, but it avoids
- * having to take callback_mutex in the fork path
- */
- goto again;
- }
- if (heap == &tmp_heap)
- heap_free(&tmp_heap);
- return 0;
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&preloaded_csets);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
/*
@@ -3043,6 +3565,36 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*
*/
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3056,6 +3608,7 @@ static void *pidlist_allocate(int count)
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
+
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
@@ -3063,35 +3616,55 @@ static void pidlist_free(void *p)
else
kfree(p);
}
-static void *pidlist_resize(void *p, int newcount)
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
- void *newlist;
- /* note: if new alloc fails, old p will still be valid either way */
- if (is_vmalloc_addr(p)) {
- newlist = vmalloc(newcount * sizeof(pid_t));
- if (!newlist)
- return NULL;
- memcpy(newlist, p, newcount * sizeof(pid_t));
- vfree(p);
- } else {
- newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
}
- return newlist;
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
*/
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
- pid_t *list = *p;
- pid_t *newlist;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
@@ -3112,67 +3685,95 @@ static int pidlist_uniq(pid_t **p, int length)
dest++;
}
after:
- /*
- * if the length difference is large enough, we want to allocate a
- * smaller buffer to save memory. if this fails due to out of memory,
- * we'll just stay with what we've got.
- */
- if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = pidlist_resize(list, dest);
- if (newlist)
- *p = newlist;
- }
return dest;
}
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+ if (cgroup_sane_behavior(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+}
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
+static int fried_cmppid(const void *a, const void *b)
+{
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
- /*
- * We can't drop the pidlist_mutex before taking the l->mutex in case
- * the last ref-holder is trying to remove l from the list at the same
- * time. Holding the pidlist_mutex precludes somebody taking whichever
- * list we find out from under us - compare release_pid_array().
- */
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry(l, &cgrp->pidlists, links) {
- if (l->key.type == type && l->key.ns == ns) {
- /* make sure l doesn't vanish out from under us */
- down_write(&l->mutex);
- mutex_unlock(&cgrp->pidlist_mutex);
- return l;
- }
- }
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
/* entry not found; create a new one */
- l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
return l;
- }
- init_rwsem(&l->mutex);
- down_write(&l->mutex);
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
- l->key.ns = get_pid_ns(ns);
- l->use_count = 0; /* don't increment here */
- l->list = NULL;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
- mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
@@ -3185,10 +3786,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
struct cgroup_pidlist *l;
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
@@ -3200,8 +3803,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
@@ -3212,23 +3815,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (cgroup_sane_behavior(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(&array, length);
- l = cgroup_pidlist_find(cgrp, type);
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
- /* store array, freeing old if necessary - lock already held */
+
+ /* store array, freeing old if necessary */
pidlist_free(l->list);
l->list = array;
l->length = length;
- l->use_count++;
- up_write(&l->mutex);
*lp = l;
return 0;
}
@@ -3244,24 +3851,34 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
- int ret = -EINVAL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
/*
- * Validate dentry by checking the superblock operations,
- * and make sure it's a directory.
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
*/
- if (dentry->d_sb->s_op != &cgroup_ops ||
- !S_ISDIR(dentry->d_inode->i_mode))
- goto err;
-
- ret = 0;
- cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
@@ -3281,10 +3898,10 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
break;
}
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
-err:
- return ret;
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
@@ -3302,20 +3919,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup_pidlist *l = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
- int *iter;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
- down_read(&l->mutex);
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
- if (l->list[mid] == pid) {
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
@@ -3326,19 +3968,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
- *pos = *iter;
+ *pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup_pidlist *l = s->private;
- up_read(&l->mutex);
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup_pidlist *l = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -3349,7 +3997,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
- *pos = *p;
+ *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
@@ -3359,756 +4007,714 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return seq_printf(s, "%d\n", *(int *)v);
}
-/*
- * seq_operations functions for iterating on pidlists through seq_file -
- * independent of whether it's tasks or procs
- */
-static const struct seq_operations cgroup_pidlist_seq_operations = {
- .start = cgroup_pidlist_start,
- .stop = cgroup_pidlist_stop,
- .next = cgroup_pidlist_next,
- .show = cgroup_pidlist_show,
-};
-
-static void cgroup_release_pid_array(struct cgroup_pidlist *l)
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- /*
- * the case where we're the last user of this particular pidlist will
- * have us remove it from the cgroup's list, which entails taking the
- * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
- * pidlist_mutex, we have to take pidlist_mutex first.
- */
- mutex_lock(&l->owner->pidlist_mutex);
- down_write(&l->mutex);
- BUG_ON(!l->use_count);
- if (!--l->use_count) {
- /* we're the last user if refcount is 0; remove and free */
- list_del(&l->links);
- mutex_unlock(&l->owner->pidlist_mutex);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- up_write(&l->mutex);
- kfree(l);
- return;
- }
- mutex_unlock(&l->owner->pidlist_mutex);
- up_write(&l->mutex);
+ return notify_on_release(css->cgroup);
}
-static int cgroup_pidlist_release(struct inode *inode, struct file *file)
-{
- struct cgroup_pidlist *l;
- if (!(file->f_mode & FMODE_READ))
- return 0;
- /*
- * the seq_file will only be initialized if the file was opened for
- * reading; hence we check if it's not null only in that case.
- */
- l = ((struct seq_file *)file->private_data)->private;
- cgroup_release_pid_array(l);
- return seq_release(inode, file);
-}
-
-static const struct file_operations cgroup_pidlist_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .write = cgroup_file_write,
- .release = cgroup_pidlist_release,
-};
-
-/*
- * The following functions handle opens on a file that displays a pidlist
- * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
- * in the cgroup.
- */
-/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l;
- int retval;
-
- /* Nothing to do for write-only files */
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- /* have the array populated */
- retval = pidlist_array_load(cgrp, type, &l);
- if (retval)
- return retval;
- /* configure file information */
- file->f_op = &cgroup_pidlist_operations;
-
- retval = seq_open(file, &cgroup_pidlist_seq_operations);
- if (retval) {
- cgroup_release_pid_array(l);
- return retval;
- }
- ((struct seq_file *)file->private_data)->private = l;
+ clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
-{
- return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
-}
-static int cgroup_procs_open(struct inode *unused, struct file *file)
-{
- return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
-}
-static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft)
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return notify_on_release(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
-static int cgroup_write_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft,
- u64 val)
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
-/*
- * Unregister event and free resources.
+static struct cftype cgroup_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .flags = CFTYPE_INSANE,
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_ONLY_ON_DFL,
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
+ },
+
+ /*
+ * Historical crazy stuff. These don't have "cgroup." prefix and
+ * don't exist if sane_behavior. If you're depending on these, be
+ * prepared to be burned.
+ */
+ {
+ .name = "tasks",
+ .flags = CFTYPE_INSANE, /* use "procs" instead */
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "notify_on_release",
+ .flags = CFTYPE_INSANE,
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/**
+ * cgroup_populate_dir - create subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be added
*
- * Gets called from workqueue.
+ * On failure, no file is added.
*/
-static void cgroup_event_remove(struct work_struct *work)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- struct cgroup_event *event = container_of(work, struct cgroup_event,
- remove);
- struct cgroup *cgrp = event->cgrp;
+ struct cgroup_subsys *ss;
+ int i, ret = 0;
- event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ /* process cftsets of each subsystem */
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- eventfd_ctx_put(event->eventfd);
- kfree(event);
- dput(cgrp->dentry);
-}
+ if (!(subsys_mask & (1 << i)))
+ continue;
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
-{
- struct cgroup_event *event = container_of(wait,
- struct cgroup_event, wait);
- struct cgroup *cgrp = event->cgrp;
- unsigned long flags = (unsigned long)key;
-
- if (flags & POLLHUP) {
- __remove_wait_queue(event->wqh, &event->wait);
- spin_lock(&cgrp->event_list_lock);
- list_del(&event->list);
- spin_unlock(&cgrp->event_list_lock);
- /*
- * We are in atomic context, but cgroup_event_remove() may
- * sleep, so we have to call it in workqueue.
- */
- schedule_work(&event->remove);
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
+ if (ret < 0)
+ goto err;
+ }
}
-
return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
- wait_queue_head_t *wqh, poll_table *pt)
-{
- struct cgroup_event *event = container_of(pt,
- struct cgroup_event, pt);
-
- event->wqh = wqh;
- add_wait_queue(wqh, &event->wait);
+err:
+ cgroup_clear_dir(cgrp, subsys_mask);
+ return ret;
}
/*
- * Parse input and register new cgroup event handler.
+ * css destruction is four-stage process.
+ *
+ * 1. Destruction starts. Killing of the percpu_ref is initiated.
+ * Implemented in kill_css().
*
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
+ * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
+ *
+ * 3. When the percpu_ref reaches zero, the only possible remaining
+ * accessors are inside RCU read sections. css_release() schedules the
+ * RCU callback.
+ *
+ * 4. After the grace period, the css can be freed. Implemented in
+ * css_free_work_fn().
+ *
+ * It is actually hairier because both step 2 and 4 require process context
+ * and thus involve punting to css->destroy_work adding two additional
+ * steps to the already complex sequence.
*/
-static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
-{
- struct cgroup_event *event = NULL;
- unsigned int efd, cfd;
- struct file *efile = NULL;
- struct file *cfile = NULL;
- char *endp;
- int ret;
+static void css_free_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup *cgrp = css->cgroup;
- efd = simple_strtoul(buffer, &endp, 10);
- if (*endp != ' ')
- return -EINVAL;
- buffer = endp + 1;
+ if (css->ss) {
+ /* css free path */
+ if (css->parent)
+ css_put(css->parent);
- cfd = simple_strtoul(buffer, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
- return -EINVAL;
- buffer = endp + 1;
+ css->ss->css_free(css);
+ cgroup_put(cgrp);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ cgroup_pidlist_destroy_all(cgrp);
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
- event->cgrp = cgrp;
- INIT_LIST_HEAD(&event->list);
- init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
- init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
- INIT_WORK(&event->remove, cgroup_event_remove);
-
- efile = eventfd_fget(efd);
- if (IS_ERR(efile)) {
- ret = PTR_ERR(efile);
- goto fail;
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
+}
- event->eventfd = eventfd_ctx_fileget(efile);
- if (IS_ERR(event->eventfd)) {
- ret = PTR_ERR(event->eventfd);
- goto fail;
- }
+static void css_free_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct cgroup_subsys_state *css =
+ container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
- cfile = fget(cfd);
- if (!cfile) {
- ret = -EBADF;
- goto fail;
- }
+ INIT_WORK(&css->destroy_work, css_free_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
- /* the process need read permission on control file */
- /* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
- if (ret < 0)
- goto fail;
+static void css_release_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
- event->cft = __file_cft(cfile);
- if (IS_ERR(event->cft)) {
- ret = PTR_ERR(event->cft);
- goto fail;
- }
+ mutex_lock(&cgroup_mutex);
- if (!event->cft->register_event || !event->cft->unregister_event) {
- ret = -EINVAL;
- goto fail;
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (ss) {
+ /* css release path */
+ cgroup_idr_remove(&ss->css_idr, css->id);
+ } else {
+ /* cgroup release path */
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
}
- ret = event->cft->register_event(cgrp, event->cft,
- event->eventfd, buffer);
- if (ret)
- goto fail;
+ mutex_unlock(&cgroup_mutex);
- if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
- event->cft->unregister_event(cgrp, event->cft, event->eventfd);
- ret = 0;
- goto fail;
- }
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+}
- /*
- * Events should be removed after rmdir of cgroup directory, but before
- * destroying subsystem state objects. Let's take reference to cgroup
- * directory dentry to do that.
- */
- dget(cgrp->dentry);
+static void css_release(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
- spin_lock(&cgrp->event_list_lock);
- list_add(&event->list, &cgrp->event_list);
- spin_unlock(&cgrp->event_list_lock);
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
- fput(cfile);
- fput(efile);
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ lockdep_assert_held(&cgroup_mutex);
- return 0;
+ cgroup_get(cgrp);
-fail:
- if (cfile)
- fput(cfile);
+ memset(css, 0, sizeof(*css));
+ css->cgroup = cgrp;
+ css->ss = ss;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
+
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
+ }
- if (event && event->eventfd && !IS_ERR(event->eventfd))
- eventfd_ctx_put(event->eventfd);
+ BUG_ON(cgroup_css(cgrp, ss));
+}
- if (!IS_ERR_OR_NULL(efile))
- fput(efile);
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ int ret = 0;
- kfree(event);
+ lockdep_assert_held(&cgroup_mutex);
+ if (ss->css_online)
+ ret = ss->css_online(css);
+ if (!ret) {
+ css->flags |= CSS_ONLINE;
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+ }
return ret;
}
-static u64 cgroup_clone_children_read(struct cgroup *cgrp,
- struct cftype *cft)
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys_state *css)
{
- return clone_children(cgrp);
-}
+ struct cgroup_subsys *ss = css->ss;
-static int cgroup_clone_children_write(struct cgroup *cgrp,
- struct cftype *cft,
- u64 val)
-{
- if (val)
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
- else
- clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
- return 0;
-}
+ lockdep_assert_held(&cgroup_mutex);
-/*
- * for the common functions, 'private' gives the type of file
- */
-/* for hysterical raisins, we can't put this on the older files */
-#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
-static struct cftype files[] = {
- {
- .name = "tasks",
- .open = cgroup_tasks_open,
- .write_u64 = cgroup_tasks_write,
- .release = cgroup_pidlist_release,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = CGROUP_FILE_GENERIC_PREFIX "procs",
- .open = cgroup_procs_open,
- .write_u64 = cgroup_procs_write,
- .release = cgroup_pidlist_release,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = "notify_on_release",
- .read_u64 = cgroup_read_notify_on_release,
- .write_u64 = cgroup_write_notify_on_release,
- },
- {
- .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
- .write_string = cgroup_write_event_control,
- .mode = S_IWUGO,
- },
- {
- .name = "cgroup.clone_children",
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
-};
+ if (!(css->flags & CSS_ONLINE))
+ return;
-static struct cftype cft_release_agent = {
- .name = "release_agent",
- .read_seq_string = cgroup_release_agent_show,
- .write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
-};
+ if (ss->css_offline)
+ ss->css_offline(css);
+
+ css->flags &= ~CSS_ONLINE;
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
+}
-static int cgroup_populate_dir(struct cgroup *cgrp)
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ *
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
+ */
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
+ struct cgroup_subsys_state *css;
int err;
- struct cgroup_subsys *ss;
- /* First clear out any existing files */
- cgroup_clear_directory(cgrp->dentry);
+ lockdep_assert_held(&cgroup_mutex);
- err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
- if (err < 0)
- return err;
+ css = ss->css_alloc(parent_css);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
- if (cgrp == cgrp->top_cgroup) {
- if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
- return err;
- }
+ init_and_link_css(css, ss, cgrp);
- for_each_subsys(cgrp->root, ss) {
- if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
- return err;
- }
- /* This cgroup is ready now */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- /*
- * Update id->css pointer and make this css visible from
- * CSS ID functions. This pointer will be dereferened
- * from RCU-read-side without locks.
- */
- if (css->id)
- rcu_assign_pointer(css->id->css, css);
- }
+ err = percpu_ref_init(&css->refcnt, css_release);
+ if (err)
+ goto err_free_css;
- return 0;
-}
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
-static void init_cgroup_css(struct cgroup_subsys_state *css,
- struct cgroup_subsys *ss,
- struct cgroup *cgrp)
-{
- css->cgroup = cgrp;
- atomic_set(&css->refcnt, 1);
- css->flags = 0;
- css->id = NULL;
- if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
- BUG_ON(cgrp->subsys[ss->subsys_id]);
- cgrp->subsys[ss->subsys_id] = css;
-}
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
-{
- /* We need to take each hierarchy_mutex in a consistent order */
- int i;
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
- /*
- * No worry about a race with rebind_subsystems that might mess up the
- * locking order, since both parties are under cgroup_mutex.
- */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->root == root)
- mutex_lock(&ss->hierarchy_mutex);
+ err = online_css(css);
+ if (err)
+ goto err_list_del;
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ cgroup_parent(parent)) {
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
+ ss->warned_broken_hierarchy = true;
}
-}
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
-{
- int i;
+ return 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->root == root)
- mutex_unlock(&ss->hierarchy_mutex);
- }
+err_list_del:
+ list_del_rcu(&css->sibling);
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
+err_free_percpu_ref:
+ percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+ return err;
}
-/*
- * cgroup_create - create a cgroup
- * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
- */
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
{
- struct cgroup *cgrp;
- struct cgroupfs_root *root = parent->root;
- int err = 0;
+ struct cgroup *parent, *cgrp;
+ struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct super_block *sb = root->sb;
+ struct kernfs_node *kn;
+ int ssid, ret;
+
+ parent = cgroup_kn_lock_live(parent_kn);
+ if (!parent)
+ return -ENODEV;
+ root = parent->root;
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp)
- return -ENOMEM;
+ if (!cgrp) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- /* Grab a reference on the superblock so the hierarchy doesn't
- * get deleted on unmount if there are child cgroups. This
- * can be done outside cgroup_mutex, since the sb can't
- * disappear while someone has an open control file on the
- * fs */
- atomic_inc(&sb->s_active);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out_free_cgrp;
- mutex_lock(&cgroup_mutex);
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (cgrp->id < 0) {
+ ret = -ENOMEM;
+ goto out_cancel_ref;
+ }
init_cgroup_housekeeping(cgrp);
- cgrp->parent = parent;
- cgrp->root = parent->root;
- cgrp->top_cgroup = parent->top_cgroup;
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- if (clone_children(parent))
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-
- for_each_subsys(root, ss) {
- struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- if (IS_ERR(css)) {
- err = PTR_ERR(css);
- goto err_destroy;
- }
- init_cgroup_css(css, ss, cgrp);
- if (ss->use_id) {
- err = alloc_css_id(ss, parent, cgrp);
- if (err)
- goto err_destroy;
- }
- /* At error, ->destroy() callback has to free assigned ID. */
- if (clone_children(parent) && ss->post_clone)
- ss->post_clone(ss, cgrp);
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_id;
}
+ cgrp->kn = kn;
- cgroup_lock_hierarchy(root);
- list_add(&cgrp->sibling, &cgrp->parent->children);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups++;
+ /*
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
- err = cgroup_create_dir(cgrp, dentry, mode);
- if (err < 0)
- goto err_remove;
+ cgrp->self.serial_nr = css_serial_nr_next++;
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
+ /* allocation complete, commit to creation */
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
- err = cgroup_populate_dir(cgrp);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ /*
+ * @cgrp is now fully operational. If something fails after this
+ * point, it'll be released via the normal destruction path.
+ */
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
- return 0;
+ ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (ret)
+ goto out_destroy;
- err_remove:
+ /* let's create and online css's */
+ for_each_subsys(ss, ssid) {
+ if (parent->child_subsys_mask & (1 << ssid)) {
+ ret = create_css(cgrp, ss);
+ if (ret)
+ goto out_destroy;
+ }
+ }
- cgroup_lock_hierarchy(root);
- list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups--;
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * child_subsys_mask from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->child_subsys_mask = parent->child_subsys_mask;
- err_destroy:
+ kernfs_activate(kn);
- for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
- }
+ ret = 0;
+ goto out_unlock;
- mutex_unlock(&cgroup_mutex);
+out_free_id:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_cancel_ref:
+ percpu_ref_cancel_init(&cgrp->self.refcnt);
+out_free_cgrp:
+ kfree(cgrp);
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
+
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ goto out_unlock;
+}
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
+/*
+ * This is called when the refcnt of a css is confirmed to be killed.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initate destruction and put the css ref from kill_css().
+ */
+static void css_killed_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
- kfree(cgrp);
- return err;
+ mutex_lock(&cgroup_mutex);
+ offline_css(css);
+ mutex_unlock(&cgroup_mutex);
+
+ css_put(css);
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+/* css kill confirmation processing requires process context, bounce */
+static void css_killed_ref_fn(struct percpu_ref *ref)
{
- struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
- /* the vfs holds inode->i_mutex already */
- return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-static int cgroup_has_css_refs(struct cgroup *cgrp)
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
+ */
+static void kill_css(struct cgroup_subsys_state *css)
{
- /* Check the reference count on each subsystem. Since we
- * already established that there are no tasks in the
- * cgroup, if the css refcount is also 1, then there should
- * be no outstanding references, so the subsystem is safe to
- * destroy. We scan across all subsystems rather than using
- * the per-hierarchy linked list of mounted subsystems since
- * we can be called via check_for_release() with no
- * synchronization other than RCU, and the subsystem linked
- * list isn't RCU-safe */
- int i;
+ lockdep_assert_held(&cgroup_mutex);
+
/*
- * We won't need to lock the subsys array, because the subsystems
- * we're concerned about aren't going anywhere since our cgroup root
- * has a reference on them.
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
*/
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
- /* Skip subsystems not present or not in this hierarchy */
- if (ss == NULL || ss->root != cgrp->root)
- continue;
- css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway. */
- if (css && (atomic_read(&css->refcnt) > 1))
- return 1;
- }
- return 0;
-}
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- */
+ /*
+ * Killing would put the base ref, but we need to keep it alive
+ * until after ->css_offline().
+ */
+ css_get(css);
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
- local_irq_save(flags);
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- int refcnt;
- while (1) {
- /* We can only remove a CSS with a refcnt==1 */
- refcnt = atomic_read(&css->refcnt);
- if (refcnt > 1) {
- failed = true;
- goto done;
- }
- BUG_ON(!refcnt);
- /*
- * Drop the refcnt to 0 while we check other
- * subsystems. This will cause any racing
- * css_tryget() to spin until we set the
- * CSS_REMOVED bits or abort
- */
- if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
- break;
- cpu_relax();
- }
- }
- done:
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- if (failed) {
- /*
- * Restore old refcnt if we previously managed
- * to clear it from 1 to 0
- */
- if (!atomic_read(&css->refcnt))
- atomic_set(&css->refcnt, 1);
- } else {
- /* Commit the fact that the CSS is removed */
- set_bit(CSS_REMOVED, &css->flags);
- }
- }
- local_irq_restore(flags);
- return !failed;
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is
+ * invoked, no new css reference will be given out via
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
+ * proceed to offlining css's because percpu_ref_kill() doesn't
+ * guarantee that the ref is seen as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
- DEFINE_WAIT(wait);
- struct cgroup_event *event, *tmp;
- int ret;
+ struct cgroup_subsys_state *css;
+ bool empty;
+ int ssid;
- /* the vfs holds both inode->i_mutex already */
-again:
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
+ */
+ down_read(&css_set_rwsem);
+ empty = list_empty(&cgrp->cset_links);
+ up_read(&css_set_rwsem);
+ if (!empty)
return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
/*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
*/
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ if (css_has_online_children(&cgrp->self))
+ return -EBUSY;
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Mark @cgrp dead. This prevents further task migration and child
+ * creation by disabling cgroup_lock_live_group().
*/
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
+ cgrp->self.flags &= ~CSS_ONLINE;
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ /* initiate massacre of all css's */
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
+ /* CSS_ONLINE is clear, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- cgroup_lock_hierarchy(cgrp->root);
- /* delete this cgroup from parent->children */
- list_del_init(&cgrp->sibling);
- cgroup_unlock_hierarchy(cgrp->root);
+ /*
+ * Remove @cgrp directory along with the base files. @cgrp has an
+ * extra ref on its kn.
+ */
+ kernfs_remove(cgrp->kn);
+
+ set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
+ check_for_release(cgroup_parent(cgrp));
+
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
- d = dget(cgrp->dentry);
+ return 0;
+};
+
+static int cgroup_rmdir(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ cgrp = cgroup_kn_lock_live(kn);
+ if (!cgrp)
+ return 0;
+ cgroup_get(cgrp); /* for @kn->priv clearing */
- cgroup_d_remove_dir(d);
- dput(d);
+ ret = cgroup_destroy_locked(cgrp);
- set_bit(CGRP_RELEASABLE, &parent->flags);
- check_for_release(parent);
+ cgroup_kn_unlock(kn);
/*
- * Unregister events and notify userspace.
- * Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace
+ * There are two control paths which try to determine cgroup from
+ * dentry without going through kernfs - cgroupstats_build() and
+ * css_tryget_online_from_dir(). Those are supported by RCU
+ * protecting clearing of cgrp->kn->priv backpointer, which should
+ * happen after all files under it have been removed.
*/
- spin_lock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
- list_del(&event->list);
- remove_wait_queue(event->wqh, &event->wait);
- eventfd_signal(event->eventfd, 1);
- schedule_work(&event->remove);
- }
- spin_unlock(&cgrp->event_list_lock);
+ if (!ret)
+ RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
- mutex_unlock(&cgroup_mutex);
- return 0;
+ cgroup_put(cgrp);
+ return ret;
}
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
- /* Create the top cgroup state for this subsystem */
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
- css = ss->create(ss, dummytop);
+ mutex_lock(&cgroup_mutex);
+
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
+
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
- init_cgroup_css(css, ss, dummytop);
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
- * init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -4117,322 +4723,152 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
- ss->active = 1;
+ BUG_ON(online_css(css));
- /* this function shouldn't be used with modular subsystems, since they
- * need to register a subsys_id, among other things */
- BUG_ON(ss->module);
+ mutex_unlock(&cgroup_mutex);
}
/**
- * cgroup_load_subsys: load and register a modular subsystem at runtime
- * @ss: the subsystem to load
+ * cgroup_init_early - cgroup initialization at system boot
*
- * This function should be called in a modular subsystem's initcall. If the
- * subsystem is built as a module, it will be assigned a new subsys_id and set
- * up for use. If the subsystem is built-in anyway, work is delegated to the
- * simpler cgroup_init_subsys.
+ * Initialize cgroups at system boot, and initialize any
+ * subsystems that request early init.
*/
-int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+int __init cgroup_init_early(void)
{
+ static struct cgroup_sb_opts __initdata opts =
+ { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ struct cgroup_subsys *ss;
int i;
- struct cgroup_subsys_state *css;
-
- /* check name and function validity */
- if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->create == NULL || ss->destroy == NULL)
- return -EINVAL;
- /*
- * we don't support callbacks in modular subsystems. this check is
- * before the ss->module check for consistency; a subsystem that could
- * be a module should still have no callbacks even if the user isn't
- * compiling it as one.
- */
- if (ss->fork || ss->exit)
- return -EINVAL;
+ init_cgroup_root(&cgrp_dfl_root, &opts);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
- /*
- * an optionally modular subsystem is built-in: we want to do nothing,
- * since cgroup_init_subsys will have already taken care of it.
- */
- if (ss->module == NULL) {
- /* a few sanity checks */
- BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
- BUG_ON(subsys[ss->subsys_id] != ss);
- return 0;
- }
+ RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
- /*
- * need to register a subsys id before anything else - for example,
- * init_cgroup_css needs it.
- */
- mutex_lock(&cgroup_mutex);
- /* find the first empty slot in the array */
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- if (subsys[i] == NULL)
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT) {
- /* maximum number of subsystems already registered! */
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- /* assign ourselves the subsys_id */
- ss->subsys_id = i;
- subsys[i] = ss;
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
- /*
- * no ss->create seems to need anything important in the ss struct, so
- * this can happen first (i.e. before the rootnode attachment).
- */
- css = ss->create(ss, dummytop);
- if (IS_ERR(css)) {
- /* failure case - need to deassign the subsys[] slot. */
- subsys[i] = NULL;
- mutex_unlock(&cgroup_mutex);
- return PTR_ERR(css);
- }
-
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
-
- /* our new subsystem will be attached to the dummy hierarchy. */
- init_cgroup_css(css, ss, dummytop);
- /* init_idr must be after init_cgroup_css because it sets css->id. */
- if (ss->use_id) {
- int ret = cgroup_init_idr(ss, css);
- if (ret) {
- dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(ss, dummytop);
- subsys[i] = NULL;
- mutex_unlock(&cgroup_mutex);
- return ret;
- }
- }
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
- /*
- * Now we need to entangle the css into the existing css_sets. unlike
- * in cgroup_init_subsys, there are now multiple css_sets, so each one
- * will need a new pointer to it; done by iterating the css_set_table.
- * furthermore, modifying the existing css_sets will corrupt the hash
- * table state, so each changed css_set will need its hash recomputed.
- * this is all done under the css_set_lock.
- */
- write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct css_set *cg;
- struct hlist_node *node, *tmp;
- struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
- hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
- /* skip entries that we already rehashed */
- if (cg->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hlist_del(&cg->hlist);
- /* set new value */
- cg->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- new_bucket = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, new_bucket);
- }
+ if (ss->early_init)
+ cgroup_init_subsys(ss, true);
}
- write_unlock(&css_set_lock);
-
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
- ss->active = 1;
-
- /* success! */
- mutex_unlock(&cgroup_mutex);
return 0;
}
-EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
- * cgroup_unload_subsys: unload a modular subsystem
- * @ss: the subsystem to unload
+ * cgroup_init - cgroup initialization
*
- * This function should be called in a modular subsystem's exitcall. When this
- * function is invoked, the refcount on the subsystem's module will be 0, so
- * the subsystem will not be attached to any hierarchy.
+ * Register cgroup filesystem and /proc file, and initialize
+ * any subsystems that didn't request early init.
*/
-void cgroup_unload_subsys(struct cgroup_subsys *ss)
+int __init cgroup_init(void)
{
- struct cg_cgroup_link *link;
- struct hlist_head *hhead;
-
- BUG_ON(ss->module == NULL);
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid, err;
- /*
- * we shouldn't be called if the subsystem is in use, and the use of
- * try_module_get in parse_cgroupfs_options should ensure that it
- * doesn't start being used while we're killing it off.
- */
- BUG_ON(ss->root != &rootnode);
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
mutex_lock(&cgroup_mutex);
- /* deassign the subsys_id */
- BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
- subsys[ss->subsys_id] = NULL;
-
- /* remove subsystem from rootnode's list of subsystems */
- list_del_init(&ss->sibling);
- /*
- * disentangle the css from all css_sets attached to the dummytop. as
- * in loading, we need to pay our respects to the hashtable gods.
- */
- write_lock(&css_set_lock);
- list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
- struct css_set *cg = link->cg;
-
- hlist_del(&cg->hlist);
- BUG_ON(!cg->subsys[ss->subsys_id]);
- cg->subsys[ss->subsys_id] = NULL;
- hhead = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, hhead);
- }
- write_unlock(&css_set_lock);
+ /* Add init_css_set to the hash table */
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
- /*
- * remove subsystem's css from the dummytop and free it - need to free
- * before marking as null because ss->destroy needs the cgrp->subsys
- * pointer to find their state. note that this also takes care of
- * freeing the css_id.
- */
- ss->destroy(ss, dummytop);
- dummytop->subsys[ss->subsys_id] = NULL;
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
-
-/**
- * cgroup_init_early - cgroup initialization at system boot
- *
- * Initialize cgroups at system boot, and initialize any
- * subsystems that request early init.
- */
-int __init cgroup_init_early(void)
-{
- int i;
- atomic_set(&init_css_set.refcount, 1);
- INIT_LIST_HEAD(&init_css_set.cg_links);
- INIT_LIST_HEAD(&init_css_set.tasks);
- INIT_HLIST_NODE(&init_css_set.hlist);
- css_set_count = 1;
- init_cgroup_root(&rootnode);
- root_count = 1;
- init_task.cgroups = &init_css_set;
-
- init_css_set_link.cg = &init_css_set;
- init_css_set_link.cgrp = dummytop;
- list_add(&init_css_set_link.cgrp_link_list,
- &rootnode.top_cgroup.css_sets);
- list_add(&init_css_set_link.cg_link_list,
- &init_css_set.cg_links);
-
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
- /* at bootup time, we don't worry about modular subsystems */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- BUG_ON(!ss->name);
- BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
- if (ss->subsys_id != i) {
- printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
- ss->name, ss->subsys_id);
- BUG();
- }
- if (ss->early_init)
- cgroup_init_subsys(ss);
- }
- return 0;
-}
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
-/**
- * cgroup_init - cgroup initialization
- *
- * Register cgroup filesystem and /proc file, and initialize
- * any subsystems that didn't request early init.
- */
-int __init cgroup_init(void)
-{
- int err;
- int i;
- struct hlist_head *hhead;
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
- err = bdi_init(&cgroup_backing_dev_info);
- if (err)
- return err;
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
- /* at bootup time, we don't worry about modular subsystems */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (!ss->early_init)
- cgroup_init_subsys(ss);
- if (ss->use_id)
- cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (!ss->disabled) {
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ }
}
- /* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
- BUG_ON(!init_root_id(&rootnode));
-
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
- if (!cgroup_kobj) {
- err = -ENOMEM;
- goto out;
- }
+ if (!cgroup_kobj)
+ return -ENOMEM;
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
- goto out;
+ return err;
}
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+ return 0;
+}
-out:
- if (err)
- bdi_destroy(&cgroup_backing_dev_info);
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
- return err;
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+
+ return 0;
}
+core_initcall(cgroup_wq_init);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- * doesn't really matter if tsk->cgroup changes after we read it,
- * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- * anyway. No need to check that tsk->cgroup != NULL, thanks to
- * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- * cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
-static int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -4445,28 +4881,36 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- for_each_active_root(root) {
+ for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int count = 0;
+ int ssid, count = 0;
+
+ if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(root, ss)
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
out_unlock:
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
@@ -4475,22 +4919,10 @@ out:
return retval;
}
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-const struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
+ struct cgroup_subsys *ss;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4500,14 +4932,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
* subsys/hierarchy state.
*/
mutex_lock(&cgroup_mutex);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
+
+ for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
- }
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -4525,99 +4955,83 @@ static const struct file_operations proc_cgroupstats_operations = {
};
/**
- * cgroup_fork - attach newly forked task to its parents cgroup.
+ * cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
*
- * Description: A task inherits its parent's cgroup at fork().
- *
- * A pointer to the shared css_set was automatically copied in
- * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU, cgroup_mutex or
- * threadgroup_change_begin(), so it might no longer be a valid
- * cgroup pointer. cgroup_attach_task() might have already changed
- * current->cgroups, allowing the previously referenced cgroup
- * group to be removed and freed.
- *
- * Outside the pointer validity we also need to process the css_set
- * inheritance between threadgoup_change_begin() and
- * threadgoup_change_end(), this way there is no leak in any process
- * wide migration performed by cgroup_attach_proc() that could otherwise
- * miss a thread because it is too early or too late in the fork stage.
- *
- * At the point that cgroup_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
- /*
- * We don't need to task_lock() current because current->cgroups
- * can't be changed concurrently here. The parent obviously hasn't
- * exited and called cgroup_exit(), and we are synchronized against
- * cgroup migration through threadgroup_change_begin().
- */
- child->cgroups = current->cgroups;
- get_css_set(child->cgroups);
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- /*
- * forkexit callbacks are only supported for builtin
- * subsystems, and the builtin section of the subsys array is
- * immutable, so we don't need to lock the subsys array here.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(ss, child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ struct cgroup_subsys *ss;
+ int i;
+
+ /*
+ * This may race against cgroup_enable_task_cg_links(). As that
+ * function sets use_task_css_set_links before grabbing
+ * tasklist_lock and we just went through tasklist_lock to add
+ * @child, it's guaranteed that either we see the set
+ * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+ * @child during its iteration.
+ *
+ * If we won the race, @child is associated with %current's
+ * css_set. Grabbing css_set_rwsem guarantees both that the
+ * association is stable, and, on completion of the parent's
+ * migration, @child is visible in the source of migration or
+ * already in the destination cgroup. This guarantee is necessary
+ * when implementing operations which need to migrate all tasks of
+ * a cgroup to another.
+ *
+ * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * will remain in init_css_set. This is safe because all tasks are
+ * in the init_css_set before cg_links is enabled and there's no
+ * operation which transfers all tasks out of init_css_set.
+ */
if (use_task_css_set_links) {
- write_lock(&css_set_lock);
+ struct css_set *cset;
+
+ down_write(&css_set_rwsem);
+ cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
- /*
- * It's safe to use child->cgroups without task_lock()
- * here because we are protected through
- * threadgroup_change_begin() against concurrent
- * css_set change in cgroup_task_migrate(). Also
- * the task can't exit at that point until
- * wake_up_new_task() is called, so we are protected
- * against cgroup_exit() setting child->cgroup to
- * init_css_set.
- */
- list_add(&child->cg_list, &child->cgroups->tasks);
+ rcu_assign_pointer(child->cgroups, cset);
+ list_add(&child->cg_list, &cset->tasks);
+ get_css_set(cset);
}
- write_unlock(&css_set_lock);
+ up_write(&css_set_rwsem);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for_each_subsys(ss, i)
+ if (ss->fork)
+ ss->fork(child);
}
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -4627,111 +5041,64 @@ void cgroup_post_fork(struct task_struct *child)
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
- * the_top_cgroup_hack:
- *
- * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- * We call cgroup_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to the
- * root cgroup in each hierarchy for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cgroup, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cgroup function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cgroup reference count, to no avail.
- *
- * Normally, holding a reference to a cgroup without bumping its
- * count is unsafe. The cgroup could go away, or someone could
- * attach us to a different cgroup, decrementing the count on
- * the first cgroup that we never incremented. But in this case,
- * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any cgroup_attach_task() attempts, or task is a failed
- * fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
- struct css_set *cg;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ bool put_cset = false;
int i;
/*
- * Unlink from the css_set task list if necessary.
- * Optimistically check cg_list before taking
- * css_set_lock
+ * Unlink from @tsk from its css_set. As migration path can't race
+ * with us, we can check cg_list without grabbing css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_del_init(&tsk->cg_list);
- write_unlock(&css_set_lock);
+ down_write(&css_set_rwsem);
+ list_del_init(&tsk->cg_list);
+ up_write(&css_set_rwsem);
+ put_cset = true;
}
/* Reassign the task to the init_css_set. */
- task_lock(tsk);
- cg = tsk->cgroups;
- tsk->cgroups = &init_css_set;
+ cset = task_css_set(tsk);
+ RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
+ if (need_forkexit_callback) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
if (ss->exit) {
- struct cgroup *old_cgrp =
- rcu_dereference_raw(cg->subsys[i])->cgroup;
- struct cgroup *cgrp = task_cgroup(tsk, i);
- ss->exit(ss, cgrp, old_cgrp, tsk);
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
+
+ ss->exit(css, old_css, tsk);
}
}
}
- task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
- int ret;
- struct cgroup *target;
-
- if (cgrp == dummytop)
- return 1;
-
- target = task_cgroup_from_root(task, cgrp->root);
- while (cgrp != target && cgrp!= cgrp->top_cgroup)
- cgrp = cgrp->parent;
- ret = (cgrp == target);
- return ret;
+ if (put_cset)
+ put_css_set(cset, true);
}
static void check_for_release(struct cgroup *cgrp)
{
- /* All of these checks rely on RCU to keep the cgroup
- * structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
- /* Control Group is currently removeable. If it's not
+ if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
+ !css_has_online_children(&cgrp->self)) {
+ /*
+ * Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
- * it now */
+ * it now
+ */
int need_schedule_work = 0;
+
raw_spin_lock(&release_list_lock);
- if (!cgroup_is_removed(cgrp) &&
+ if (!cgroup_is_dead(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
@@ -4742,25 +5109,6 @@ static void check_for_release(struct cgroup *cgrp)
}
}
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
-{
- struct cgroup *cgrp = css->cgroup;
- int val;
- rcu_read_lock();
- val = atomic_sub_return(count, &css->refcnt);
- if (val == 1) {
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- cgroup_wakeup_rmdir_waiter(cgrp);
- }
- rcu_read_unlock();
- WARN_ON_ONCE(val < 1);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
@@ -4792,16 +5140,17 @@ static void cgroup_release_agent(struct work_struct *work)
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
@@ -4809,7 +5158,7 @@ static void cgroup_release_agent(struct work_struct *work)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
@@ -4835,19 +5184,15 @@ static void cgroup_release_agent(struct work_struct *work)
static int __init cgroup_disable(char *str)
{
- int i;
+ struct cgroup_subsys *ss;
char *token;
+ int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
- /*
- * cgroup_disable, being at boot time, can't know about module
- * subsystems, so we don't worry about them.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
+ for_each_subsys(ss, i) {
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
@@ -4860,285 +5205,62 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-/*
- * Functons for CSS ID.
- */
-
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
-unsigned short css_id(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid;
-
- /*
- * This css_id() can return correct value when somone has refcnt
- * on this or this is under rcu_read_lock(). Once css->id is allocated,
- * it's unchanged until freed.
- */
- cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
-
- if (cssid)
- return cssid->id;
- return 0;
-}
-EXPORT_SYMBOL_GPL(css_id);
-
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid;
-
- cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
-
- if (cssid)
- return cssid->depth;
- return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
-
/**
- * css_is_ancestor - test "root" css is an ancestor of "child"
- * @child: the css to be tested.
- * @root: the css supporsed to be an ancestor of the child.
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
+ * @dentry: directory dentry of interest
+ * @ss: subsystem of interest
*
- * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
- * But, considering usual usage, the csses should be valid objects after test.
- * Assuming that the caller will do some action to the child if this returns
- * returns true, the caller must take "child";s reference count.
- * If "child" is valid object and this returns true, "root" is valid, too.
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
*/
-
-bool css_is_ancestor(struct cgroup_subsys_state *child,
- const struct cgroup_subsys_state *root)
-{
- struct css_id *child_id;
- struct css_id *root_id;
- bool ret = true;
-
- rcu_read_lock();
- child_id = rcu_dereference(child->id);
- root_id = rcu_dereference(root->id);
- if (!child_id
- || !root_id
- || (child_id->depth < root_id->depth)
- || (child_id->stack[root_id->depth] != root_id->id))
- ret = false;
- rcu_read_unlock();
- return ret;
-}
-
-void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
-{
- struct css_id *id = css->id;
- /* When this is called before css_id initialization, id can be NULL */
- if (!id)
- return;
-
- BUG_ON(!ss->use_id);
-
- rcu_assign_pointer(id->css, NULL);
- rcu_assign_pointer(css->id, NULL);
- write_lock(&ss->id_lock);
- idr_remove(&ss->idr, id->id);
- write_unlock(&ss->id_lock);
- kfree_rcu(id, rcu_head);
-}
-EXPORT_SYMBOL_GPL(free_css_id);
-
-/*
- * This is called by init or create(). Then, calls to this function are
- * always serialized (By cgroup_mutex() at create()).
- */
-
-static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
-{
- struct css_id *newid;
- int myid, error, size;
-
- BUG_ON(!ss->use_id);
-
- size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
- newid = kzalloc(size, GFP_KERNEL);
- if (!newid)
- return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
- write_lock(&ss->id_lock);
- /* Don't use 0. allocates an ID of 1-65535 */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
- write_unlock(&ss->id_lock);
-
- /* Returns error when there are no free spaces for new ID.*/
- if (error) {
- error = -ENOSPC;
- goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
-
- newid->id = myid;
- newid->depth = depth;
- return newid;
-remove_idr:
- error = -ENOSPC;
- write_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- write_unlock(&ss->id_lock);
-err_out:
- kfree(newid);
- return ERR_PTR(error);
-
-}
-
-static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *rootcss)
-{
- struct css_id *newid;
-
- rwlock_init(&ss->id_lock);
- idr_init(&ss->idr);
-
- newid = get_new_cssid(ss, 0);
- if (IS_ERR(newid))
- return PTR_ERR(newid);
-
- newid->stack[0] = newid->id;
- newid->css = rootcss;
- rootcss->id = newid;
- return 0;
-}
-
-static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
- struct cgroup *child)
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
{
- int subsys_id, i, depth = 0;
- struct cgroup_subsys_state *parent_css, *child_css;
- struct css_id *child_id, *parent_id;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
+ struct cgroup *cgrp;
- subsys_id = ss->subsys_id;
- parent_css = parent->subsys[subsys_id];
- child_css = child->subsys[subsys_id];
- parent_id = parent_css->id;
- depth = parent_id->depth + 1;
+ /* is @dentry a cgroup dir? */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return ERR_PTR(-EBADF);
- child_id = get_new_cssid(ss, depth);
- if (IS_ERR(child_id))
- return PTR_ERR(child_id);
+ rcu_read_lock();
- for (i = 0; i < depth; i++)
- child_id->stack[i] = parent_id->stack[i];
- child_id->stack[depth] = child_id->id;
/*
- * child_id->css pointer will be set after this cgroup is available
- * see cgroup_populate_dir()
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See cgroup_rmdir() for details.
*/
- rcu_assign_pointer(child_css->id, child_id);
-
- return 0;
-}
-
-/**
- * css_lookup - lookup css by id
- * @ss: cgroup subsys to be looked into.
- * @id: the id
- *
- * Returns pointer to cgroup_subsys_state if there is valid one with id.
- * NULL if not. Should be called under rcu_read_lock()
- */
-struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
-{
- struct css_id *cssid = NULL;
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
- BUG_ON(!ss->use_id);
- cssid = idr_find(&ss->idr, id);
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
- if (unlikely(!cssid))
- return NULL;
-
- return rcu_dereference(cssid->css);
+ rcu_read_unlock();
+ return css;
}
-EXPORT_SYMBOL_GPL(css_lookup);
/**
- * css_get_next - lookup next cgroup under specified hierarchy.
- * @ss: pointer to subsystem
- * @id: current position of iteration.
- * @root: pointer to css. search tree under this.
- * @foundid: position of found object.
+ * css_from_id - lookup css by id
+ * @id: the cgroup id
+ * @ss: cgroup subsys to be looked into
*
- * Search next css under the specified hierarchy of rootid. Calling under
- * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
- */
-struct cgroup_subsys_state *
-css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid)
-{
- struct cgroup_subsys_state *ret = NULL;
- struct css_id *tmp;
- int tmpid;
- int rootid = css_id(root);
- int depth = css_depth(root);
-
- if (!rootid)
- return NULL;
-
- BUG_ON(!ss->use_id);
- /* fill start point for scan */
- tmpid = id;
- while (1) {
- /*
- * scan next entry from bitmap(tree), tmpid is updated after
- * idr_get_next().
- */
- read_lock(&ss->id_lock);
- tmp = idr_get_next(&ss->idr, &tmpid);
- read_unlock(&ss->id_lock);
-
- if (!tmp)
- break;
- if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
- ret = rcu_dereference(tmp->css);
- if (ret) {
- *foundid = tmpid;
- break;
- }
- }
- /* continue to scan from next id */
- tmpid = tmpid + 1;
- }
- return ret;
-}
-
-/*
- * get corresponding css from file open on cgroupfs directory
+ * Returns the css if there's valid one with @id, otherwise returns NULL.
+ * Should be called under rcu_read_lock().
*/
-struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
- struct cgroup *cgrp;
- struct inode *inode;
- struct cgroup_subsys_state *css;
-
- inode = f->f_dentry->d_inode;
- /* check in cgroup filesystem dir */
- if (inode->i_op != &cgroup_dir_inode_operations)
- return ERR_PTR(-EBADF);
-
- if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
- return ERR_PTR(-EINVAL);
-
- /* get cgroup */
- cgrp = __d_cgrp(f->f_dentry);
- css = cgrp->subsys[id];
- return css ? css : ERR_PTR(-ENOENT);
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
}
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
@@ -5148,101 +5270,100 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
return css;
}
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+static void debug_css_free(struct cgroup_subsys_state *css)
{
- return atomic_read(&cont->count);
+ kfree(css);
}
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return cgroup_task_count(cont);
+ return cgroup_task_count(css->cgroup);
}
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
return (u64)(unsigned long)current->cgroups;
}
-static u64 current_css_set_refcount_read(struct cgroup *cont,
- struct cftype *cft)
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
u64 count;
rcu_read_lock();
- count = atomic_read(&current->cgroups->refcount);
+ count = atomic_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
-static int current_css_set_cg_links_read(struct cgroup *cont,
- struct cftype *cft,
- struct seq_file *seq)
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
- read_lock(&css_set_lock);
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ down_read(&css_set_rwsem);
rcu_read_lock();
- cg = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
- const char *name;
- if (c->dentry)
- name = c->dentry->d_name.name;
- else
- name = "?";
+ cgroup_name(c, name_buf, NAME_MAX + 1);
seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name);
+ c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
return 0;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cont,
- struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
- struct cg_cgroup_link *link;
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
- struct css_set *cg = link->cg;
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cg);
- list_for_each_entry(task, &cg->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
- seq_puts(seq, " ...\n");
- break;
- } else {
- seq_printf(seq, " task %d\n",
- task_pid_vnr(task));
- }
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
}
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return 0;
}
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+ return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
}
static struct cftype debug_files[] = {
{
- .name = "cgroup_refcount",
- .read_u64 = cgroup_refcount_read,
- },
- {
.name = "taskcount",
.read_u64 = debug_taskcount_read,
},
@@ -5259,31 +5380,25 @@ static struct cftype debug_files[] = {
{
.name = "current_css_set_cg_links",
- .read_seq_string = current_css_set_cg_links_read,
+ .seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
- .read_seq_string = cgroup_css_links_read,
+ .seq_show = cgroup_css_links_read,
},
{
.name = "releasable",
.read_u64 = releasable_read,
},
-};
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, debug_files,
- ARRAY_SIZE(debug_files));
-}
+ { } /* terminate */
+};
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
- .populate = debug_populate,
- .subsys_id = debug_subsys_id,
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a6..a79e40f9d70 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,115 +21,69 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
-enum freezer_state {
- CGROUP_THAWED = 0,
- CGROUP_FREEZING,
- CGROUP_FROZEN,
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};
struct freezer {
- struct cgroup_subsys_state css;
- enum freezer_state state;
- spinlock_t lock; /* protects _writes_ to state */
+ struct cgroup_subsys_state css;
+ unsigned int state;
};
-static inline struct freezer *cgroup_freezer(
- struct cgroup *cgroup)
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
- return container_of(
- cgroup_subsys_state(cgroup, freezer_subsys_id),
- struct freezer, css);
+ return css ? container_of(css, struct freezer, css) : NULL;
}
static inline struct freezer *task_freezer(struct task_struct *task)
{
- return container_of(task_subsys_state(task, freezer_subsys_id),
- struct freezer, css);
+ return css_freezer(task_css(task, freezer_cgrp_id));
+}
+
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ return css_freezer(freezer->css.parent);
}
bool cgroup_freezing(struct task_struct *task)
{
- enum freezer_state state;
bool ret;
rcu_read_lock();
- state = task_freezer(task)->state;
- ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
}
-/*
- * cgroups_write_string() limits the size of freezer state strings to
- * CGROUP_LOCAL_BUFFER_SIZE
- */
-static const char *freezer_state_strs[] = {
- "THAWED",
- "FREEZING",
- "FROZEN",
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
};
-/*
- * State diagram
- * Transitions are caused by userspace writes to the freezer.state file.
- * The values in parenthesis are state labels. The rest are edge labels.
- *
- * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
- * ^ ^ | |
- * | \_______THAWED_______/ |
- * \__________________________THAWED____________/
- */
-
-struct cgroup_subsys freezer_subsys;
-
-/* Locks taken and their ordering
- * ------------------------------
- * cgroup_mutex (AKA cgroup_lock)
- * freezer->lock
- * css_set_lock
- * task->alloc_lock (AKA task_lock)
- * task->sighand->siglock
- *
- * cgroup code forces css_set_lock to be taken before task->alloc_lock
- *
- * freezer_create(), freezer_destroy():
- * cgroup_mutex [ by cgroup core ]
- *
- * freezer_can_attach():
- * cgroup_mutex (held by caller of can_attach)
- *
- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * freezer->lock
- * sighand->siglock (if the cgroup is freezing)
- *
- * freezer_read():
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- *
- * freezer_write() (freeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock (fake signal delivery inside freeze_task())
- *
- * freezer_write() (unfreeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
- * sighand->siglock
- */
-static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct freezer *freezer;
@@ -137,248 +91,394 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
- freezer->state = CGROUP_THAWED;
return &freezer->css;
}
-static void freezer_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
{
- struct freezer *freezer = cgroup_freezer(cgroup);
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
- if (freezer->state != CGROUP_THAWED)
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
- kfree(freezer);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
}
-/* task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
+static void freezer_css_free(struct cgroup_subsys_state *css)
{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
+ kfree(css_freezer(css));
}
/*
- * The call to cgroup_lock() in the freezer.state write method prevents
- * a write to that file racing against an attach, and hence the
- * can_attach() result will remain valid until the attach completes.
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
*/
-static int freezer_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup,
- struct cgroup_taskset *tset)
+static void freezer_attach(struct cgroup_subsys_state *new_css,
+ struct cgroup_taskset *tset)
{
- struct freezer *freezer;
+ struct freezer *freezer = css_freezer(new_css);
struct task_struct *task;
+ bool clear_frozen = false;
+
+ mutex_lock(&freezer_mutex);
/*
- * Anything frozen can't move or be moved to/from.
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
- cgroup_taskset_for_each(task, new_cgroup, tset)
- if (cgroup_freezing(task))
- return -EBUSY;
+ cgroup_taskset_for_each(task, tset) {
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = true;
+ }
+ }
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state != CGROUP_THAWED)
- return -EBUSY;
+ /* propagate FROZEN clearing upwards */
+ while (clear_frozen && (freezer = parent_freezer(freezer))) {
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = freezer->state & CGROUP_FREEZING;
+ }
- return 0;
+ mutex_unlock(&freezer_mutex);
}
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
/*
- * No lock is needed, since the task isn't on tasklist yet,
- * so it can't be moved to another cgroup, which means the
- * freezer won't be removed and will be valid during this
- * function call. Nevertheless, apply RCU read-side critical
- * section to suppress RCU lockdep false positives.
- */
- rcu_read_lock();
- freezer = task_freezer(task);
- rcu_read_unlock();
-
- /*
- * The root cgroup is non-freezable, so we can skip the
- * following check.
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
*/
- if (!freezer->css.cgroup->parent)
+ if (task_css_is_root(task, freezer_cgrp_id))
return;
- spin_lock_irq(&freezer->lock);
- BUG_ON(freezer->state == CGROUP_FROZEN);
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
- /* Locking avoids race with FREEZING -> THAWED transitions. */
- if (freezer->state == CGROUP_FREEZING)
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
- spin_unlock_irq(&freezer->lock);
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-/*
- * caller must hold freezer->lock
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
*/
-static void update_if_frozen(struct cgroup *cgroup,
- struct freezer *freezer)
+static void update_if_frozen(struct cgroup_subsys_state *css)
{
- struct cgroup_iter it;
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int nfrozen = 0, ntotal = 0;
- enum freezer_state old_state = freezer->state;
-
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- ntotal++;
- if (freezing(task) && is_task_frozen_enough(task))
- nfrozen++;
+
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
}
+ rcu_read_unlock();
- if (old_state == CGROUP_THAWED) {
- BUG_ON(nfrozen > 0);
- } else if (old_state == CGROUP_FREEZING) {
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- } else { /* old_state == CGROUP_FROZEN */
- BUG_ON(nfrozen != ntotal);
+ /* are all tasks frozen? */
+ css_task_iter_start(css, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
}
- cgroup_iter_end(cgroup, &it);
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
}
-static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
+static int freezer_read(struct seq_file *m, void *v)
{
- struct freezer *freezer;
- enum freezer_state state;
-
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
-
- freezer = cgroup_freezer(cgroup);
- spin_lock_irq(&freezer->lock);
- state = freezer->state;
- if (state == CGROUP_FREEZING) {
- /* We change from FREEZING to FROZEN lazily if the cgroup was
- * only partially frozen when we exitted write. */
- update_if_frozen(cgroup, freezer);
- state = freezer->state;
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
}
- spin_unlock_irq(&freezer->lock);
- cgroup_unlock();
- seq_puts(m, freezer_state_strs[state]);
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
return 0;
}
-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int num_cant_freeze_now = 0;
-
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task))
- continue;
- if (is_task_frozen_enough(task))
- continue;
- if (!freezing(task) && !freezer_should_skip(task))
- num_cant_freeze_now++;
- }
- cgroup_iter_end(cgroup, &it);
- return num_cant_freeze_now ? -EBUSY : 0;
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
}
-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it)))
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
__thaw_task(task);
- cgroup_iter_end(cgroup, &it);
+ css_task_iter_end(&it);
}
-static int freezer_change_state(struct cgroup *cgroup,
- enum freezer_state goal_state)
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
{
- struct freezer *freezer;
- int retval = 0;
-
- freezer = cgroup_freezer(cgroup);
-
- spin_lock_irq(&freezer->lock);
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
- update_if_frozen(cgroup, freezer);
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
- switch (goal_state) {
- case CGROUP_THAWED:
- if (freezer->state != CGROUP_THAWED)
- atomic_dec(&system_freezing_cnt);
- freezer->state = CGROUP_THAWED;
- unfreeze_cgroup(cgroup, freezer);
- break;
- case CGROUP_FROZEN:
- if (freezer->state == CGROUP_THAWED)
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
atomic_inc(&system_freezing_cnt);
- freezer->state = CGROUP_FREEZING;
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- default:
- BUG();
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
}
+}
- spin_unlock_irq(&freezer->lock);
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup_subsys_state *pos;
+
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
- return retval;
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-static int freezer_write(struct cgroup *cgroup,
- struct cftype *cft,
- const char *buffer)
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval;
- enum freezer_state goal_state;
+ bool freeze;
+
+ buf = strstrip(buf);
- if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
- goal_state = CGROUP_THAWED;
- else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
- goal_state = CGROUP_FROZEN;
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
else
return -EINVAL;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
- retval = freezer_change_state(cgroup, goal_state);
- cgroup_unlock();
- return retval;
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
static struct cftype files[] = {
{
.name = "state",
- .read_seq_string = freezer_read,
- .write_string = freezer_write,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
},
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
};
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
- if (!cgroup->parent)
- return 0;
- return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys freezer_subsys = {
- .name = "freezer",
- .create = freezer_create,
- .destroy = freezer_destroy,
- .populate = freezer_populate,
- .subsys_id = freezer_subsys_id,
- .can_attach = freezer_can_attach,
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
.fork = freezer_fork,
+ .base_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index f346cedfe24..633394f442f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -30,29 +30,6 @@
#include <asm/uaccess.h>
-/*
- * Note that the native side is already converted to a timespec, because
- * that's what we want anyway.
- */
-static int compat_get_timeval(struct timespec *o,
- struct compat_timeval __user *i)
-{
- long usec;
-
- if (get_user(o->tv_sec, &i->tv_sec) ||
- get_user(usec, &i->tv_usec))
- return -EFAULT;
- o->tv_nsec = usec * 1000;
- return 0;
-}
-
-static int compat_put_timeval(struct compat_timeval __user *o,
- struct timeval *i)
-{
- return (put_user(i->tv_sec, &o->tv_sec) ||
- put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
-}
-
static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
memset(txc, 0, sizeof(struct timex));
@@ -111,13 +88,13 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
return 0;
}
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
- if (compat_put_timeval(tv, &ktv))
+ if (compat_put_timeval(&ktv, tv))
return -EFAULT;
}
if (tz) {
@@ -128,38 +105,113 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
return 0;
}
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
- struct timespec kts;
- struct timezone ktz;
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
if (tv) {
- if (compat_get_timeval(&kts, tv))
+ if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ __get_user(tv->tv_sec, &ctv->tv_sec) ||
+ __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ __put_user(tv->tv_sec, &ctv->tv_sec) ||
+ __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-EXPORT_SYMBOL_GPL(put_compat_timespec);
+
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_get_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_put_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
+int compat_convert_timespec(struct timespec __user **kts,
+ const void __user *cts)
+{
+ struct timespec ts;
+ struct timespec __user *uts;
+
+ if (!cts || COMPAT_USE_64BIT_TIME) {
+ *kts = (struct timespec __user *)cts;
+ return 0;
+ }
+
+ uts = compat_alloc_user_space(sizeof(ts));
+ if (!uts)
+ return -EFAULT;
+ if (compat_get_timespec(&ts, cts))
+ return -EFAULT;
+ if (copy_to_user(uts, &ts, sizeof(ts)))
+ return -EFAULT;
+
+ *kts = uts;
+ return 0;
+}
static long compat_nanosleep_restart(struct restart_block *restart)
{
@@ -177,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (ret) {
rmtp = restart->nanosleep.compat_rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
return ret;
}
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
mm_segment_t oldfs;
long ret;
- if (get_compat_timespec(&tu, rqtp))
+ if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
if (!timespec_valid(&tu))
@@ -211,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
@@ -238,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -250,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
@@ -276,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
}
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
if (tbuf) {
struct tms tms;
@@ -302,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
* types that can be passed to put_user()/get_user().
*/
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
{
old_sigset_t s;
long ret;
@@ -320,31 +372,60 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ memcpy(blocked->sig, &set, sizeof(set));
+}
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
#endif
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
@@ -362,15 +443,15 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
#ifdef COMPAT_RLIM_OLD_INFINITY
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, &r);
+ ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
set_fs(old_fs);
if (!ret) {
@@ -389,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
#endif
-asmlinkage long compat_sys_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
@@ -435,28 +516,11 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
-{
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
-}
-
-asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
- struct compat_rusage __user *ru)
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
@@ -483,9 +547,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
}
}
-asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
- struct compat_siginfo __user *uinfo, int options,
- struct compat_rusage __user *uru)
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
{
siginfo_t info;
struct rusage ru;
@@ -503,9 +568,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
return ret;
if (uru) {
- ret = put_compat_rusage(&ru, uru);
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -527,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
return compat_get_bitmap(k, user_mask_ptr, len * 8);
}
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
- unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+ unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
@@ -547,8 +616,8 @@ out:
return retval;
}
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
@@ -578,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
int get_compat_itimerspec(struct itimerspec *dst,
const struct compat_itimerspec __user *src)
{
- if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
- get_compat_timespec(&dst->it_value, &src->it_value))
+ if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
+ __compat_get_timespec(&dst->it_value, &src->it_value))
return -EFAULT;
return 0;
}
@@ -587,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst,
int put_compat_itimerspec(struct compat_itimerspec __user *dst,
const struct itimerspec *src)
{
- if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
- put_compat_timespec(&src->it_value, &dst->it_value))
+ if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
+ __compat_put_timespec(&src->it_value, &dst->it_value))
return -EFAULT;
return 0;
}
-long compat_sys_timer_create(clockid_t which_clock,
- struct compat_sigevent __user *timer_event_spec,
- timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+ struct compat_sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
{
struct sigevent __user *event = NULL;
@@ -611,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
return sys_timer_create(which_clock, event, created_timer_id);
}
-long compat_sys_timer_settime(timer_t timer_id, int flags,
- struct compat_itimerspec __user *new,
- struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ struct compat_itimerspec __user *, new,
+ struct compat_itimerspec __user *, old)
{
long err;
mm_segment_t oldfs;
@@ -634,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
return err;
}
-long compat_sys_timer_gettime(timer_t timer_id,
- struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct compat_itimerspec __user *, setting)
{
long err;
mm_segment_t oldfs;
@@ -651,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id,
return err;
}
-long compat_sys_clock_settime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
- if (get_compat_timespec(&ts, tp))
+ if (compat_get_timespec(&ts, tp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
@@ -668,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
return err;
}
-long compat_sys_clock_gettime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -680,13 +749,13 @@ long compat_sys_clock_gettime(clockid_t which_clock,
err = sys_clock_gettime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && put_compat_timespec(&ts, tp))
+ if (!err && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
-long compat_sys_clock_adjtime(clockid_t which_clock,
- struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+ struct compat_timex __user *, utp)
{
struct timex txc;
mm_segment_t oldfs;
@@ -708,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock,
return ret;
}
-long compat_sys_clock_getres(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -720,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock,
err = sys_clock_getres(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && tp && put_compat_timespec(&ts, tp))
+ if (!err && tp && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
@@ -730,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
long err;
mm_segment_t oldfs;
struct timespec tu;
- struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+ struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
restart->nanosleep.rmtp = (struct timespec __user *) &tu;
oldfs = get_fs();
@@ -739,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&tu, rmtp))
+ compat_put_timespec(&tu, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -749,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
return err;
}
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
- struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+ struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
long err;
mm_segment_t oldfs;
struct timespec in, out;
struct restart_block *restart;
- if (get_compat_timespec(&in, rqtp))
+ if (compat_get_timespec(&in, rqtp))
return -EFAULT;
oldfs = get_fs();
@@ -769,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&out, rmtp))
+ compat_put_timespec(&out, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -883,7 +952,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -894,10 +963,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
}
EXPORT_SYMBOL_GPL(sigset_from_compat);
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
@@ -913,7 +992,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
sigset_from_compat(&s, &s32);
if (uts) {
- if (get_compat_timespec(&t, uts))
+ if (compat_get_timespec(&t, uts))
return -EFAULT;
}
@@ -925,25 +1004,13 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
return ret;
-
-}
-
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
- struct compat_siginfo __user *uinfo)
-{
- siginfo_t info;
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
{
compat_time_t i;
struct timeval tv;
@@ -959,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
return i;
}
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
struct timespec tv;
int err;
@@ -979,32 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
- current->saved_sigmask = current->blocked;
- set_current_blocked(&newset);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
{
struct timex txc;
int err, ret;
@@ -1023,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
}
#ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
- compat_uptr_t __user *pages32,
- const int __user *nodes,
- int __user *status,
- int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
{
const void __user * __user *pages;
int i;
@@ -1043,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
- compat_ulong_t maxnode,
- const compat_ulong_t __user *old_nodes,
- const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
@@ -1077,69 +1119,20 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
{
- struct sysinfo s;
-
- do_sysinfo(&s);
-
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
-
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
-
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs();
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
+ set_fs(KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
+ set_fs(old_fs);
+ if (compat_put_timespec(&t, interval))
return -EFAULT;
-
- return 0;
+ return ret;
}
/*
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075ee..c18b1f1ae51 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- entry->size = kernel_config_data_size;
+ proc_set_size(entry, kernel_config_data_size);
return 0;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 00000000000..5664985c46a
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,216 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+}
+
+/**
+ * context_tracking_user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
+void context_tracking_user_enter(void)
+{
+ unsigned long flags;
+
+ /*
+ * Repeat the user_enter() check here because some archs may be calling
+ * this from asm and if no CPU needs context tracking, they shouldn't
+ * go further. Repeat the check here until they support the inline static
+ * key check.
+ */
+ if (!context_tracking_is_enabled())
+ return;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ /* Kernel threads aren't supposed to go to userspace */
+ WARN_ON_ONCE(!current->mm);
+
+ local_irq_save(flags);
+ if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ trace_user_enter(0);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
+ rcu_user_enter();
+ }
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ __this_cpu_write(context_tracking.state, IN_USER);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
+
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ /*
+ * Need to disable preemption in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ preempt_disable_notrace();
+ prev_ctx = exception_enter();
+ preempt_enable_no_resched_notrace();
+
+ preempt_schedule();
+
+ preempt_disable_notrace();
+ exception_exit(prev_ctx);
+ preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
+
+/**
+ * context_tracking_user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially include
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
+void context_tracking_user_exit(void)
+{
+ unsigned long flags;
+
+ if (!context_tracking_is_enabled())
+ return;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
+ rcu_user_exit();
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
+
+/**
+ * __context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
+void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ context_tracking_cpu_set(cpu);
+}
+#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e5702..a343bde710b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,30 +10,42 @@
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <trace/events/power.h>
+
+#include "smpboot.h"
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_done);
static RAW_NOTIFIER_HEAD(cpu_chain);
@@ -52,17 +64,30 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
};
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
+ cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
@@ -75,9 +100,14 @@ void put_online_cpus(void)
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
+
+ if (WARN_ON(!cpu_hotplug.refcount))
+ cpu_hotplug.refcount++; /* try to fix things up */
+
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -104,10 +134,11 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
* get_online_cpus() not an api which is called all that often.
*
*/
-static void cpu_hotplug_begin(void)
+void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
@@ -118,16 +149,35 @@ static void cpu_hotplug_begin(void)
}
}
-static void cpu_hotplug_done(void)
+void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
-#else /* #if CONFIG_HOTPLUG_CPU */
-static void cpu_hotplug_begin(void) {}
-static void cpu_hotplug_done(void) {}
-#endif /* #else #if CONFIG_HOTPLUG_CPU */
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -139,6 +189,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
int *nr_calls)
{
@@ -162,6 +217,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
@@ -171,16 +227,64 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+ struct task_struct *p;
+
+ /*
+ * This function is called after the cpu is taken down and marked
+ * offline, so its not like new tasks will ever get this cpu set in
+ * their mm mask. -- Peter Zijlstra
+ * Thus, we may use rcu_read_lock() here, instead of grabbing
+ * full-fledged tasklist_lock.
+ */
+ WARN_ON(cpu_online(cpu));
+ rcu_read_lock();
+ for_each_process(p) {
+ struct task_struct *t;
+
+ /*
+ * Main thread might exit, but other threads may still have
+ * a valid mm. Find one.
+ */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+}
+
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
+ task_cputime(p, &utime, &stime);
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (p->utime || p->stime))
- printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
- "(state = %ld, flags = %x)\n",
+ (utime || stime))
+ pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
p->state, p->flags);
}
@@ -204,6 +308,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
@@ -230,16 +336,37 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (err) {
nr_calls--;
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
- printk("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
goto out_release;
}
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so explicitly call both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+#ifdef CONFIG_PREEMPT
+ synchronize_sched();
+#endif
+ synchronize_rcu();
+
+ smpboot_park_threads(cpu);
+
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
+
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
+ smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -290,81 +417,75 @@ EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-
- if (cpu_online(cpu) || !cpu_present(cpu))
- return -EINVAL;
+ struct task_struct *idle;
cpu_hotplug_begin();
+
+ if (cpu_online(cpu) || !cpu_present(cpu)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
/* Arch-specific enabling code. */
- ret = __cpu_up(cpu);
+ ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
+
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
cpu_hotplug_done();
return ret;
}
-int __cpuinit cpu_up(unsigned int cpu)
+int cpu_up(unsigned int cpu)
{
int err = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG
- int nid;
- pg_data_t *pgdat;
-#endif
-
if (!cpu_possible(cpu)) {
- printk(KERN_ERR "can't online cpu %d because it is not "
- "configured as may-hotadd at boot time\n", cpu);
+ pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+ cpu);
#if defined(CONFIG_IA64)
- printk(KERN_ERR "please check additional_cpus= boot "
- "parameter\n");
+ pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
- nid = cpu_to_node(cpu);
- if (!node_online(nid)) {
- err = mem_online_node(nid);
- if (err)
- return err;
- }
-
- pgdat = NODE_DATA(nid);
- if (!pgdat) {
- printk(KERN_ERR
- "Can't online cpu %d due to NULL pgdat\n", cpu);
- return -ENOMEM;
- }
-
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
- mutex_unlock(&zonelists_mutex);
- }
-#endif
+ err = try_online_node(cpu_to_node(cpu));
+ if (err)
+ return err;
cpu_maps_update_begin();
@@ -384,14 +505,6 @@ EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-void __weak arch_disable_nonboot_cpus_begin(void)
-{
-}
-
-void __weak arch_disable_nonboot_cpus_end(void)
-{
-}
-
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
@@ -403,30 +516,28 @@ int disable_nonboot_cpus(void)
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
- arch_disable_nonboot_cpus_begin();
- printk("Disabling non-boot CPUs ...\n");
+ pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
- printk(KERN_ERR "Error taking CPU%d down: %d\n",
- cpu, error);
+ pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
- arch_disable_nonboot_cpus_end();
-
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
- printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+ pr_err("Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
return error;
@@ -450,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk(KERN_INFO "Enabling non-boot CPUs ...\n");
+ pr_info("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
- printk(KERN_INFO "CPU%d is up\n", cpu);
+ pr_info("CPU%d is up\n", cpu);
continue;
}
- printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+ pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
arch_enable_nonboot_cpus_end();
@@ -479,36 +592,6 @@ static int __init alloc_frozen_cpus(void)
core_initcall(alloc_frozen_cpus);
/*
- * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
- * hotplug when tasks are about to be frozen. Also, don't allow the freezer
- * to continue until any currently running CPU hotplug operation gets
- * completed.
- * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
- * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
- * CPU hotplug path and released only after it is complete. Thus, we
- * (and hence the freezer) will block here until any currently running CPU
- * hotplug operation gets completed.
- */
-void cpu_hotplug_disable_before_freeze(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
- cpu_maps_update_done();
-}
-
-
-/*
- * When tasks have been thawed, re-enable regular CPU hotplug (which had been
- * disabled while beginning to freeze tasks).
- */
-void cpu_hotplug_enable_after_thaw(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
- cpu_maps_update_done();
-}
-
-/*
* When callbacks for CPU hotplug notifications are being executed, we must
* ensure that the state of the system with respect to the tasks being frozen
* or not, as reported by the notification, remains unchanged *throughout the
@@ -527,12 +610,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
- cpu_hotplug_disable_before_freeze();
+ cpu_hotplug_disable();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
- cpu_hotplug_enable_after_thaw();
+ cpu_hotplug_enable();
break;
default:
@@ -545,6 +628,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
static int __init cpu_hotplug_pm_sync_init(void)
{
+ /*
+ * cpu_hotplug_pm_callback has higher priority than x86
+ * bsp_pm_callback which depends on cpu_hotplug_pm_callback
+ * to disable cpu hotplug to avoid cpu hotplug race.
+ */
pm_notifier(cpu_hotplug_pm_callback, 0);
return 0;
}
@@ -560,7 +648,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-void __cpuinit notify_cpu_starting(unsigned int cpu)
+void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
@@ -640,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 249152e1530..9656a3c3650 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
/**
- * cpm_pm_enter - CPU low power entry notifier
+ * cpu_pm_enter - CPU low power entry notifier
*
* Notifies listeners that a single CPU is entering a low power state that may
* cause some blocks in the same power domain as the cpu to reset.
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
* Must be called on the affected CPU with interrupts disabled. Platform is
* responsible for ensuring that cpu_pm_enter is not called twice on the same
* CPU before cpu_pm_exit is called. Notified drivers can include VFP
- * co-processor, interrupt controller and it's PM extensions, local CPU
+ * co-processor, interrupt controller and its PM extensions, local CPU
* timers context save/restore which shouldn't be interrupted. Hence it
* must be called with interrupts disabled.
*
@@ -115,13 +115,13 @@ int cpu_pm_enter(void)
EXPORT_SYMBOL_GPL(cpu_pm_enter);
/**
- * cpm_pm_exit - CPU low power exit notifier
+ * cpu_pm_exit - CPU low power exit notifier
*
* Notifies listeners that a single CPU is exiting a low power state that may
* have caused some blocks in the same power domain as the cpu to reset.
*
* Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
@@ -139,7 +139,7 @@ int cpu_pm_exit(void)
EXPORT_SYMBOL_GPL(cpu_pm_exit);
/**
- * cpm_cluster_pm_enter - CPU cluster low power entry notifier
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
*
* Notifies listeners that all cpus in a power domain are entering a low power
* state that may cause some blocks in the same power domain to reset.
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
* Must be called after cpu_pm_enter has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Must be called with interrupts disabled.
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void)
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
/**
- * cpm_cluster_pm_exit - CPU cluster low power exit notifier
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
*
* Notifies listeners that all cpus in a power domain are exiting form a
* low power state that may have caused some blocks in the same power domain
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
* Must be called after cpu_pm_exit has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b9a66..116a4164720 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,25 +59,9 @@
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/wait.h>
-/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
-
-/* Forward declare cgroup structures */
-struct cgroup_subsys cpuset_subsys;
-struct cpuset;
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -95,32 +79,47 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
-
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
};
-/* Retrieve the cpuset for a cgroup */
-static inline struct cpuset *cgroup_cs(struct cgroup *cont)
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
- struct cpuset, css);
+ return css ? container_of(css, struct cpuset, css) : NULL;
}
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
- return container_of(task_subsys_state(task, cpuset_subsys_id),
- struct cpuset, css);
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
}
#ifdef CONFIG_NUMA
@@ -138,6 +137,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -148,6 +148,11 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -184,27 +189,53 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -226,18 +257,16 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers. They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
+ * CPU / memory hotplug is handled asynchronously.
*/
-#define CPUSET_NAME_LEN (128)
-#define CPUSET_NODELIST_LEN (256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
* This is ugly, but preserves the userspace API for existing cpuset
@@ -268,59 +297,43 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. If we get
- * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_cpus(const struct cpuset *cs,
- struct cpumask *pmask)
+static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
- if (cs)
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
- else
- cpumask_copy(pmask, cpu_online_mask);
- BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
+ while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ cs = parent_cs(cs);
+ cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
}
/*
* Return in *pmask the portion of a cpusets's mems_allowed that
* are online, with memory. If none are online with memory, walk
* up the cpuset hierarchy until we find one that does have some
- * online mems. If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_HIGH_MEMORY].
+ * online mems. The top cpuset always has some mems online.
*
* One way or another, we guarantee to return some non-empty subset
- * of node_states[N_HIGH_MEMORY].
+ * of node_states[N_MEMORY].
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (cs && !nodes_intersects(cs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- cs = cs->parent;
- if (cs)
- nodes_and(*pmask, cs->mems_allowed,
- node_states[N_HIGH_MEMORY]);
- else
- *pmask = node_states[N_HIGH_MEMORY];
- BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+ while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
}
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -340,7 +353,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -355,7 +368,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
-static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
+static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
@@ -389,7 +402,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -402,52 +415,66 @@ static void free_trial_cpuset(struct cpuset *trial)
* Return 0 if valid, -errno if not.
*/
-static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
- struct cgroup *cont;
+ struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
}
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -468,31 +495,27 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs)
+ continue;
- if (cpumask_empty(cp->cpus_allowed))
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -501,7 +524,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* This function builds a partial partition of the systems CPUs
* A 'partial partition' is a set of non-overlapping subsets whose
* union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched.c
+ * The output of this function needs to be passed to kernel/sched/core.c
* partition_sched_domains() routine, which will rebuild the scheduler's
* load balancing domains (sched domains) as specified by that partial
* partition.
@@ -514,7 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -530,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* is a subset of one of these domains, while there are as
* many such domains as possible, each as small as possible.
* doms - Conversion of 'csa' to an array of cpumasks, for passing to
- * the kernel/sched.c routine partition_sched_domains() in a
+ * the kernel/sched/core.c routine partition_sched_domains() in a
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
@@ -552,7 +575,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -561,6 +583,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
doms = NULL;
dattr = NULL;
@@ -583,38 +606,34 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
}
- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
continue;
-
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -672,11 +691,8 @@ restart:
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
+ pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
+ nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
@@ -719,155 +735,163 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
+ /*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+ if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ goto out;
+
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
+out:
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
+#endif /* CONFIG_SMP */
-static int generate_sched_domains(cpumask_var_t **domains,
- struct sched_domain_attr **attributes)
+void rebuild_sched_domains(void)
{
- *domains = NULL;
- return 1;
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
-#endif /* CONFIG_SMP */
-
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
+ * @cs: the cpuset in interest
*
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
+ * A cpuset's effective cpumask is the cpumask of the nearest ancestor
+ * with non-empty cpus. We use effective cpumask whenever:
+ * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
+ * if the cpuset they reside in has no cpus)
+ * - we want to retrieve task_cs(tsk)'s cpus_allowed.
*
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
+ * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
+ * exception. See comments there.
*/
-static void async_rebuild_sched_domains(void)
+static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
+ while (cpumask_empty(cs->cpus_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
+ * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
+ * @cs: the cpuset in interest
*
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
-void rebuild_sched_domains(void)
-{
- do_rebuild_sched_domains(NULL);
-}
-
-/**
- * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Called for each task in a cgroup by cgroup_scan_tasks().
- * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
- * words, if its mask is not equal to its cpuset's mask).
+ * A cpuset's effective nodemask is the nodemask of the nearest ancestor
+ * with non-empty memss. We use effective nodemask whenever:
+ * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
+ * if the cpuset they reside in has no mems)
+ * - we want to retrieve task_cs(tsk)'s mems_allowed.
+ *
+ * Called with cpuset_mutex held.
*/
-static int cpuset_test_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
{
- return !cpumask_equal(&tsk->cpus_allowed,
- (cgroup_cs(scan->cg))->cpus_allowed);
+ while (nodes_empty(cs->mems_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/**
- * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner containing the cgroup of the task
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup whose
- * cpus_allowed mask needs to be changed.
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void cpuset_change_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static void update_tasks_cpumask(struct cpuset *cs)
{
- set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ css_task_iter_end(&it);
}
-/**
- * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
- * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
- *
- * Called with cgroup_mutex held
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
*
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Called with cpuset_mutex held
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
{
- struct cgroup_scanner scan;
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
- scan.cg = cs->css.cgroup;
- scan.test_task = cpuset_test_cpumask;
- scan.process_task = cpuset_change_cpumask;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ update_tasks_cpumask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
}
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
+ * @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- struct ptr_heap heap;
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -887,16 +911,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
return -EINVAL;
}
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
return retval;
is_load_balanced = is_sched_load_balance(trialcs);
@@ -905,16 +926,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- /*
- * Scan tasks in the cpuset, and update the cpumasks of any
- * that need an update.
- */
- update_tasks_cpumask(cs, &heap);
-
- heap_free(&heap);
+ update_tasks_cpumask_hier(cs, true);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -926,12 +941,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -942,12 +951,16 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
+ struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ rcu_read_unlock();
}
/*
@@ -964,7 +977,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
-repeat:
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
@@ -977,83 +989,30 @@ repeat:
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
- * get_mems_allowed(). If at least one node remains unchanged and
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
* tsk does not have a mempolicy, then an empty nodemask will not be
* possible when mems_allowed is larger than a word.
*/
need_loop = task_has_mempolicy(tsk) ||
!nodes_intersects(*newmems, tsk->mems_allowed);
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- /*
- * ensure checking ->mems_allowed_change_disable after setting all new
- * allowed nodes.
- *
- * the read-side task can see an nodemask with new allowed nodes and
- * old allowed nodes. and if it allocates page when cpuset clears newly
- * disallowed ones continuous, it can see the new allowed bits.
- *
- * And if setting all new allowed nodes is after the checking, setting
- * all new allowed nodes and clearing newly disallowed ones will be done
- * continuous, and the read-side task may find no node to alloc page.
- */
- smp_mb();
- /*
- * Allocation of memory is very fast, we needn't sleep when waiting
- * for the read-side.
- */
- while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
- task_unlock(tsk);
- if (!task_curr(tsk))
- yield();
- goto repeat;
+ if (need_loop) {
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
}
- /*
- * ensure checking ->mems_allowed_change_disable before clearing all new
- * disallowed nodes.
- *
- * if clearing newly disallowed bits before the checking, the read-side
- * task may find no node to alloc page.
- */
- smp_mb();
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
- task_unlock(tsk);
-}
-
-/*
- * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
- * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
- */
-static void cpuset_change_nodemask(struct task_struct *p,
- struct cgroup_scanner *scan)
-{
- struct mm_struct *mm;
- struct cpuset *cs;
- int migrate;
- const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
- cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, &newmems);
-
- cpuset_change_task_nodemask(p, &newmems);
-
- mm = get_task_mm(p);
- if (!mm)
- return;
-
- migrate = is_memory_migrate(cs);
+ if (need_loop) {
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
- mmput(mm);
+ task_unlock(tsk);
}
static void *cpuset_being_rebound;
@@ -1061,43 +1020,102 @@ static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @oldmem: old mems_allowed of cpuset cs
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
- struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- scan.cg = cs->css.cgroup;
- scan.test_task = NULL;
- scan.process_task = cpuset_change_nodemask;
- scan.heap = heap;
- scan.data = (nodemask_t *)oldmem;
+ guarantee_online_mems(mems_cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- cgroup_scan_tasks(&scan);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
cpuset_being_rebound = NULL;
}
/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!nodes_empty(cp->mems_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
* cpusets mems_allowed, and for each task in the cpuset,
@@ -1105,7 +1123,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1113,15 +1131,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
int retval;
- struct ptr_heap heap;
-
- if (!oldmem)
- return -ENOMEM;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+ * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
* it's read-only
*/
if (cs == &top_cpuset) {
@@ -1143,13 +1156,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_HIGH_MEMORY])) {
+ node_states[N_MEMORY])) {
retval = -EINVAL;
goto done;
}
}
- *oldmem = cs->mems_allowed;
- if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
@@ -1157,25 +1170,24 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval < 0)
- goto done;
-
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, oldmem, &heap);
-
- heap_free(&heap);
+ update_tasks_nodemask_hier(cs, true);
done:
- NODEMASK_FREE(oldmem);
return retval;
}
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1189,50 +1201,29 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
}
-/*
- * cpuset_change_flag - make a task's spread flags the same as its cpuset's
- * @tsk: task to be updated
- * @scan: struct cgroup_scanner containing the cgroup of the task
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
- */
-static void cpuset_change_flag(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
-}
-
-/*
+/**
* update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
- *
- * Called with cgroup_mutex held
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- *
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
*/
-static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_flags(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ struct css_task_iter it;
+ struct task_struct *task;
- scan.cg = cs->css.cgroup;
- scan.test_task = NULL;
- scan.process_task = cpuset_change_flag;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
}
/*
@@ -1241,7 +1232,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1250,7 +1241,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
- struct ptr_heap heap;
int err;
trialcs = alloc_trial_cpuset(cs);
@@ -1266,10 +1256,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
- err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (err < 0)
- goto out;
-
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
@@ -1281,11 +1267,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
- update_tasks_flags(cs, &heap);
- heap_free(&heap);
+ update_tasks_flags(cs);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1389,64 +1374,98 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
+static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
struct task_struct *task;
int ret;
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+
+ mutex_lock(&cpuset_mutex);
+
+ /*
+ * We allow to move tasks into an empty cpuset if sane_behavior
+ * flag is set.
+ */
+ ret = -ENOSPC;
+ if (!cgroup_sane_behavior(css->cgroup) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ goto out_unlock;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, tset) {
/*
- * Kthreads bound to specific cpus cannot be moved to a new
- * cpuset; we cannot change their cpu affinity and
- * isolating such threads by their set of allowed nodes is
- * unnecessary. Thus, cpusets are not applicable for such
- * threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before
- * cpus_allowed may be changed.
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
*/
- if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ ret = -EINVAL;
+ if (task->flags & PF_NO_SETAFFINITY)
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
- return 0;
+static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ css_cs(css)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
+static void cpuset_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
+ /* static buf protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
- struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
- struct cpuset *cs = cgroup_cs(cgrp);
- struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
- cgroup_taskset_for_each(task, cgrp, tset) {
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cpus_cs, cpus_attach);
+
+ guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+
+ cgroup_taskset_for_each(task, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -1461,16 +1480,34 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_from = oldcs->mems_allowed;
cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(leader);
if (mm) {
+ struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
+
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
- if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+
+ /*
+ * old_mems_allowed is the same with mems_allowed here, except
+ * if this task is being moved automatically due to hotplug.
+ * In that case @mems_allowed has been updated and is empty,
+ * so @old_mems_allowed is the right nodesets that we migrate
+ * mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
+ }
mmput(mm);
}
+
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1490,14 +1527,18 @@ typedef enum {
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
-static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = 0;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1531,18 +1572,21 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
-static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1552,30 +1596,57 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ buf = strstrip(buf);
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
+ */
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ flush_work(&cpuset_hotplug_work);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
- switch (cft->private) {
+ switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
@@ -1588,9 +1659,11 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
- return retval;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ return retval ?: nbytes;
}
/*
@@ -1600,72 +1673,46 @@ out:
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
- * A single large read to a buffer that crosses a page boundary is
- * ok, because the result being copied to user land is not recomputed
- * across a page fault.
*/
-
-static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
- size_t count;
-
- mutex_lock(&callback_mutex);
- count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
-
- return count;
-}
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ ssize_t count;
+ char *buf, *s;
+ int ret = 0;
-static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
-{
- size_t count;
+ count = seq_get_buf(sf, &buf);
+ s = buf;
mutex_lock(&callback_mutex);
- count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
- mutex_unlock(&callback_mutex);
-
- return count;
-}
-
-static ssize_t cpuset_common_file_read(struct cgroup *cont,
- struct cftype *cft,
- struct file *file,
- char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct cpuset *cs = cgroup_cs(cont);
- cpuset_filetype_t type = cft->private;
- char *page;
- ssize_t retval = 0;
- char *s;
-
- if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
- return -ENOMEM;
-
- s = page;
switch (type) {
case FILE_CPULIST:
- s += cpuset_sprintf_cpulist(s, cs);
+ s += cpulist_scnprintf(s, count, cs->cpus_allowed);
break;
case FILE_MEMLIST:
- s += cpuset_sprintf_memlist(s, cs);
+ s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
default:
- retval = -EINVAL;
- goto out;
+ ret = -EINVAL;
+ goto out_unlock;
}
- *s++ = '\n';
- retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
- free_page((unsigned long)page);
- return retval;
+ if (s < buf + count - 1) {
+ *s++ = '\n';
+ seq_commit(sf, s - buf);
+ } else {
+ seq_commit(sf, -1);
+ }
+out_unlock:
+ mutex_unlock(&callback_mutex);
+ return ret;
}
-static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1694,9 +1741,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
return 0;
}
-static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1717,16 +1764,16 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
static struct cftype files[] = {
{
.name = "cpus",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
@@ -1794,85 +1841,32 @@ static struct cftype files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
- .name = "memory_pressure_enabled",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
-};
-
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- int err;
-
- err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
- if (err)
- return err;
- /* memory_pressure_enabled is in root cpuset only */
- if (!cont->parent)
- err = cgroup_add_file(cont, ss,
- &cft_memory_pressure_enabled);
- return err;
-}
-
-/*
- * post_clone() is called during cgroup_create() when the
- * clone_children mount argument was specified. The cgroup
- * can not yet have any tasks.
- *
- * Currently we refuse to set up the cgroup - thereby
- * refusing the task to be entered, and as a result refusing
- * the sys_unshare() or clone() which initiated it - if any
- * sibling cpusets have exclusive cpus or mem.
- *
- * If this becomes a problem for some users who wish to
- * allow that scenario, then cpuset_post_clone() could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
- * held.
- */
-static void cpuset_post_clone(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct cgroup *parent, *child;
- struct cpuset *cs, *parent_cs;
- parent = cgroup->parent;
- list_for_each_entry(child, &parent->children, sibling) {
- cs = cgroup_cs(child);
- if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
- return;
- }
- cs = cgroup_cs(cgroup);
- parent_cs = cgroup_cs(parent);
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
- mutex_lock(&callback_mutex);
- cs->mems_allowed = parent_cs->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
- return;
-}
+ { } /* terminate */
+};
/*
- * cpuset_create - create a cpuset
- * ss: cpuset cgroup subsystem
- * cont: control group that the new cpuset will be part of
+ * cpuset_css_alloc - allocate a cpuset css
+ * cgrp: control group that the new cpuset will be part of
*/
-static struct cgroup_subsys_state *cpuset_create(
- struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
- struct cpuset *parent;
- if (!cont->parent) {
+ if (!parent_css)
return &top_cpuset.css;
- }
- parent = cgroup_cs(cont->parent);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1880,49 +1874,107 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
- cs->parent = parent;
- number_of_cpusets++;
- return &cs->css ;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ cpuset_inc();
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * histrical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ mutex_unlock(&callback_mutex);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
-static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- number_of_cpusets--;
+ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
-struct cgroup_subsys cpuset_subsys = {
- .name = "cpuset",
- .create = cpuset_create,
- .destroy = cpuset_destroy,
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .populate = cpuset_populate,
- .post_clone = cpuset_post_clone,
- .subsys_id = cpuset_subsys_id,
+ .base_cftypes = files,
.early_init = 1,
};
@@ -1953,226 +2005,230 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
- number_of_cpusets = 1;
return 0;
}
-/**
- * cpuset_do_move_task - move a given task to another cpuset
- * @tsk: pointer to task_struct the task to move
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- * Return nonzero to stop the walk through the tasks.
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
*/
-static void cpuset_do_move_task(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
- struct cgroup *new_cgroup = scan->data;
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
- cgroup_attach_task(new_cgroup, tsk);
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
}
/**
- * move_member_tasks_to_cpuset - move tasks from one cpuset to another
- * @from: cpuset in which the tasks currently reside
- * @to: cpuset to which the tasks will be moved
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset in interest
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems;
+ bool is_empty;
+ bool sane = cgroup_sane_behavior(cs->css.cgroup);
- scan.cg = from->css.cgroup;
- scan.test_task = NULL; /* select all tasks in cgroup */
- scan.process_task = cpuset_do_move_task;
- scan.heap = NULL;
- scan.data = to->css.cgroup;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- if (cgroup_scan_tasks(&scan))
- printk(KERN_ERR "move_member_tasks_to_cpuset: "
- "cgroup_scan_tasks failed\n");
-}
+ mutex_lock(&cpuset_mutex);
-/*
- * If CPU and/or memory hotplug handlers, below, unplug any CPUs
- * or memory nodes, we need to walk over the cpuset hierarchy,
- * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then move the tasks in the empty
- * cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
- */
-static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
-{
- struct cpuset *parent;
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
+ }
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
+ * If sane_behavior flag is set, we need to update tasks' cpumask
+ * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
+ * call update_tasks_cpumask() if the cpuset becomes empty, as
+ * the tasks in it will be migrated to an ancestor.
*/
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
+ if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+ (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+ update_tasks_cpumask(cs);
+
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
/*
- * Find its next-highest non-empty parent, (top cpuset
- * has online cpus, so can't be empty).
+ * If sane_behavior flag is set, we need to update tasks' nodemask
+ * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
+ * call update_tasks_nodemask() if the cpuset becomes empty, as
+ * the tasks in it will be migratd to an ancestor.
*/
- parent = cs->parent;
- while (cpumask_empty(parent->cpus_allowed) ||
- nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ if ((sane && nodes_empty(cs->mems_allowed)) ||
+ (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
- move_member_tasks_to_cpuset(cs, parent);
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+ *
+ * Otherwise move tasks to the nearest ancestor with execution
+ * resources. This is full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (!sane && is_empty)
+ remove_tasks_in_empty_cpuset(cs);
}
-/*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
*
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static void scan_for_empty_cpusets(struct cpuset *root)
-{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- list_add_tail((struct list_head *)&root->stack_list, &queue);
-
- while (!list_empty(&queue)) {
- cp = list_first_entry(&queue, struct cpuset, stack_list);
- list_del(queue.next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &queue);
- }
+static void cpuset_hotplug_workfn(struct work_struct *work)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
- /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
- nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
- continue;
+ mutex_lock(&cpuset_mutex);
+
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
- oldmems = cp->mems_allowed;
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
- /* Remove offline cpus and mems from this cpuset. */
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_HIGH_MEMORY]);
+ top_cpuset.mems_allowed = new_mems;
mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset);
+ }
+
+ mutex_unlock(&cpuset_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed) ||
- nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else {
- update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems, NULL);
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
}
+ rcu_read_unlock();
}
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
- scan_for_empty_cpusets(&top_cpuset);
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- oldmems = top_cpuset.mems_allowed;
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
- break;
- case MEM_OFFLINE:
- /*
- * needn't update top_cpuset.mems_allowed explicitly because
- * scan_for_empty_cpusets() will update it.
- */
- scan_for_empty_cpusets(&top_cpuset);
- break;
- default:
- break;
- }
- cgroup_unlock();
-
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+ top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
/**
@@ -2182,28 +2238,29 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
+ struct cpuset *cpus_cs;
+
mutex_lock(&callback_mutex);
- task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
- task_unlock(tsk);
+ rcu_read_lock();
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ guarantee_online_cpus(cpus_cs, pmask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
-int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- const struct cpuset *cs;
- int cpu;
+ struct cpuset *cpus_cs;
rcu_read_lock();
- cs = task_cs(tsk);
- if (cs)
- do_set_cpus_allowed(tsk, cs->cpus_allowed);
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2219,22 +2276,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
* set any mask even if it is not right from task_cs() pov,
* the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
*/
-
- cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
- if (cpu >= nr_cpu_ids) {
- /*
- * Either tsk->cpus_allowed is wrong (see above) or it
- * is actually empty. The latter case is only possible
- * if we are racing with remove_tasks_in_empty_cpuset().
- * Like above we can temporary set any mask and rely on
- * set_cpus_allowed_ptr() as synchronization point.
- */
- do_set_cpus_allowed(tsk, cpu_possible_mask);
- cpu = cpumask_any(cpu_active_mask);
- }
-
- return cpu;
}
void cpuset_init_current_mems_allowed(void)
@@ -2248,18 +2293,20 @@ void cpuset_init_current_mems_allowed(void)
*
* Description: Returns the nodemask_t mems_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * subset of node_states[N_MEMORY], even if this means going outside the
* tasks cpuset.
**/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
+ struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
- task_lock(tsk);
- guarantee_online_mems(task_cs(tsk), &mask);
- task_unlock(tsk);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &mask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
return mask;
@@ -2282,10 +2329,10 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
* callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
-static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2352,7 +2399,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
*/
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- const struct cpuset *cs; /* current cpuset ancestors */
+ struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2375,11 +2422,11 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
mutex_lock(&callback_mutex);
- task_lock(current);
+ rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
- task_unlock(current);
-
allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+
mutex_unlock(&callback_mutex);
return allowed;
}
@@ -2423,17 +2470,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
@@ -2508,26 +2544,33 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
+#define CPUSET_NODELIST_LEN (256)
+
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @task: pointer to task_struct of some task.
+ * @tsk: pointer to task_struct of some task.
*
* Description: Prints @task's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log. Must hold task_lock(task) to allow
- * dereferencing task_cs(task).
+ * mems_allowed to the kernel log.
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- struct dentry *dentry;
+ /* Statically allocated to prevent using excess stack. */
+ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+ static DEFINE_SPINLOCK(cpuset_buffer_lock);
+ struct cgroup *cgrp;
- dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+ rcu_read_lock();
+
+ cgrp = task_cs(tsk)->css.cgroup;
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
- printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cpuset_name, cpuset_nodelist);
+ pr_info("%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+
+ rcu_read_unlock();
spin_unlock(&cpuset_buffer_lock);
}
@@ -2559,9 +2602,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
void __cpuset_memory_pressure_bump(void)
{
- task_lock(current);
+ rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
- task_unlock(current);
+ rcu_read_unlock();
}
#ifdef CONFIG_PROC_PID_CPUSET
@@ -2571,19 +2614,19 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2593,44 +2636,32 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
- css = task_subsys_state(tsk, cpuset_subsys_id);
- retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
- if (retval < 0)
- goto out_unlock;
- seq_puts(m, buf);
+ retval = -ENAMETOOLONG;
+ rcu_read_lock();
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
+ rcu_read_unlock();
+ if (!p)
+ goto out_put_task;
+ seq_puts(m, p);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+ retval = 0;
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Mems_allowed:\t");
+ seq_puts(m, "Mems_allowed:\t");
seq_nodemask(m, &task->mems_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Mems_allowed_list:\t");
+ seq_puts(m, "\n");
+ seq_puts(m, "Mems_allowed_list:\t");
seq_nodemask_list(m, &task->mems_allowed);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a404..e0573a43c7d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#if 0
@@ -29,17 +30,6 @@
static struct kmem_cache *cred_jar;
/*
- * The common credentials for the initial task's thread group
- */
-#ifdef CONFIG_KEYS
-static struct thread_group_cred init_tgcred = {
- .usage = ATOMIC_INIT(2),
- .tgid = 0,
- .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
-};
-#endif
-
-/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
@@ -48,6 +38,14 @@ struct cred init_cred = {
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
@@ -56,9 +54,6 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
-#ifdef CONFIG_KEYS
- .tgcred = &init_tgcred,
-#endif
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -87,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
}
/*
- * Dispose of the shared task group credentials
- */
-#ifdef CONFIG_KEYS
-static void release_tgcred_rcu(struct rcu_head *rcu)
-{
- struct thread_group_cred *tgcred =
- container_of(rcu, struct thread_group_cred, rcu);
-
- BUG_ON(atomic_read(&tgcred->usage) != 0);
-
- key_put(tgcred->session_keyring);
- key_put(tgcred->process_keyring);
- kfree(tgcred);
-}
-#endif
-
-/*
- * Release a set of thread group credentials.
- */
-static void release_tgcred(struct cred *cred)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = cred->tgcred;
-
- if (atomic_dec_and_test(&tgcred->usage))
- call_rcu(&tgcred->rcu, release_tgcred_rcu);
-#endif
-}
-
-/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
@@ -141,12 +106,14 @@ static void put_cred_rcu(struct rcu_head *rcu)
#endif
security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
- release_tgcred(cred);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -197,13 +164,6 @@ void exit_creds(struct task_struct *tsk)
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
-
- cred = (struct cred *) tsk->replacement_session_keyring;
- if (cred) {
- tsk->replacement_session_keyring = NULL;
- validate_creds(cred);
- put_cred(cred);
- }
}
/**
@@ -243,15 +203,6 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
- if (!new->tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
- atomic_set(&new->tgcred->usage, 1);
-#endif
-
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
@@ -302,11 +253,13 @@ struct cred *prepare_creds(void)
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
+ get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
- atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
@@ -330,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
*/
struct cred *prepare_exec_creds(void)
{
- struct thread_group_cred *tgcred = NULL;
struct cred *new;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred)
- return NULL;
-#endif
-
new = prepare_creds();
- if (!new) {
- kfree(tgcred);
+ if (!new)
return new;
- }
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
- /* create a new per-thread-group creds for all this set of threads to
- * share */
- memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
-
/* inherit the session keyring; new process keyring */
- key_get(tgcred->session_keyring);
- tgcred->process_keyring = NULL;
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
#endif
return new;
@@ -379,9 +313,6 @@ struct cred *prepare_exec_creds(void)
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
struct cred *new;
int ret;
@@ -411,11 +342,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
goto error_put;
}
- /* cache user_ns in cred. Doesn't need a refcount because it will
- * stay pinned by cred->user
- */
- new->user_ns = new->user->user_ns;
-
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
@@ -426,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
install_thread_keyring_to_cred(new);
}
- /* we share the process and session keyrings between all the threads in
- * a process - this is slightly icky as we violate COW credentials a
- * bit */
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
if (!(clone_flags & CLONE_THREAD)) {
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- ret = -ENOMEM;
- goto error_put;
- }
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = key_get(new->tgcred->session_keyring);
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
}
#endif
@@ -456,6 +372,31 @@ error_put:
return ret;
}
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -490,11 +431,11 @@ int commit_creds(struct cred *new)
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (old->euid != new->euid ||
- old->egid != new->egid ||
- old->fsuid != new->fsuid ||
- old->fsgid != new->fsgid ||
- !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
@@ -502,9 +443,9 @@ int commit_creds(struct cred *new)
}
/* alter the thread keyring */
- if (new->fsuid != old->fsuid)
+ if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(task);
- if (new->fsgid != old->fsgid)
+ if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(task);
/* do it
@@ -521,16 +462,16 @@ int commit_creds(struct cred *new)
alter_cred_subscribers(old, -2);
/* send notifications */
- if (new->uid != old->uid ||
- new->euid != old->euid ||
- new->suid != old->suid ||
- new->fsuid != old->fsuid)
+ if (!uid_eq(new->uid, old->uid) ||
+ !uid_eq(new->euid, old->euid) ||
+ !uid_eq(new->suid, old->suid) ||
+ !uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
- if (new->gid != old->gid ||
- new->egid != old->egid ||
- new->sgid != old->sgid ||
- new->fsgid != old->fsgid)
+ if (!gid_eq(new->gid, old->gid) ||
+ !gid_eq(new->egid, old->egid) ||
+ !gid_eq(new->sgid, old->sgid) ||
+ !gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
@@ -644,9 +585,6 @@ void __init cred_init(void)
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
const struct cred *old;
struct cred *new;
@@ -654,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
-#endif
-
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
@@ -675,16 +605,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
+ get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = NULL;
- new->tgcred = tgcred;
- new->request_key_auth = NULL;
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
@@ -799,9 +727,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- cred->uid, cred->euid, cred->suid, cred->fsuid);
+ from_kuid_munged(&init_user_ns, cred->uid),
+ from_kuid_munged(&init_user_ns, cred->euid),
+ from_kuid_munged(&init_user_ns, cred->suid),
+ from_kuid_munged(&init_user_ns, cred->fsuid));
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kgid_munged(&init_user_ns, cred->gid),
+ from_kgid_munged(&init_user_ns, cred->egid),
+ from_kgid_munged(&init_user_ns, cred->sgid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784ef..1adf62b39b9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
*/
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
+#include <linux/serial_core.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/console.h>
@@ -41,18 +42,19 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
+#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include "debug_core.h"
@@ -75,6 +77,8 @@ static int exception_level;
struct kgdb_io *dbg_io_ops;
static DEFINE_SPINLOCK(kgdb_registration_lock);
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
/* kgdb console driver is loaded */
static int kgdb_con_registered;
/* determine if kgdb console output should be used */
@@ -96,6 +100,7 @@ static int __init opt_kgdb_con(char *str)
early_param("kgdbcon", opt_kgdb_con);
module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
/*
* Holds information about breakpoints in a kernel. These breakpoints are
@@ -157,37 +162,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
* Weak aliases for breakpoint management,
* can be overriden by architectures when needed:
*/
-int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
if (err)
return err;
-
- return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
- BREAK_INSTR_SIZE);
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ return err;
}
-int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- return probe_kernel_write((char *)addr,
- (char *)bundle, BREAK_INSTR_SIZE);
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
int __weak kgdb_validate_break_address(unsigned long addr)
{
- char tmp_variable[BREAK_INSTR_SIZE];
+ struct kgdb_bkpt tmp;
int err;
- /* Validate setting the breakpoint and then removing it. In the
+ /* Validate setting the breakpoint and then removing it. If the
* remove fails, the kernel needs to emit a bad message because we
* are deep trouble not being able to put things back the way we
* found them.
*/
- err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+ tmp.bpt_addr = addr;
+ err = kgdb_arch_set_breakpoint(&tmp);
if (err)
return err;
- err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+ err = kgdb_arch_remove_breakpoint(&tmp);
if (err)
printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
"memory destroyed at: %lx", addr);
@@ -218,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm && current->mm->mmap_cache) {
- flush_cache_range(current->mm->mmap_cache,
- addr, addr + BREAK_INSTR_SIZE);
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
}
+
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
@@ -231,7 +245,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
*/
int dbg_activate_sw_breakpoints(void)
{
- unsigned long addr;
int error;
int ret = 0;
int i;
@@ -240,16 +253,15 @@ int dbg_activate_sw_breakpoints(void)
if (kgdb_break[i].state != BP_SET)
continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_set_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
if (error) {
ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+ printk(KERN_INFO "KGDB: BP install failed: %lx",
+ kgdb_break[i].bpt_addr);
continue;
}
- kgdb_flush_swbreak_addr(addr);
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
kgdb_break[i].state = BP_ACTIVE;
}
return ret;
@@ -298,7 +310,6 @@ int dbg_set_sw_break(unsigned long addr)
int dbg_deactivate_sw_breakpoints(void)
{
- unsigned long addr;
int error;
int ret = 0;
int i;
@@ -306,15 +317,14 @@ int dbg_deactivate_sw_breakpoints(void)
for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
if (kgdb_break[i].state != BP_ACTIVE)
continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+ printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
ret = error;
}
- kgdb_flush_swbreak_addr(addr);
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
kgdb_break[i].state = BP_SET;
}
return ret;
@@ -348,7 +358,6 @@ int kgdb_isremovedbreak(unsigned long addr)
int dbg_remove_all_break(void)
{
- unsigned long addr;
int error;
int i;
@@ -356,12 +365,10 @@ int dbg_remove_all_break(void)
for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
if (kgdb_break[i].state != BP_ACTIVE)
goto setundefined;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error)
printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
- addr);
+ kgdb_break[i].bpt_addr);
setundefined:
kgdb_break[i].state = BP_UNDEFINED;
}
@@ -527,7 +534,7 @@ return_normal:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
@@ -576,8 +583,12 @@ return_normal:
raw_spin_lock(&dbg_slave_lock);
#ifdef CONFIG_SMP
+ /* If send_ready set, slaves are already waiting */
+ if (ks->send_ready)
+ atomic_set(ks->send_ready, 1);
+
/* Signal the other CPUs to enter kgdb_wait() */
- if ((!kgdb_single_step) && kgdb_do_roundup)
+ else if ((!kgdb_single_step) && kgdb_do_roundup)
kgdb_roundup_cpus(flags);
#endif
@@ -651,7 +662,7 @@ kgdb_restore:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&masters_in_kgdb);
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
@@ -674,22 +685,46 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
+
+ memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
ks->signo = signo;
ks->err_code = ecode;
- ks->kgdb_usethreadid = 0;
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
- return 0; /* Ouch, double exception ! */
+ goto out; /* Ouch, double exception ! */
if (kgdb_info[ks->cpu].enter_kgdb != 0)
- return 0;
+ goto out;
+
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
+}
- return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
}
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
int kgdb_nmicallback(int cpu, void *regs)
{
#ifdef CONFIG_SMP
@@ -709,6 +744,31 @@ int kgdb_nmicallback(int cpu, void *regs)
return 1;
}
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
+ atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+ if (!kgdb_io_ready(0) || !send_ready)
+ return 1;
+
+ if (kgdb_info[cpu].enter_kgdb == 0) {
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->ex_vector = trapnr;
+ ks->signo = SIGTRAP;
+ ks->err_code = err_code;
+ ks->linux_regs = regs;
+ ks->send_ready = send_ready;
+ kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
static void kgdb_console_write(struct console *co, const char *s,
unsigned count)
{
@@ -752,7 +812,7 @@ static void sysrq_handle_dbg(int key)
static struct sysrq_key_op sysrq_dbg_op = {
.handler = sysrq_handle_dbg,
- .help_msg = "debug(G)",
+ .help_msg = "debug(g)",
.action_msg = "DEBUG",
};
#endif
@@ -784,6 +844,33 @@ void __init dbg_late_init(void)
kdb_init(KDB_INIT_FULL);
}
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+ /*
+ * Take the following action on reboot notify depending on value:
+ * 1 == Enter debugger
+ * 0 == [the default] detatch debug client
+ * -1 == Do nothing... and use this until the board resets
+ */
+ switch (kgdbreboot) {
+ case 1:
+ kgdb_breakpoint();
+ case -1:
+ goto done;
+ }
+ if (!dbg_kdb_mode)
+ gdbstub_exit(code);
+done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+ .notifier_call = dbg_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX,
+};
+
static void kgdb_register_callbacks(void)
{
if (!kgdb_io_module_registered) {
@@ -791,6 +878,8 @@ static void kgdb_register_callbacks(void)
kgdb_arch_init();
if (!dbg_is_early)
kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
+ register_reboot_notifier(&dbg_reboot_notifier);
atomic_notifier_chain_register(&panic_notifier_list,
&kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +901,8 @@ static void kgdb_unregister_callbacks(void)
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
+ unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
atomic_notifier_chain_unregister(&panic_notifier_list,
&kgdb_panic_event_nb);
kgdb_arch_exit();
@@ -952,7 +1043,7 @@ int dbg_io_get_char(void)
* otherwise as a quick means to stop program execution and "break" into
* the debugger.
*/
-void kgdb_breakpoint(void)
+noinline void kgdb_breakpoint(void)
{
atomic_inc(&kgdb_setting_breakpoint);
wmb(); /* Sync point before breakpoint */
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7..127d9bc49fb 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
+ atomic_t *send_ready;
};
/* Exception state values */
@@ -72,6 +73,8 @@ extern int dbg_kdb_mode;
#ifdef CONFIG_KGDB_KDB
extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad8..19d9a578c75 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/serial_core.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
len = len / 2;
remcom_out_buffer[len++] = 0;
+ kdb_common_init_state(ks);
kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
strcpy(remcom_out_buffer, "OK");
}
break;
@@ -1111,6 +1115,13 @@ void gdbstub_exit(int status)
unsigned char checksum, ch, buffer[3];
int loop;
+ if (!kgdb_connected)
+ return;
+ kgdb_connected = 0;
+
+ if (!dbg_io_ops || dbg_kdb_mode)
+ return;
+
buffer[0] = 'W';
buffer[1] = hex_asc_hi(status);
buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1140,6 @@ void gdbstub_exit(int status)
dbg_io_ops->write_char(hex_asc_lo(checksum));
/* make sure the output is flushed, lest the bootloader clobber it */
- dbg_io_ops->flush();
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459..70a504601dc 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!bp->bp_type) {
+ kdb_printf("Software breakpoints are unavailable.\n"
+ " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " OR use hw breaks: help bph\n");
+ }
+#endif
return 1;
}
return 0;
@@ -479,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
/*
* kdb_ss
*
- * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
- * commands.
+ * Process the 'ss' (Single Step) command.
*
* ss
- * ssb
*
* Parameters:
* argc Argument count
@@ -491,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
* Outputs:
* None.
* Returns:
- * KDB_CMD_SS[B] for success, a kdb error if failure.
+ * KDB_CMD_SS for success, a kdb error if failure.
* Locking:
* None.
* Remarks:
*
* Set the arch specific option to trigger a debug trap after the next
* instruction.
- *
- * For 'ssb', set the trace flag in the debug trap handler
- * after printing the current insn and return directly without
- * invoking the kdb command processor, until a branch instruction
- * is encountered.
*/
static int kdb_ss(int argc, const char **argv)
{
- int ssb = 0;
-
- ssb = (strcmp(argv[0], "ssb") == 0);
if (argc != 0)
return KDB_ARGCOUNT;
/*
* Set trace flag and go.
*/
KDB_STATE_SET(DOING_SS);
- if (ssb) {
- KDB_STATE_SET(DOING_SSB);
- return KDB_CMD_SSB;
- }
return KDB_CMD_SS;
}
@@ -554,8 +547,6 @@ void __init kdb_initbptab(void)
kdb_register_repeat("ss", kdb_ss, "",
"Single Step", 1, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("ssb", kdb_ss, "",
- "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41..fe15fff5df5 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,14 +15,13 @@
#include <linux/sched.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
-#include <asm/system.h>
#include "kdb_private.h"
static void kdb_show_stack(struct task_struct *p, void *addr)
{
int old_lvl = console_loglevel;
- console_loglevel = 15;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
kdb_set_current_task(p);
if (addr) {
@@ -130,6 +129,8 @@ kdb_bt(int argc, const char **argv)
}
/* Now the inactive tasks */
kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
if (task_curr(p))
continue;
if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff1..8859ca34dcf 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
#include <linux/kdb.h>
#include <linux/kdebug.h>
#include <linux/export.h>
+#include <linux/hardirq.h>
#include "kdb_private.h"
#include "../debug_core.h"
@@ -33,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
static struct kgdb_state *kdb_ks;
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
@@ -52,6 +69,12 @@ int kdb_stub(struct kgdb_state *ks)
if (atomic_read(&kgdb_setting_breakpoint))
reason = KDB_REASON_KEYBOARD;
+ if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+ reason = KDB_REASON_SYSTEM_NMI;
+
+ else if (in_nmi())
+ reason = KDB_REASON_NMI;
+
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
reason = KDB_REASON_BREAK;
@@ -90,13 +113,10 @@ int kdb_stub(struct kgdb_state *ks)
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
- kdb_initial_cpu = atomic_read(&kgdb_active);
- kdb_current_task = kgdb_info[ks->cpu].task;
- kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ kdb_common_init_state(ks);
/* Remove any breakpoints as needed by kdb and clear single step */
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
- KDB_STATE_CLEAR(DOING_SSB);
KDB_STATE_SET(PAGER);
/* zero out any offline cpu data */
for_each_present_cpu(i) {
@@ -121,9 +141,7 @@ int kdb_stub(struct kgdb_state *ks)
* Upon exit from the kdb main loop setup break points and restart
* the system based on the requested continue state
*/
- kdb_initial_cpu = -1;
- kdb_current_task = NULL;
- kdb_current_regs = NULL;
+ kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e..7c70812caea 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
{
int diag;
int linecount;
+ int colcount;
int logging, saved_loglevel = 0;
int saved_trap_printk;
int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
if (diag || linecount <= 1)
linecount = 24;
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
diag = kdbgetintenv("LOGGING", &logging);
if (diag)
logging = 0;
@@ -689,8 +694,8 @@ kdb_printit:
if (!dbg_kdb_mode && kgdb_connected) {
gdbstub_msg_write(kdb_buffer, retlen);
} else {
- if (!dbg_io_ops->is_console) {
- len = strlen(kdb_buffer);
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = retlen;
cp = kdb_buffer;
while (len--) {
dbg_io_ops->write_char(*cp);
@@ -705,19 +710,34 @@ kdb_printit:
}
if (logging) {
saved_loglevel = console_loglevel;
- console_loglevel = 0;
+ console_loglevel = CONSOLE_LOGLEVEL_SILENT;
printk(KERN_INFO "%s", kdb_buffer);
}
- if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
- kdb_nextline++;
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
/* check for having reached the LINES number of printed lines */
- if (kdb_nextline == linecount) {
+ if (kdb_nextline >= linecount) {
char buf1[16] = "";
-#if defined(CONFIG_SMP)
- char buf2[32];
-#endif
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
@@ -732,18 +752,10 @@ kdb_printit:
if (moreprompt == NULL)
moreprompt = "more> ";
-#if defined(CONFIG_SMP)
- if (strchr(moreprompt, '%')) {
- sprintf(buf2, moreprompt, get_cpu());
- put_cpu();
- moreprompt = buf2;
- }
-#endif
-
kdb_input_flush();
c = console_drivers;
- if (!dbg_io_ops->is_console) {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(moreprompt);
cp = moreprompt;
while (len--) {
@@ -776,7 +788,7 @@ kdb_printit:
kdb_grepping_flag = 0;
kdb_printf("\n");
} else if (buf1[0] == ' ') {
- kdb_printf("\n");
+ kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
} else if (buf1[0] == '\n') {
kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c..118527aa60e 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
static int kbd_exists;
+static int kbd_last_ret;
/*
* Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
return -1;
}
- if ((scancode & 0x80) != 0)
+ if ((scancode & 0x80) != 0) {
+ if (scancode == 0x9c)
+ kbd_last_ret = 0;
return -1;
+ }
scancode &= 0x7f;
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
return -1; /* ignore unprintables */
}
- if ((scancode & 0x7f) == 0x1c) {
- /*
- * enter key. All done. Absorb the release scancode.
- */
+ if (scancode == 0x1c) {
+ kbd_last_ret = 1;
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+ int scancode, scanstatus;
+
+ /*
+ * Nothing to clean up, since either
+ * ENTER was never pressed, or has already
+ * gotten cleaned up.
+ */
+ if (!kbd_last_ret)
+ return;
+
+ kbd_last_ret = 0;
+ /*
+ * Enter key. Need to absorb the break code here, lest it gets
+ * leaked out if we exit KDB as the result of processing 'g'.
+ *
+ * This has several interesting implications:
+ * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+ * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+ * only get a break code at the end of the repeated
+ * sequence. This means we can't propagate the repeated key
+ * press, and must swallow it away.
+ * + Need to handle possible PS/2 mouse input.
+ * + Need to handle mashed keys.
+ */
+
+ while (1) {
while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
- ;
+ cpu_relax();
/*
- * Fetch the scancode
+ * Fetch the scancode.
*/
scancode = inb(KBD_DATA_REG);
scanstatus = inb(KBD_STATUS_REG);
- while (scanstatus & KBD_STAT_MOUSE_OBF) {
- scancode = inb(KBD_DATA_REG);
- scanstatus = inb(KBD_STATUS_REG);
- }
+ /*
+ * Skip mouse input.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ continue;
- if (scancode != 0x9c) {
- /*
- * Wasn't an enter-release, why not?
- */
- kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
- scancode, scanstatus);
- }
+ /*
+ * If we see 0xe0, this is either a break code for KP
+ * ENTER, or a repeat make for KP ENTER. Either way,
+ * since the second byte is equivalent to an ENTER,
+ * skip the 0xe0 and try again.
+ *
+ * If we see 0x1c, this must be a repeat ENTER or KP
+ * ENTER (and we swallowed 0xe0 before). Try again.
+ *
+ * We can also see make and break codes for other keys
+ * mashed before or after pressing ENTER. Thus, if we
+ * see anything other than 0x9c, we have to try again.
+ *
+ * Note, if you held some key as ENTER was depressed,
+ * that break code would get leaked out.
+ */
+ if (scancode != 0x9c)
+ continue;
- return 13;
+ return;
}
-
- return keychar & 0xff;
}
-EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437..2f7c760305c 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,12 +14,14 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/smp.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -122,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
};
#undef KDBMSG
-static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
@@ -138,11 +140,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
static char *__env[] = {
#if defined(CONFIG_SMP)
"PROMPT=[%d]kdb> ",
- "MOREPROMPT=[%d]more> ",
#else
"PROMPT=kdb> ",
- "MOREPROMPT=more> ",
#endif
+ "MOREPROMPT=more> ",
"RADIX=16",
"MDCOUNT=8", /* lines of md output */
KDB_PLATFORM_ENV,
@@ -174,7 +175,7 @@ static char *__env[] = {
(char *)0,
};
-static const int __nenv = (sizeof(__env) / sizeof(char *));
+static const int __nenv = ARRAY_SIZE(__env);
struct task_struct *kdb_curr_task(int cpu)
{
@@ -680,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
}
if (argc != 3)
return KDB_ARGCOUNT;
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set) {
- kdb_printf("Could not allocate new defcmd_set entry for %s\n",
- argv[1]);
- defcmd_set = save_defcmd_set;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
memcpy(defcmd_set, save_defcmd_set,
defcmd_set_count * sizeof(*defcmd_set));
- kfree(save_defcmd_set);
s = defcmd_set + defcmd_set_count;
memset(s, 0, sizeof(*s));
s->usable = 1;
s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
if (s->usage[0] == '"') {
- strcpy(s->usage, s->usage+1);
+ strcpy(s->usage, argv[2]+1);
s->usage[strlen(s->usage)-1] = '\0';
}
if (s->help[0] == '"') {
- strcpy(s->help, s->help+1);
+ strcpy(s->help, argv[3]+1);
s->help[strlen(s->help)-1] = '\0';
}
++defcmd_set_count;
defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
}
/*
@@ -1074,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
static void kdb_dumpregs(struct pt_regs *regs)
{
int old_lvl = console_loglevel;
- console_loglevel = 15;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
show_regs(regs);
kdb_trap_printk--;
@@ -1111,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
* KDB_CMD_GO User typed 'go'.
* KDB_CMD_CPU User switched to another cpu.
* KDB_CMD_SS Single step.
- * KDB_CMD_SSB Single step until branch.
*/
static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_dbtrap_t db_result)
@@ -1150,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
instruction_pointer(regs));
break;
- case KDB_DB_SSB:
- /*
- * In the midst of ssb command. Just return.
- */
- KDB_DEBUG_STATE("kdb_local 3", reason);
- return KDB_CMD_SSB; /* Continue with SSB command */
-
- break;
case KDB_DB_SS:
break;
case KDB_DB_SSBPT:
@@ -1192,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
instruction_pointer(regs));
kdb_dumpregs(regs);
break;
+ case KDB_REASON_SYSTEM_NMI:
+ kdb_printf("due to System NonMaskable Interrupt\n");
+ break;
case KDB_REASON_NMI:
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
@@ -1235,18 +1246,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*cmdbuf = '\0';
*(cmd_hist[cmd_head]) = '\0';
- if (KDB_FLAG(ONLY_DO_DUMP)) {
- /* kdb is off but a catastrophic error requires a dump.
- * Take the dump and reboot.
- * Turn on logging so the kdb output appears in the log
- * buffer in the dump.
- */
- const char *setargs[] = { "set", "LOGGING", "1" };
- kdb_set(2, setargs);
- kdb_reboot(0, NULL);
- /*NOTREACHED*/
- }
-
do_full_getstr:
#if defined(CONFIG_SMP)
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
@@ -1292,7 +1291,6 @@ do_full_getstr:
if (diag == KDB_CMD_GO
|| diag == KDB_CMD_CPU
|| diag == KDB_CMD_SS
- || diag == KDB_CMD_SSB
|| diag == KDB_CMD_KGDB)
break;
@@ -1379,12 +1377,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
break;
}
- if (result == KDB_CMD_SSB) {
- KDB_STATE_SET(DOING_SS);
- KDB_STATE_SET(DOING_SSB);
- break;
- }
-
if (result == KDB_CMD_KGDB) {
if (!KDB_STATE(DOING_KGDB))
kdb_printf("Entering please attach debugger "
@@ -1400,6 +1392,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
if (KDB_STATE(DOING_SS))
KDB_STATE_CLEAR(SSBPT);
+ /* Clean up any keyboard devices before leaving */
+ kdb_kbd_cleanup_state();
+
return result;
}
@@ -1978,6 +1973,8 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("Module Size modstruct Used by\n");
list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
@@ -2037,8 +2034,15 @@ static int kdb_env(int argc, const char **argv)
*/
static int kdb_dmesg(int argc, const char **argv)
{
- char *syslog_data[4], *start, *end, c = '\0', *p;
- int diag, logging, logsize, lines = 0, adjust = 0, n;
+ int diag;
+ int logging;
+ int lines = 0;
+ int adjust = 0;
+ int n = 0;
+ int skip = 0;
+ struct kmsg_dumper dumper = { .active = 1 };
+ size_t len;
+ char buf[201];
if (argc > 2)
return KDB_ARGCOUNT;
@@ -2061,22 +2065,10 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
- * logical start, end+1. */
- kdb_syslog_data(syslog_data);
- if (syslog_data[2] == syslog_data[3])
- return 0;
- logsize = syslog_data[1] - syslog_data[0];
- start = syslog_data[2];
- end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
- for (n = 0, p = start; p < end; ++p) {
- c = *KDB_WRAP(p);
- if (c == '\n')
- ++n;
- }
- if (c != '\n')
- ++n;
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ n++;
+
if (lines < 0) {
if (adjust >= n)
kdb_printf("buffer only contains %d lines, nothing "
@@ -2084,21 +2076,11 @@ static int kdb_dmesg(int argc, const char **argv)
else if (adjust - lines >= n)
kdb_printf("buffer only contains %d lines, last %d "
"lines printed\n", n, n - adjust);
- if (adjust) {
- for (; start < end && adjust; ++start) {
- if (*KDB_WRAP(start) == '\n')
- --adjust;
- }
- if (start < end)
- ++start;
- }
- for (p = start; p < end && lines; ++p) {
- if (*KDB_WRAP(p) == '\n')
- ++lines;
- }
- end = p;
+ skip = adjust;
+ lines = abs(lines);
} else if (lines > 0) {
- int skip = n - (adjust + lines);
+ skip = n - lines - adjust;
+ lines = abs(lines);
if (adjust >= n) {
kdb_printf("buffer only contains %d lines, "
"nothing printed\n", n);
@@ -2109,39 +2091,56 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_printf("buffer only contains %d lines, first "
"%d lines printed\n", n, lines);
}
- for (; start < end && skip; ++start) {
- if (*KDB_WRAP(start) == '\n')
- --skip;
- }
- for (p = start; p < end && lines; ++p) {
- if (*KDB_WRAP(p) == '\n')
- --lines;
- }
- end = p;
+ } else {
+ lines = n;
}
- /* Do a line at a time (max 200 chars) to reduce protocol overhead */
- c = '\n';
- while (start != end) {
- char buf[201];
- p = buf;
+
+ if (skip >= n || skip < 0)
+ return 0;
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ if (skip) {
+ skip--;
+ continue;
+ }
+ if (!lines--)
+ break;
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- while (start < end && (c = *KDB_WRAP(start)) &&
- (p - buf) < sizeof(buf)-1) {
- ++start;
- *p++ = c;
- if (c == '\n')
- break;
- }
- *p = '\0';
- kdb_printf("%s", buf);
+
+ kdb_printf("%.*s\n", (int)len - 1, buf);
}
- if (c != '\n')
- kdb_printf("\n");
return 0;
}
#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
/*
* kdb_cpu - This function implements the 'cpu' command.
* cpu [<cpunum>]
@@ -2354,69 +2353,6 @@ static int kdb_pid(int argc, const char **argv)
return 0;
}
-/*
- * kdb_ll - This function implements the 'll' command which follows a
- * linked list and executes an arbitrary command for each
- * element.
- */
-static int kdb_ll(int argc, const char **argv)
-{
- int diag = 0;
- unsigned long addr;
- long offset = 0;
- unsigned long va;
- unsigned long linkoffset;
- int nextarg;
- const char *command;
-
- if (argc != 3)
- return KDB_ARGCOUNT;
-
- nextarg = 1;
- diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
- if (diag)
- return diag;
-
- diag = kdbgetularg(argv[2], &linkoffset);
- if (diag)
- return diag;
-
- /*
- * Using the starting address as
- * the first element in the list, and assuming that
- * the list ends with a null pointer.
- */
-
- va = addr;
- command = kdb_strdup(argv[3], GFP_KDB);
- if (!command) {
- kdb_printf("%s: cannot duplicate command\n", __func__);
- return 0;
- }
- /* Recursive use of kdb_parse, do not use argv after this point */
- argv = NULL;
-
- while (va) {
- char buf[80];
-
- if (KDB_FLAG(CMD_INTERRUPT))
- goto out;
-
- sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
- diag = kdb_parse(buf);
- if (diag)
- goto out;
-
- addr = va + linkoffset;
- if (kdb_getword(&va, addr, sizeof(va)))
- goto out;
- }
-
-out:
- kfree(command);
- return diag;
-}
-
static int kdb_kgdb(int argc, const char **argv)
{
return KDB_CMD_KGDB;
@@ -2434,11 +2370,15 @@ static int kdb_help(int argc, const char **argv)
kdb_printf("-----------------------------"
"-----------------------------\n");
for_each_kdbcmd(kt, i) {
- if (kt->cmd_name)
- kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
- kt->cmd_usage, kt->cmd_help);
+ char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
}
return 0;
}
@@ -2743,7 +2683,7 @@ int kdb_register_repeat(char *cmd,
(kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
kfree(kdb_commands);
}
- memset(new + kdb_max_commands, 0,
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
kdb_command_extend * sizeof(*new));
kdb_commands = new;
kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
@@ -2847,15 +2787,13 @@ static void __init kdb_inittab(void)
"Stack traceback", 1, KDB_REPEAT_NONE);
kdb_register_repeat("btp", kdb_bt, "<pid>",
"Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
- "Display stack all processes", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btc", kdb_bt, "",
"Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
KDB_REPEAT_NONE);
- kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
- "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
kdb_register_repeat("env", kdb_env, "",
"Show environment variables", 0, KDB_REPEAT_NONE);
kdb_register_repeat("set", kdb_set, "",
@@ -2886,6 +2824,10 @@ static void __init kdb_inittab(void)
kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
"Display syslog buffer", 0, KDB_REPEAT_NONE);
#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
+ }
kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
"Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40..7afd3c8c41d 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
#define KDB_CMD_SS (-1003)
-#define KDB_CMD_SSB (-1004)
#define KDB_CMD_KGDB (-1005)
/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
* kdb control */
#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
-#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
- * DOING_SS is also set */
#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
* after one ss, independent of
* DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
typedef enum {
KDB_DB_BPT, /* Breakpoint */
KDB_DB_SS, /* Single-step trap */
- KDB_DB_SSB, /* Single step to branch */
KDB_DB_SSBPT, /* Single step over breakpoint */
KDB_DB_NOBPT /* Spurious breakpoint */
} kdb_dbtrap_t;
@@ -205,7 +201,6 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
extern unsigned long kdb_task_state(const struct task_struct *p,
@@ -246,6 +241,13 @@ extern void debug_kusage(void);
extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
#ifdef CONFIG_MODULES
extern struct list_head *kdb_modules;
#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d218..d35cc2d3a4c 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
if (!pfn_valid(pfn))
return 1;
page = pfn_to_page(pfn);
- vaddr = kmap_atomic(page, KM_KDB);
+ vaddr = kmap_atomic(page);
memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
- kunmap_atomic(vaddr, KM_KDB);
+ kunmap_atomic(vaddr);
return 0;
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053a..54996b71e66 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,20 +106,17 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
-
- /* Though tsk->delays accessed later, early exit avoids
- * unnecessary returning of other data
- */
- if (!tsk->delays)
- goto done;
+ cputime_t utime, stime, stimescaled, utimescaled;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -155,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
-done:
return 0;
}
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c..6c6262f86c1 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
-#include <asm/system.h>
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66..e556751d15d 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
-
-#include <asm/elf.h>
-
+#include <linux/binfmts.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
return 0;
}
-int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
- unsigned long limit)
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
return 1;
}
-int __weak elf_core_write_extra_data(struct file *file, size_t *size,
- unsigned long limit)
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
{
return 1;
}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf..103f5d147b2 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg
endif
obj-y := core.o ring_buffer.o callchain.o
+
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f39..97b67df8fbf 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
err = alloc_callchain_buffers();
exit:
+ if (err)
+ atomic_dec(&nr_callchain_events);
+
mutex_unlock(&callchain_mutex);
return err;
@@ -153,11 +156,17 @@ put_callchain_entry(int rctx)
put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
}
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
int rctx;
struct perf_callchain_entry *entry;
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
entry = get_callchain_entry(&rctx);
if (rctx == -1)
@@ -168,18 +177,29 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
entry->nr = 0;
- if (!user_mode(regs)) {
+ if (kernel && !user_mode(regs)) {
perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
}
- if (regs) {
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9..6b17ac1b0c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -36,6 +37,10 @@
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/mman.h>
#include "internal.h"
@@ -116,7 +121,15 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
- PERF_FLAG_PID_CGROUP)
+ PERF_FLAG_PID_CGROUP |\
+ PERF_FLAG_FD_CLOEXEC)
+
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+ (PERF_SAMPLE_BRANCH_KERNEL |\
+ PERF_SAMPLE_BRANCH_HV)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
@@ -128,12 +141,14 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
+static atomic_t nr_freq_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -154,25 +169,130 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
/*
* max perf event sample rate
*/
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
- DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+void update_perf_cpu_limits(void)
+{
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+ do_div(tmp, 100);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ update_perf_cpu_limits();
return 0;
}
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+static DEFINE_PER_CPU(u64, running_sample_length);
+
+static void perf_duration_warn(struct irq_work *w)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ local_samples_len = __get_cpu_var(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ if (allowed_ns == 0)
+ return;
+
+ /* decay the counter by 1 average sample */
+ local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+ local_samples_len += sample_len_ns;
+ __get_cpu_var(running_sample_length) = local_samples_len;
+
+ /*
+ * note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * from having to maintain a count.
+ */
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ if (avg_local_sample_len <= allowed_ns)
+ return;
+
+ if (max_samples_per_tick <= 1)
+ return;
+
+ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+ update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
+}
+
static atomic64_t perf_event_id;
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
@@ -185,9 +305,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
-static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
-
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -225,6 +342,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
* if there is no cgroup event for the current CPU context.
@@ -232,8 +363,8 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
- return container_of(task_subsys_state(task, perf_subsys_id),
- struct perf_cgroup, css);
+ return container_of(task_css(task, perf_event_cgrp_id),
+ struct perf_cgroup, css);
}
static inline bool
@@ -242,12 +373,22 @@ perf_cgroup_match(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- return !event->cgrp || event->cgrp == cpuctx->cgrp;
-}
+ /* @event doesn't care about cgroup */
+ if (!event->cgrp)
+ return true;
-static inline void perf_get_cgroup(struct perf_event *event)
-{
- css_get(&event->cgrp->css);
+ /* wants specific cgroup scope but @cpuctx isn't associated with any */
+ if (!cpuctx->cgrp)
+ return false;
+
+ /*
+ * Cgroup scoping is recursive. An event enabled for a cgroup is
+ * also enabled for all its descendant cgroups. If @cpuctx's
+ * cgroup is a descendant of @event's (the test covers identity
+ * case), it's a match.
+ */
+ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
}
static inline void perf_put_cgroup(struct perf_event *event)
@@ -363,6 +504,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
/*
* perf_cgroup_events says at least one
@@ -386,9 +529,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
- /* set cgrp before ctxsw in to
- * allow event_filter_match() to not
- * have to pass task around
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
*/
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -459,14 +603,14 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct file *file;
- int ret = 0, fput_needed;
+ struct fd f = fdget(fd);
+ int ret = 0;
- file = fget_light(fd, &fput_needed);
- if (!file)
+ if (!f.file)
return -EBADF;
- css = cgroup_css_from_dir(file, perf_subsys_id);
+ css = css_tryget_online_from_dir(f.file->f_dentry,
+ &perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -475,9 +619,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
- /* must be done before we fput() the file */
- perf_get_cgroup(event);
-
/*
* all events in a group must monitor
* the same cgroup because a task belongs
@@ -488,7 +629,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
ret = -EINVAL;
}
out:
- fput_light(file, fput_needed);
+ fdput(f);
return ret;
}
@@ -612,6 +753,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
#endif
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+ struct perf_cpu_context *cpuctx;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ int rotations = 0;
+
+ WARN_ON(!irqs_disabled());
+
+ cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+ rotations = perf_rotate_context(cpuctx);
+
+ /*
+ * arm timer if needed
+ */
+ if (rotations) {
+ hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ ret = HRTIMER_RESTART;
+ }
+
+ return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ continue;
+
+ hrtimer_cancel(&cpuctx->hrtimer);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+ int timer;
+
+ /* no multiplexing needed for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ /*
+ * check default is sane, if not set then force to
+ * default interval (1/tick)
+ */
+ timer = pmu->hrtimer_interval_ms;
+ if (timer < 1)
+ timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* not for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ if (hrtimer_active(hr))
+ return;
+
+ if (!hrtimer_callback_running(hr))
+ __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+ 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -666,6 +907,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
+ ctx->generation++;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -714,8 +956,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
- rcu_read_lock();
retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
@@ -731,6 +983,8 @@ retry:
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
goto retry;
}
@@ -740,6 +994,7 @@ retry:
}
}
rcu_read_unlock();
+ preempt_enable();
return ctx;
}
@@ -881,12 +1136,26 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+
+ ctx->generation++;
+}
+
+/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
}
/*
@@ -934,9 +1203,18 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
event->header_size = size;
}
@@ -952,6 +1230,9 @@ static void perf_event__id_header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ size += sizeof(data->id);
+
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
@@ -1020,6 +1301,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack--;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1040,6 +1324,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
@@ -1118,6 +1404,8 @@ event_sched_out(struct perf_event *event,
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
+ perf_pmu_disable(event->pmu);
+
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
@@ -1134,6 +1422,8 @@ event_sched_out(struct perf_event *event,
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+
+ perf_pmu_enable(event->pmu);
}
static void
@@ -1156,6 +1446,11 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
/*
* Cross CPU call to remove a performance event
*
@@ -1164,12 +1459,15 @@ group_sched_out(struct perf_event *group_event,
*/
static int __perf_remove_from_context(void *info)
{
- struct perf_event *event = info;
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
@@ -1194,10 +1492,14 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
lockdep_assert_held(&ctx->mutex);
@@ -1206,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event)
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, event);
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
- if (!task_function_call(task, __perf_remove_from_context, event))
+ if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -1228,6 +1530,8 @@ retry:
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
+ if (detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -1235,7 +1539,7 @@ retry:
/*
* Cross CPU call to disable a performance event
*/
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -1374,6 +1678,9 @@ event_sched_in(struct perf_event *event,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
+ int ret = 0;
+
+ lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -1396,10 +1703,13 @@ event_sched_in(struct perf_event *event,
*/
smp_wmb();
+ perf_pmu_disable(event->pmu);
+
if (event->pmu->add(event, PERF_EF_START)) {
event->state = PERF_EVENT_STATE_INACTIVE;
event->oncpu = -1;
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1415,7 +1725,10 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- return 0;
+out:
+ perf_pmu_enable(event->pmu);
+
+ return ret;
}
static int
@@ -1424,7 +1737,7 @@ group_sched_in(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = group_event->pmu;
+ struct pmu *pmu = ctx->pmu;
u64 now = ctx->time;
bool simulate = false;
@@ -1435,6 +1748,7 @@ group_sched_in(struct perf_event *group_event,
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -1481,6 +1795,8 @@ group_error:
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
+
return -EAGAIN;
}
@@ -1627,6 +1943,8 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
event->ctx = ctx;
+ if (event->cpu != -1)
+ event->cpu = cpu;
if (!task) {
/*
@@ -1691,7 +2009,16 @@ static int __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- if (WARN_ON_ONCE(!ctx->is_active))
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -1734,8 +2061,10 @@ static int __perf_event_enable(void *info)
* If this event can't go on and it's part of a
* group, then the whole group has to come off.
*/
- if (leader != event)
+ if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
+ perf_cpu_hrtimer_restart(cpuctx);
+ }
if (leader->attr.pinned) {
update_group_times(leader);
leader->state = PERF_EVENT_STATE_ERROR;
@@ -1860,22 +2189,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
}
/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
- return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
- && ctx1->parent_gen == ctx2->parent_gen
- && !ctx1->pin_count && !ctx2->pin_count;
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
}
static void __perf_event_sync_stat(struct perf_event *event,
@@ -1924,9 +2269,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
perf_event_update_userpage(next_event);
}
-#define list_next_entry(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
@@ -1958,7 +2300,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
+ struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
@@ -1970,10 +2312,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
return;
rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp[ctxn];
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent || !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
@@ -2001,6 +2351,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
+unlock:
rcu_read_unlock();
if (do_switch) {
@@ -2195,6 +2546,64 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
}
/*
+ * When sampling the branck stack in system-wide, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branch from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /* no need to flush branch stack if not changing task */
+ if (prev == task)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ /*
+ * check if the context has at least one
+ * event using PERF_SAMPLE_BRANCH_STACK
+ */
+ if (cpuctx->ctx.nr_branch_stack > 0
+ && pmu->flush_branch_stack) {
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->flush_branch_stack();
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
*
@@ -2225,6 +2634,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
*/
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
+
+ /* check for system-wide branch_stack events */
+ if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+ perf_branch_stack_sched_in(prev, task);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2363,16 +2776,18 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ perf_pmu_disable(event->pmu);
+
hwc = &event->hw;
- if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
+ if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
- continue;
+ goto next;
/*
* stop the event and update event->count
@@ -2394,6 +2809,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+ next:
+ perf_pmu_enable(event->pmu);
}
perf_pmu_enable(ctx->pmu);
@@ -2418,7 +2835,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
* because they're strictly cpu affine and rotate_start is called with IRQs
* disabled, while rotate_context is called from IRQ context.
*/
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
int rotate = 0, remove = 1;
@@ -2457,7 +2874,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
+
+ return rotate;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (atomic_read(&nr_freq_events) ||
+ __this_cpu_read(perf_throttled_count))
+ return false;
+ else
+ return true;
}
+#endif
void perf_event_task_tick(void)
{
@@ -2478,10 +2908,6 @@ void perf_event_task_tick(void)
ctx = cpuctx->task_ctx;
if (ctx)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- if (cpuctx->jiffies_interval == 1 ||
- !(jiffies % cpuctx->jiffies_interval))
- perf_rotate_context(cpuctx);
}
}
@@ -2549,6 +2975,22 @@ out:
local_irq_restore(flags);
}
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -2771,35 +3213,51 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
-static void free_event(struct perf_event *event)
+static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
- irq_work_sync(&event->pending);
+ if (event->parent)
+ return;
- if (!event->parent) {
- if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec_deferred(&perf_sched_events);
- if (event->attr.mmap || event->attr.mmap_data)
- atomic_dec(&nr_mmap_events);
- if (event->attr.comm)
- atomic_dec(&nr_comm_events);
- if (event->attr.task)
- atomic_dec(&nr_task_events);
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- if (is_cgroup_event(event)) {
- atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_dec_deferred(&perf_sched_events);
- }
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
}
+ if (is_cgroup_event(event))
+ atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+}
- if (event->rb) {
- ring_buffer_put(event->rb);
- event->rb = NULL;
- }
+static void unaccount_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_dec(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_dec(&nr_comm_events);
+ if (event->attr.task)
+ atomic_dec(&nr_task_events);
+ if (event->attr.freq)
+ atomic_dec(&nr_freq_events);
if (is_cgroup_event(event))
- perf_detach_cgroup(event);
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (has_branch_stack(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+
+ unaccount_event_cpu(event, event->cpu);
+}
+
+static void __free_event(struct perf_event *event)
+{
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
if (event->destroy)
event->destroy(event);
@@ -2807,48 +3265,62 @@ static void free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->pmu)
+ module_put(event->pmu->module);
+
call_rcu(&event->rcu_head, free_event_rcu);
}
-int perf_event_release_kernel(struct perf_event *event)
+static void _free_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
+ irq_work_sync(&event->pending);
- WARN_ON_ONCE(ctx->parent_ctx);
- /*
- * There are two ways this annotation is useful:
- *
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
- *
- * 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
- */
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- raw_spin_unlock_irq(&ctx->lock);
- perf_remove_from_context(event);
- mutex_unlock(&ctx->mutex);
+ unaccount_event(event);
- free_event(event);
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
- return 0;
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+ __free_event(event);
+}
+
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
+{
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
+
+ _free_event(event);
}
-EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
{
- struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
- file->private_data = NULL;
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
@@ -2883,7 +3355,37 @@ static int perf_release(struct inode *inode, struct file *file)
put_task_struct(owner);
}
- return perf_event_release_kernel(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ perf_remove_from_context(event, true);
+ mutex_unlock(&ctx->mutex);
+
+ _free_event(event);
+}
+
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3027,30 +3529,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
unsigned int events = POLL_HUP;
/*
- * Race between perf_event_set_output() and perf_poll(): perf_poll()
- * grabs the rb reference but perf_event_set_output() overrides it.
- * Here is the timeline for two threads T1, T2:
- * t0: T1, rb = rcu_dereference(event->rb)
- * t1: T2, old_rb = event->rb
- * t2: T2, event->rb = new rb
- * t3: T2, ring_buffer_detach(old_rb)
- * t4: T1, ring_buffer_attach(rb1)
- * t5: T1, poll_wait(event->waitq)
- *
- * To avoid this problem, we grab mmap_mutex in perf_poll()
- * thereby ensuring that the assignment of the new ring buffer
- * and the detachment of the old buffer appear atomic to perf_poll()
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (rb) {
- ring_buffer_attach(event, rb);
+ rb = event->rb;
+ if (rb)
events = atomic_xchg(&rb->poll, 0);
- }
- rcu_read_unlock();
-
mutex_unlock(&event->mmap_mutex);
poll_wait(file, &event->waitq, wait);
@@ -3095,16 +3580,15 @@ static void perf_event_for_each(struct perf_event *event,
event = event->group_leader;
perf_event_for_each_child(event, func);
- func(event);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
- perf_event_for_each_child(event, func);
+ perf_event_for_each_child(sibling, func);
mutex_unlock(&ctx->mutex);
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
struct perf_event_context *ctx = event->ctx;
- int ret = 0;
+ int ret = 0, active;
u64 value;
if (!is_sampling_event(event))
@@ -3128,6 +3612,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->attr.sample_period = value;
event->hw.sample_period = value;
}
+
+ active = (event->state == PERF_EVENT_STATE_ACTIVE);
+ if (active) {
+ perf_pmu_disable(ctx->pmu);
+ event->pmu->stop(event, PERF_EF_UPDATE);
+ }
+
+ local64_set(&event->hw.period_left, 0);
+
+ if (active) {
+ event->pmu->start(event, PERF_EF_RELOAD);
+ perf_pmu_enable(ctx->pmu);
+ }
+
unlock:
raw_spin_unlock_irq(&ctx->lock);
@@ -3136,21 +3634,18 @@ unlock:
static const struct file_operations perf_fops;
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static inline int perf_fget_light(int fd, struct fd *p)
{
- struct file *file;
-
- file = fget_light(fd, fput_needed);
- if (!file)
- return ERR_PTR(-EBADF);
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
- if (file->f_op != &perf_fops) {
- fput_light(file, *fput_needed);
- *fput_needed = 0;
- return ERR_PTR(-EBADF);
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return -EBADF;
}
-
- return file->private_data;
+ *p = f;
+ return 0;
}
static int perf_event_set_output(struct perf_event *event,
@@ -3180,22 +3675,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
+ case PERF_EVENT_IOC_ID:
+ {
+ u64 id = primary_event_id(event);
+
+ if (copy_to_user((void __user *)arg, &id, sizeof(id)))
+ return -EFAULT;
+ return 0;
+ }
+
case PERF_EVENT_IOC_SET_OUTPUT:
{
- struct perf_event *output_event = NULL;
- int fput_needed = 0;
int ret;
-
if (arg != -1) {
- output_event = perf_fget_light(arg, &fput_needed);
- if (IS_ERR(output_event))
- return PTR_ERR(output_event);
+ struct perf_event *output_event;
+ struct fd output;
+ ret = perf_fget_light(arg, &output);
+ if (ret)
+ return ret;
+ output_event = output.file->private_data;
+ ret = perf_event_set_output(event, output_event);
+ fdput(output);
+ } else {
+ ret = perf_event_set_output(event, NULL);
}
-
- ret = perf_event_set_output(event, output_event);
- if (output_event)
- fput_light(output_event->filp, fput_needed);
-
return ret;
}
@@ -3238,10 +3741,6 @@ int perf_event_task_disable(void)
return 0;
}
-#ifndef PERF_EVENT_INDEX_OFFSET
-# define PERF_EVENT_INDEX_OFFSET 0
-#endif
-
static int perf_event_index(struct perf_event *event)
{
if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3749,46 @@ static int perf_event_index(struct perf_event *event)
if (event->state != PERF_EVENT_STATE_ACTIVE)
return 0;
- return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+ return event->pmu->event_idx(event);
}
static void calc_timer_values(struct perf_event *event,
+ u64 *now,
u64 *enabled,
u64 *running)
{
- u64 now, ctx_time;
+ u64 ctx_time;
- now = perf_clock();
- ctx_time = event->shadow_ctx_time + now;
+ *now = perf_clock();
+ ctx_time = event->shadow_ctx_time + *now;
*enabled = ctx_time - event->tstamp_enabled;
*running = ctx_time - event->tstamp_running;
}
+static void perf_event_init_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ userpg = rb->user_page;
+
+ /* Allow new userspace to detect that bit 0 is deprecated */
+ userpg->cap_bit0_is_deprecated = 1;
+ userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+
+unlock:
+ rcu_read_unlock();
+}
+
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
/*
* Callers need to ensure there can be no nesting of this function, otherwise
* the seqlock logic goes bad. We can not serialize this because the arch
@@ -3274,9 +3798,13 @@ void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
struct ring_buffer *rb;
- u64 enabled, running;
+ u64 enabled, running, now;
rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
@@ -3286,13 +3814,9 @@ void perf_event_update_userpage(struct perf_event *event)
* because of locking issue as we can be called in
* NMI context
*/
- calc_timer_values(event, &enabled, &running);
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
+ calc_timer_values(event, &now, &enabled, &running);
userpg = rb->user_page;
-
/*
* Disable preemption so as to not let the corresponding user-space
* spin too long if we get preempted.
@@ -3302,7 +3826,7 @@ void perf_event_update_userpage(struct perf_event *event)
barrier();
userpg->index = perf_event_index(event);
userpg->offset = perf_event_count(event);
- if (event->state == PERF_EVENT_STATE_ACTIVE)
+ if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
userpg->time_enabled = enabled +
@@ -3311,6 +3835,8 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
+ arch_perf_update_userpage(userpg, now);
+
barrier();
++userpg->lock;
preempt_enable();
@@ -3356,32 +3882,47 @@ unlock:
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
+ struct ring_buffer *old_rb = NULL;
unsigned long flags;
- if (!list_empty(&event->rb_entry))
- return;
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
- spin_lock_irqsave(&rb->event_lock, flags);
- if (!list_empty(&event->rb_entry))
- goto unlock;
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
- list_add(&event->rb_entry, &rb->event_list);
-unlock:
- spin_unlock_irqrestore(&rb->event_lock, flags);
-}
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
-static void ring_buffer_detach(struct perf_event *event,
- struct ring_buffer *rb)
-{
- unsigned long flags;
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
- if (list_empty(&event->rb_entry))
- return;
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
+
+ rcu_assign_pointer(event->rb, rb);
- spin_lock_irqsave(&rb->event_lock, flags);
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
}
static void ring_buffer_wakeup(struct perf_event *event)
@@ -3390,13 +3931,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
- wake_up_all(&event->waitq);
-
-unlock:
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
rcu_read_unlock();
}
@@ -3425,18 +3963,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
- struct perf_event *event, *n;
- unsigned long flags;
-
if (!atomic_dec_and_test(&rb->refcount))
return;
- spin_lock_irqsave(&rb->event_lock, flags);
- list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- }
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3446,26 +3976,95 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
}
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->rb);
- struct user_struct *user = event->mmap_user;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = ring_buffer_get(event);
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ goto out_put;
+
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->pinned_vm -= event->mmap_locked;
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
mutex_unlock(&event->mmap_mutex);
+ put_event(event);
- ring_buffer_put(rb);
- free_uid(user);
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
}
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+out_put:
+ ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3515,12 +4114,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages == nr_pages)
- atomic_inc(&event->rb->refcount);
- else
+ if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
goto unlock;
}
@@ -3561,19 +4172,29 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->rb, rb);
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
atomic_long_add(user_extra, &user->locked_vm);
- event->mmap_locked = extra;
- event->mmap_user = get_current_user();
- vma->vm_mm->pinned_vm += event->mmap_locked;
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
unlock:
if (!ret)
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_RESERVED;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -3581,7 +4202,7 @@ unlock:
static int perf_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
@@ -3660,6 +4281,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask)
+{
+ int bit;
+
+ for_each_set_bit(bit, (const unsigned long *) &mask,
+ sizeof(mask) * BITS_PER_BYTE) {
+ u64 val;
+
+ val = perf_reg_value(regs, bit);
+ perf_output_put(handle, val);
+ }
+}
+
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+ struct pt_regs *regs)
+{
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ regs_user->regs = regs;
+ regs_user->abi = perf_reg_abi(current);
+ }
+}
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisly, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+ unsigned long addr = perf_user_stack_pointer(regs);
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+ struct pt_regs *regs)
+{
+ u64 task_size;
+
+ /* No regs, no stack pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ /*
+ * Check if we fit in with the requested stack size into the:
+ * - TASK_SIZE
+ * If we don't, we limit the size to the TASK_SIZE.
+ *
+ * - remaining sample size
+ * If we don't, we customize the stack size to
+ * fit in to the remaining sample size.
+ */
+
+ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+ stack_size = min(stack_size, (u16) task_size);
+
+ /* Current header size plus static size and dynamic size. */
+ header_size += 2 * sizeof(u64);
+
+ /* Do we fit in with the current stack dump size? */
+ if ((u16) (header_size + stack_size) < header_size) {
+ /*
+ * If we overflow the maximum size for the sample,
+ * we customize the stack dump size to fit in.
+ */
+ stack_size = USHRT_MAX - header_size - sizeof(u64);
+ stack_size = round_up(stack_size, sizeof(u64));
+ }
+
+ return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+ struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned long sp;
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user stack dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ sp = perf_user_stack_pointer(regs);
+ rem = __output_copy_user(handle, (void *) sp, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -3678,7 +4425,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_clock();
- if (sample_type & PERF_SAMPLE_ID)
+ if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -3717,6 +4464,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
}
void perf_event__output_id_sample(struct perf_event *event,
@@ -3782,7 +4532,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;
- if (sub != event)
+ if ((sub != event) &&
+ (sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
values[n++] = perf_event_count(sub);
@@ -3799,7 +4550,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
- u64 enabled = 0, running = 0;
+ u64 enabled = 0, running = 0, now;
u64 read_format = event->attr.read_format;
/*
@@ -3812,7 +4563,7 @@ static void perf_output_read(struct perf_output_handle *handle,
* NMI context
*/
if (read_format & PERF_FORMAT_TOTAL_TIMES)
- calc_timer_values(event, &enabled, &running);
+ calc_timer_values(event, &now, &enabled, &running);
if (event->attr.read_format & PERF_FORMAT_GROUP)
perf_output_read_group(handle, event, enabled, running);
@@ -3829,6 +4580,9 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, *header);
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+
if (sample_type & PERF_SAMPLE_IP)
perf_output_put(handle, data->ip);
@@ -3889,6 +4643,56 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
+ size_t size;
+
+ size = data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+
+ perf_output_put(handle, data->br_stack->nr);
+ perf_output_copy(handle, data->br_stack->entries, size);
+ } else {
+ /*
+ * we always store at least the value of nr
+ */
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ u64 abi = data->regs_user.abi;
+
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_user;
+ perf_output_sample_regs(handle,
+ data->regs_user.regs,
+ mask);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ perf_output_sample_ustack(handle,
+ data->stack_user_size,
+ data->regs_user.regs);
+ }
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -3925,7 +4729,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- data->callchain = perf_callchain(regs);
+ data->callchain = perf_callchain(event, regs);
if (data->callchain)
size += data->callchain->nr;
@@ -3944,6 +4748,58 @@ void perf_prepare_sample(struct perf_event_header *header,
WARN_ON_ONCE(size & (sizeof(u64)-1));
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ int size = sizeof(u64); /* nr */
+ if (data->br_stack) {
+ size += data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+ }
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_user(&data->regs_user, regs);
+
+ if (data->regs_user.regs) {
+ u64 mask = event->attr.sample_regs_user;
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ /*
+ * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * processed as the last one or have additional check added
+ * in case new sample type is added, because we could eat
+ * up the rest of the sample size.
+ */
+ struct perf_regs_user *uregs = &data->regs_user;
+ u16 stack_size = event->attr.sample_stack_user;
+ u16 size = sizeof(u64);
+
+ if (!uregs->abi)
+ perf_sample_regs_user(uregs, regs);
+
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ uregs->regs);
+
+ /*
+ * If there is something to dump, add space for the dump
+ * itself and for the field that tells the dynamic size,
+ * which is how many have been actually dumped.
+ */
+ if (stack_size)
+ size += sizeof(u64) + stack_size;
+
+ data->stack_user_size = stack_size;
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -4009,10 +4865,63 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+ perf_event_aux_output_cb output,
+ void *data)
+{
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
+}
+
+static void
+perf_event_aux(perf_event_aux_output_cb output, void *data,
+ struct perf_event_context *task_ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int ctxn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ goto next;
+ perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ if (task_ctx)
+ goto next;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto next;
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_aux_ctx(ctx, output, data);
+next:
+ put_cpu_ptr(pmu->pmu_cpu_context);
+ }
+
+ if (task_ctx) {
+ preempt_disable();
+ perf_event_aux_ctx(task_ctx, output, data);
+ preempt_enable();
+ }
+ rcu_read_unlock();
+}
+
/*
* task tracking -- fork/exit
*
- * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
+ * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
*/
struct perf_task_event {
@@ -4030,14 +4939,25 @@ struct perf_task_event {
} event_id;
};
+static int perf_event_task_match(struct perf_event *event)
+{
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap2 || event->attr.mmap_data ||
+ event->attr.task;
+}
+
static void perf_event_task_output(struct perf_event *event,
- struct perf_task_event *task_event)
+ void *data)
{
+ struct perf_task_event *task_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
struct task_struct *task = task_event->task;
int ret, size = task_event->event_id.header.size;
+ if (!perf_event_task_match(event))
+ return;
+
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
@@ -4060,61 +4980,6 @@ out:
task_event->event_id.header.size = size;
}
-static int perf_event_task_match(struct perf_event *event)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm || event->attr.mmap ||
- event->attr.mmap_data || event->attr.task)
- return 1;
-
- return 0;
-}
-
-static void perf_event_task_ctx(struct perf_event_context *ctx,
- struct perf_task_event *task_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_task_match(event))
- perf_event_task_output(event, task_event);
- }
-}
-
-static void perf_event_task_event(struct perf_task_event *task_event)
-{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
- struct pmu *pmu;
- int ctxn;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
- goto next;
- perf_event_task_ctx(&cpuctx->ctx, task_event);
-
- ctx = task_event->task_ctx;
- if (!ctx) {
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- }
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
-}
-
static void perf_event_task(struct task_struct *task,
struct perf_event_context *task_ctx,
int new)
@@ -4143,7 +5008,9 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_task_event(&task_event);
+ perf_event_aux(perf_event_task_output,
+ &task_event,
+ task_ctx);
}
void perf_event_fork(struct task_struct *task)
@@ -4168,14 +5035,23 @@ struct perf_comm_event {
} event_id;
};
+static int perf_event_comm_match(struct perf_event *event)
+{
+ return event->attr.comm;
+}
+
static void perf_event_comm_output(struct perf_event *event,
- struct perf_comm_event *comm_event)
+ void *data)
{
+ struct perf_comm_event *comm_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
int ret;
+ if (!perf_event_comm_match(event))
+ return;
+
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
comm_event->event_id.header.size);
@@ -4197,39 +5073,10 @@ out:
comm_event->event_id.header.size = size;
}
-static int perf_event_comm_match(struct perf_event *event)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm)
- return 1;
-
- return 0;
-}
-
-static void perf_event_comm_ctx(struct perf_event_context *ctx,
- struct perf_comm_event *comm_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_comm_match(event))
- perf_event_comm_output(event, comm_event);
- }
-}
-
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
char comm[TASK_COMM_LEN];
unsigned int size;
- struct pmu *pmu;
- int ctxn;
memset(comm, 0, sizeof(comm));
strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4239,39 +5086,15 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
- goto next;
- perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_event_comm_ctx(ctx, comm_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_comm_output,
+ comm_event,
+ NULL);
}
-void perf_event_comm(struct task_struct *task)
+void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
- struct perf_event_context *ctx;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- perf_event_enable_on_exec(ctx);
- }
if (!atomic_read(&nr_comm_events))
return;
@@ -4283,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
- .misc = 0,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
@@ -4303,6 +5126,10 @@ struct perf_mmap_event {
const char *file_name;
int file_size;
+ int maj, min;
+ u64 ino;
+ u64 ino_generation;
+ u32 prot, flags;
struct {
struct perf_event_header header;
@@ -4315,14 +5142,39 @@ struct perf_mmap_event {
} event_id;
};
+static int perf_event_mmap_match(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
+
+ return (!executable && event->attr.mmap_data) ||
+ (executable && (event->attr.mmap || event->attr.mmap2));
+}
+
static void perf_event_mmap_output(struct perf_event *event,
- struct perf_mmap_event *mmap_event)
+ void *data)
{
+ struct perf_mmap_event *mmap_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
int ret;
+ if (!perf_event_mmap_match(event, data))
+ return;
+
+ if (event->attr.mmap2) {
+ mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
+ mmap_event->event_id.header.size += sizeof(mmap_event->maj);
+ mmap_event->event_id.header.size += sizeof(mmap_event->min);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
+ }
+
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
mmap_event->event_id.header.size);
@@ -4333,6 +5185,16 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
+
+ if (event->attr.mmap2) {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
+ }
+
__output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
@@ -4343,119 +5205,116 @@ out:
mmap_event->event_id.header.size = size;
}
-static int perf_event_mmap_match(struct perf_event *event,
- struct perf_mmap_event *mmap_event,
- int executable)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if ((!executable && event->attr.mmap_data) ||
- (executable && event->attr.mmap))
- return 1;
-
- return 0;
-}
-
-static void perf_event_mmap_ctx(struct perf_event_context *ctx,
- struct perf_mmap_event *mmap_event,
- int executable)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_mmap_match(event, mmap_event, executable))
- perf_event_mmap_output(event, mmap_event);
- }
-}
-
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
struct vm_area_struct *vma = mmap_event->vma;
struct file *file = vma->vm_file;
+ int maj = 0, min = 0;
+ u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
- const char *name;
- struct pmu *pmu;
- int ctxn;
-
- memset(tmp, 0, sizeof(tmp));
+ char *name;
if (file) {
+ struct inode *inode;
+ dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
/*
- * d_path works from the end of the rb backwards, so we
+ * d_path() works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
- if (!buf) {
- name = strncpy(tmp, "//enomem", sizeof(tmp));
- goto got_name;
- }
- name = d_path(&file->f_path, buf, PATH_MAX);
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
- name = strncpy(tmp, "//toolong", sizeof(tmp));
- goto got_name;
+ name = "//toolong";
+ goto cpy_name;
}
+ inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ gen = inode->i_generation;
+ maj = MAJOR(dev);
+ min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
+ goto got_name;
} else {
- if (arch_vma_name(mmap_event->vma)) {
- name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp));
- goto got_name;
- }
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
- if (!vma->vm_mm) {
- name = strncpy(tmp, "[vdso]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
- name = strncpy(tmp, "[heap]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
- name = strncpy(tmp, "[stack]", sizeof(tmp));
- goto got_name;
+ name = "[stack]";
+ goto cpy_name;
}
- name = strncpy(tmp, "//anon", sizeof(tmp));
- goto got_name;
+ name = "//anon";
+ goto cpy_name;
}
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
got_name:
- size = ALIGN(strlen(name)+1, sizeof(u64));
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
mmap_event->file_name = name;
mmap_event->file_size = size;
+ mmap_event->maj = maj;
+ mmap_event->min = min;
+ mmap_event->ino = ino;
+ mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
- mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
- goto next;
- perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_event_mmap_ctx(ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
- }
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_mmap_output,
+ mmap_event,
+ NULL);
kfree(buf);
}
@@ -4483,6 +5342,12 @@ void perf_event_mmap(struct vm_area_struct *vma)
.len = vma->vm_end - vma->vm_start,
.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
},
+ /* .maj (attr_mmap2 only) */
+ /* .min (attr_mmap2 only) */
+ /* .ino (attr_mmap2 only) */
+ /* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
@@ -4560,6 +5425,7 @@ static int __perf_event_overflow(struct perf_event *event,
__this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
+ tick_nohz_full_kick();
ret = 1;
}
}
@@ -4618,6 +5484,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -4629,7 +5498,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
* sign as trigger.
*/
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 period = hwc->last_period;
@@ -4798,7 +5667,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct perf_event *event;
- struct hlist_node *node;
struct hlist_head *head;
rcu_read_lock();
@@ -4806,7 +5674,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (!head)
goto end;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
@@ -4839,7 +5707,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
if (rctx < 0)
return;
- perf_sample_data_init(&data, addr);
+ perf_sample_data_init(&data, addr, 0);
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
@@ -4865,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -4923,11 +5797,6 @@ static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
- if (event->cpu != -1) {
- swevent_hlist_put_cpu(event, event->cpu);
- return;
- }
-
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
@@ -4961,9 +5830,6 @@ static int swevent_hlist_get(struct perf_event *event)
int err;
int cpu, failed_cpu;
- if (event->cpu != -1)
- return swevent_hlist_get_cpu(event, event->cpu);
-
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
@@ -4986,7 +5852,7 @@ fail:
return err;
}
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
static void sw_perf_event_destroy(struct perf_event *event)
{
@@ -4994,17 +5860,23 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
- jump_label_dec(&perf_swevent_enabled[event_id]);
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
}
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5896,18 @@ static int perf_swevent_init(struct perf_event *event)
if (err)
return err;
- jump_label_inc(&perf_swevent_enabled[event_id]);
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
}
return 0;
}
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+ return 0;
+}
+
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
@@ -5040,6 +5917,8 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .event_idx = perf_swevent_event_idx,
};
#ifdef CONFIG_EVENT_TRACING
@@ -5073,25 +5952,50 @@ static int perf_tp_event_match(struct perf_event *event,
}
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head, int rctx)
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
{
struct perf_sample_data data;
struct perf_event *event;
- struct hlist_node *node;
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
};
- perf_sample_data_init(&data, addr);
+ perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
+ /*
+ * If we got specified a target task, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -5108,6 +6012,12 @@ static int perf_tp_event_init(struct perf_event *event)
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
err = perf_trace_init(event);
if (err)
return err;
@@ -5126,6 +6036,8 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .event_idx = perf_swevent_event_idx,
};
static inline void perf_tp_register(void)
@@ -5179,7 +6091,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
- perf_sample_data_init(&sample, bp->attr.bp_addr);
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
perf_swevent_event(bp, 1, &sample, regs);
@@ -5205,13 +6117,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event->pmu->read(event);
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
regs = get_irq_regs();
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (perf_event_overflow(event, &data, regs))
+ if (__perf_event_overflow(event, 1, &data, regs))
ret = HRTIMER_NORESTART;
}
@@ -5275,6 +6186,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5331,6 +6243,12 @@ static int cpu_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
perf_swevent_init_hrtimer(event);
return 0;
@@ -5345,6 +6263,8 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
+
+ .event_idx = perf_swevent_event_idx,
};
/*
@@ -5403,6 +6323,12 @@ static int task_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
perf_swevent_init_hrtimer(event);
return 0;
@@ -5417,6 +6343,8 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
+
+ .event_idx = perf_swevent_event_idx,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,11 +6372,16 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
perf_pmu_enable(pmu);
}
+static int perf_event_idx_default(struct perf_event *event)
+{
+ return event->hw.idx + 1;
+}
+
/*
* Ensures all contexts with the same task_ctx_nr have the same
* pmu_cpu_context too.
*/
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
struct pmu *pmu;
@@ -5472,8 +6405,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- if (cpuctx->active_pmu == old_pmu)
- cpuctx->active_pmu = pmu;
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
}
}
@@ -5505,16 +6438,64 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);
+
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
-static struct device_attribute pmu_dev_attrs[] = {
- __ATTR_RO(type),
- __ATTR_NULL,
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;
+
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;
+
+ if (timer < 1)
+ return -EINVAL;
+
+ /* same value, noting to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;
+
+ pmu->hrtimer_interval_ms = timer;
+
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }
+
+ return count;
+}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
- .dev_attrs = pmu_dev_attrs,
+ .dev_groups = pmu_dev_groups,
};
static void pmu_dev_release(struct device *dev)
@@ -5530,6 +6511,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (!pmu->dev)
goto out;
+ pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
ret = dev_set_name(pmu->dev, "%s", pmu->name);
if (ret)
@@ -5553,7 +6535,7 @@ free_dev:
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
int cpu, ret;
@@ -5569,13 +6551,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
pmu->name = name;
if (type < 0) {
- int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
- if (!err)
- goto free_pdc;
-
- err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
- if (err) {
- ret = err;
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
goto free_pdc;
}
}
@@ -5592,6 +6570,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
+ ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
@@ -5605,9 +6584,11 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
- cpuctx->jiffies_interval = 1;
+
+ __perf_cpu_hrtimer_init(cpuctx, cpu);
+
INIT_LIST_HEAD(&cpuctx->rotation_list);
- cpuctx->active_pmu = pmu;
+ cpuctx->unique_pmu = pmu;
}
got_cpu_context:
@@ -5633,6 +6614,9 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->event_idx)
+ pmu->event_idx = perf_event_idx_default;
+
list_add_rcu(&pmu->entry, &pmus);
ret = 0;
unlock:
@@ -5652,6 +6636,7 @@ free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
@@ -5673,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu)
put_device(pmu->dev);
free_pmu_context(pmu);
}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
struct pmu *perf_init_event(struct perf_event *event)
{
@@ -5686,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (ret)
@@ -5694,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event)
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (!ret)
@@ -5711,6 +6705,44 @@ unlock:
return pmu;
}
+static void account_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
+ }
+ if (is_cgroup_event(event))
+ atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void account_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_inc(&perf_sched_events.key);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_inc(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_inc(&nr_comm_events);
+ if (event->attr.task)
+ atomic_inc(&nr_task_events);
+ if (event->attr.freq) {
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_full_kick_all();
+ }
+ if (has_branch_stack(event))
+ static_key_slow_inc(&perf_sched_events.key);
+ if (is_cgroup_event(event))
+ static_key_slow_inc(&perf_sched_events.key);
+
+ account_event_cpu(event, event->cpu);
+}
+
/*
* Allocate and initialize a event structure
*/
@@ -5725,7 +6757,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
- long err;
+ long err = -EINVAL;
if ((unsigned)cpu >= nr_cpu_ids) {
if (!task || cpu != -1)
@@ -5750,12 +6782,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
+ INIT_LIST_HEAD(&event->active_entry);
+ INIT_HLIST_NODE(&event->hlist_entry);
+
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
+ atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
@@ -5764,18 +6800,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->parent = parent_event;
- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
- if (attr->type == PERF_TYPE_BREAKPOINT)
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}
@@ -5788,8 +6827,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
- if (attr->disabled)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event__state_init(event);
pmu = NULL;
@@ -5805,43 +6843,36 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* we currently do not support PERF_FORMAT_GROUP on inherited events
*/
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
- goto done;
+ goto err_ns;
pmu = perf_init_event(event);
-
-done:
- err = 0;
if (!pmu)
- err = -EINVAL;
- else if (IS_ERR(pmu))
+ goto err_ns;
+ else if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
-
- if (err) {
- if (event->ns)
- put_pid_ns(event->ns);
- kfree(event);
- return ERR_PTR(err);
+ goto err_ns;
}
if (!event->parent) {
- if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_sched_events.key);
- if (event->attr.mmap || event->attr.mmap_data)
- atomic_inc(&nr_mmap_events);
- if (event->attr.comm)
- atomic_inc(&nr_comm_events);
- if (event->attr.task)
- atomic_inc(&nr_task_events);
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
- if (err) {
- free_event(event);
- return ERR_PTR(err);
- }
+ if (err)
+ goto err_pmu;
}
}
return event;
+
+err_pmu:
+ if (event->destroy)
+ event->destroy(event);
+ module_put(pmu->module);
+err_ns:
+ if (event->ns)
+ put_pid_ns(event->ns);
+ kfree(event);
+
+ return ERR_PTR(err);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -5908,6 +6939,61 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->read_format & ~(PERF_FORMAT_MAX-1))
return -EINVAL;
+ if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ u64 mask = attr->branch_sample_type;
+
+ /* only using defined bits */
+ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+ return -EINVAL;
+
+ /* at least one branch bit must be set */
+ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+ return -EINVAL;
+
+ /* propagate priv level, when not set for branch */
+ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+ /* exclude_kernel checked on syscall entry */
+ if (!attr->exclude_kernel)
+ mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ if (!attr->exclude_user)
+ mask |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!attr->exclude_hv)
+ mask |= PERF_SAMPLE_BRANCH_HV;
+ /*
+ * adjust user setting (for HW filter setup)
+ */
+ attr->branch_sample_type = mask;
+ }
+ /* privileged levels capture (kernel, hv): check permissions */
+ if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+ ret = perf_reg_validate(attr->sample_regs_user);
+ if (ret)
+ return ret;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+ if (!arch_perf_have_user_stack_dump())
+ return -ENOSYS;
+
+ /*
+ * We have __u32 type for the size, but so far
+ * we can only use __u16 as maximum due to the
+ * __u16 sample size limit.
+ */
+ if (attr->sample_stack_user >= USHRT_MAX)
+ ret = -EINVAL;
+ else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+ ret = -EINVAL;
+ }
+
out:
return ret;
@@ -5920,7 +7006,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL, *old_rb = NULL;
+ struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -5955,16 +7041,12 @@ set:
goto unlock;
}
- old_rb = event->rb;
- rcu_assign_pointer(event->rb, rb);
- if (old_rb)
- ring_buffer_detach(event, old_rb);
+ ring_buffer_attach(event, rb);
+
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_rb)
- ring_buffer_put(old_rb);
out:
return ret;
}
@@ -5986,13 +7068,13 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct file *group_file = NULL;
+ struct fd group = {NULL, 0};
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
int move_group = 0;
- int fput_needed = 0;
int err;
+ int f_flags = O_RDWR;
/* for future expandability... */
if (flags & ~PERF_FLAG_ALL)
@@ -6010,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -6021,17 +7106,18 @@ SYSCALL_DEFINE5(perf_event_open,
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
return -EINVAL;
- event_fd = get_unused_fd_flags(O_RDWR);
+ if (flags & PERF_FLAG_FD_CLOEXEC)
+ f_flags |= O_CLOEXEC;
+
+ event_fd = get_unused_fd_flags(f_flags);
if (event_fd < 0)
return event_fd;
if (group_fd != -1) {
- group_leader = perf_fget_light(group_fd, &fput_needed);
- if (IS_ERR(group_leader)) {
- err = PTR_ERR(group_leader);
+ err = perf_fget_light(group_fd, &group);
+ if (err)
goto err_fd;
- }
- group_file = group_leader->filp;
+ group_leader = group.file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6046,26 +7132,38 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
+ get_online_cpus();
+
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_task;
+ goto err_cpus;
}
if (flags & PERF_FLAG_PID_CGROUP) {
err = perf_cgroup_connect(pid, event, &attr, group_leader);
- if (err)
+ if (err) {
+ __free_event(event);
+ goto err_cpus;
+ }
+ }
+
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
goto err_alloc;
- /*
- * one more event:
- * - that has cgroup constraint on event->cpu
- * - that may need work on context switch
- */
- atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_inc(&perf_sched_events.key);
+ }
}
+ account_event(event);
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -6098,7 +7196,7 @@ SYSCALL_DEFINE5(perf_event_open,
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, cpu);
+ ctx = find_get_context(pmu, task, event->cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_alloc;
@@ -6146,7 +7244,8 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+ f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
goto err_context;
@@ -6156,35 +7255,44 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader, false);
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling);
+ perf_remove_from_context(sibling, false);
+ perf_event__state_init(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
put_ctx(gctx);
}
- event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
if (move_group) {
- perf_install_in_context(ctx, group_leader, cpu);
+ synchronize_rcu();
+ perf_install_in_context(ctx, group_leader, event->cpu);
get_ctx(ctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, cpu);
+ perf_install_in_context(ctx, sibling, event->cpu);
get_ctx(ctx);
}
}
- perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
+ perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
+ put_online_cpus();
+
event->owner = current;
mutex_lock(&current->perf_event_mutex);
@@ -6203,7 +7311,7 @@ SYSCALL_DEFINE5(perf_event_open,
* of the group leader will find the pointer to itself in
* perf_group_detach().
*/
- fput_light(group_file, fput_needed);
+ fdput(group);
fd_install(event_fd, event_file);
return event_fd;
@@ -6212,11 +7320,13 @@ err_context:
put_ctx(ctx);
err_alloc:
free_event(event);
+err_cpus:
+ put_online_cpus();
err_task:
if (task)
put_task_struct(task);
err_group_fd:
- fput_light(group_file, fput_needed);
+ fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -6250,17 +7360,17 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err;
}
+ account_event(event);
+
ctx = find_get_context(event->pmu, task, cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_free;
}
- event->filp = NULL;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -6273,6 +7383,41 @@ err:
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx;
+ struct perf_event_context *dst_ctx;
+ struct perf_event *event, *tmp;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+
+ mutex_lock(&src_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
+ event_entry) {
+ perf_remove_from_context(event, false);
+ unaccount_event_cpu(event, src_cpu);
+ put_ctx(src_ctx);
+ list_add(&event->migrate_entry, &events);
+ }
+ mutex_unlock(&src_ctx->mutex);
+
+ synchronize_rcu();
+
+ mutex_lock(&dst_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+ mutex_unlock(&dst_ctx->mutex);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
+
static void sync_child_event(struct perf_event *child_event,
struct task_struct *child)
{
@@ -6305,7 +7450,7 @@ static void sync_child_event(struct perf_event *child_event,
* Release the parent event, if this was the last
* reference to it.
*/
- fput(parent_event->filp);
+ put_event(parent_event);
}
static void
@@ -6313,13 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- if (child_event->parent) {
- raw_spin_lock_irq(&child_ctx->lock);
- perf_group_detach(child_event);
- raw_spin_unlock_irq(&child_ctx->lock);
- }
-
- perf_remove_from_context(child_event);
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -6334,8 +7485,8 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *tmp;
- struct perf_event_context *child_ctx;
+ struct perf_event *child_event, *next;
+ struct perf_event_context *child_ctx, *parent_ctx;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -6360,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -6370,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
@@ -6381,32 +7548,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*
* __perf_event_exit_task()
* sync_child_event()
- * fput(parent_event->filp)
- * perf_release()
- * mutex_lock(&ctx->mutex)
+ * put_event()
+ * mutex_lock(&ctx->mutex)
*
* But since its the parent context it won't be the same instance.
*/
mutex_lock(&child_ctx->mutex);
-again:
- list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
- group_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
-
- list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
- /*
- * If the last event was a group event, it will have appended all
- * its siblings to the list, but we obtained 'tmp' before that which
- * will still point to the list head terminating the iteration.
- */
- if (!list_empty(&child_ctx->pinned_groups) ||
- !list_empty(&child_ctx->flexible_groups))
- goto again;
-
mutex_unlock(&child_ctx->mutex);
put_ctx(child_ctx);
@@ -6451,7 +7602,7 @@ static void perf_free_event(struct perf_event *event,
list_del_init(&event->child_list);
mutex_unlock(&parent->child_mutex);
- fput(parent->filp);
+ put_event(parent);
perf_group_detach(event);
list_del_event(event, ctx);
@@ -6531,6 +7682,12 @@ inherit_event(struct perf_event *parent_event,
NULL, NULL);
if (IS_ERR(child_event))
return child_event;
+
+ if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
get_ctx(child_ctx);
/*
@@ -6572,14 +7729,6 @@ inherit_event(struct perf_event *parent_event,
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
- * Get a reference to the parent filp - we will fput it
- * when the child event exits. This is safe to do because
- * we are in the parent and we know that the filp still
- * exists and has a nonzero count:
- */
- atomic_long_inc(&parent_event->filp->f_count);
-
- /*
* Link this into the parent event's child list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
@@ -6636,7 +7785,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -6673,6 +7822,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
* swapped under us.
*/
parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
/*
* No need to check if parent_ctx != NULL here; since we saw
@@ -6779,11 +7930,12 @@ static void __init perf_event_init_all_cpus(void)
}
}
-static void __cpuinit perf_event_init_cpu(int cpu)
+static void perf_event_init_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -6806,15 +7958,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
+ struct remove_event re = { .detach_group = false };
struct perf_event_context *ctx = __info;
- struct perf_event *event, *tmp;
perf_pmu_rotate_stop(ctx->pmu);
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_remove_from_context(event);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
- __perf_remove_from_context(event);
+ rcu_read_lock();
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
+ rcu_read_unlock();
}
static void perf_event_exit_cpu_context(int cpu)
@@ -6838,11 +7990,12 @@ static void perf_event_exit_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ perf_event_exit_cpu_context(cpu);
+
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
-
- perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
@@ -6868,7 +8021,7 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int __cpuinit
+static int
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
@@ -6884,7 +8037,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
-
default:
break;
}
@@ -6912,6 +8064,13 @@ void __init perf_event_init(void)
/* do not patch jump label more than once per second */
jump_label_rate_limit(&perf_sched_events, HZ);
+
+ /*
+ * Build time assertion that we keep the data_head at the intended
+ * location. IOW, validation we got the __reserved[] size right.
+ */
+ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+ != 1024);
}
static int __init perf_event_sysfs_init(void)
@@ -6943,8 +8102,8 @@ unlock:
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(
- struct cgroup_subsys *ss, struct cgroup *cont)
+static struct cgroup_subsys_state *
+perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct perf_cgroup *jc;
@@ -6961,12 +8120,10 @@ static struct cgroup_subsys_state *perf_cgroup_create(
return &jc->css;
}
-static void perf_cgroup_destroy(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct perf_cgroup *jc;
- jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
- struct perf_cgroup, css);
+ struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
+
free_percpu(jc->info);
kfree(jc);
}
@@ -6978,17 +8135,18 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void perf_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, tset)
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task)
+static void perf_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -7001,11 +8159,9 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
task_function_call(task, __perf_cgroup_move, task);
}
-struct cgroup_subsys perf_subsys = {
- .name = "perf_event",
- .subsys_id = perf_subsys_id,
- .create = perf_cgroup_create,
- .destroy = perf_cgroup_destroy,
+struct cgroup_subsys perf_event_cgrp_subsys = {
+ .css_alloc = perf_cgroup_css_alloc,
+ .css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
};
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38b..1559fb0b929 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
#include <linux/smp.h>
#include <linux/hw_breakpoint.h>
-
-
/*
* Constraints data
*/
+struct bp_cpuinfo {
+ /* Number of pinned cpu breakpoints in a cpu */
+ unsigned int cpu_pinned;
+ /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+ unsigned int *tsk_pinned;
+ /* Number of non-pinned cpu/task breakpoints in a cpu */
+ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
-/* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
-
-/* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
-
-/* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
-
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
static int nr_slots[TYPE_MAX];
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+ return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
/* Keep track of the breakpoints attached to tasks */
static LIST_HEAD(bp_task_head);
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
int i;
- unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
for (i = nr_slots[type] - 1; i >= 0; i--) {
if (tsk_pinned[i] > 0)
@@ -111,20 +114,29 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
*/
-static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
+static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
struct task_struct *tsk = bp->hw.bp_target;
struct perf_event *iter;
int count = 0;
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
+ if (iter->hw.bp_target == tsk &&
+ find_slot_idx(iter) == type &&
+ (iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
return count;
}
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+ if (bp->cpu >= 0)
+ return cpumask_of(bp->cpu);
+ return cpu_possible_mask;
+}
+
/*
* Report the number of pinned/un-pinned breakpoints we have in
* a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -133,34 +145,23 @@ static void
fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
enum bp_type_idx type)
{
- int cpu = bp->cpu;
- struct task_struct *tsk = bp->hw.bp_target;
-
- if (cpu >= 0) {
- slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
- if (!tsk)
- slots->pinned += max_task_bp_pinned(cpu, type);
- else
- slots->pinned += task_bp_pinned(bp, type);
- slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
-
- return;
- }
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
- for_each_online_cpu(cpu) {
- unsigned int nr;
+ for_each_cpu(cpu, cpumask) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+ int nr;
- nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
- if (!tsk)
+ nr = info->cpu_pinned;
+ if (!bp->hw.bp_target)
nr += max_task_bp_pinned(cpu, type);
else
- nr += task_bp_pinned(bp, type);
+ nr += task_bp_pinned(cpu, bp, type);
if (nr > slots->pinned)
slots->pinned = nr;
- nr = per_cpu(nr_bp_flexible[type], cpu);
-
+ nr = info->flexible;
if (nr > slots->flexible)
slots->flexible = nr;
}
@@ -180,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
/*
* Add a pinned breakpoint for the given task in our constraint table
*/
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
enum bp_type_idx type, int weight)
{
- unsigned int *tsk_pinned;
- int old_count = 0;
- int old_idx = 0;
- int idx = 0;
-
- old_count = task_bp_pinned(bp, type);
- old_idx = old_count - 1;
- idx = old_idx + weight;
-
- /* tsk_pinned[n] is the number of tasks having n breakpoints */
- tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
- if (enable) {
- tsk_pinned[idx]++;
- if (old_count > 0)
- tsk_pinned[old_idx]--;
- } else {
- tsk_pinned[idx]--;
- if (old_count > 0)
- tsk_pinned[old_idx]++;
- }
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int old_idx, new_idx;
+
+ old_idx = task_bp_pinned(cpu, bp, type) - 1;
+ new_idx = old_idx + weight;
+
+ if (old_idx >= 0)
+ tsk_pinned[old_idx]--;
+ if (new_idx >= 0)
+ tsk_pinned[new_idx]++;
}
/*
@@ -212,33 +203,26 @@ static void
toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int weight)
{
- int cpu = bp->cpu;
- struct task_struct *tsk = bp->hw.bp_target;
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
- /* Pinned counter cpu profiling */
- if (!tsk) {
+ if (!enable)
+ weight = -weight;
- if (enable)
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
- else
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+ /* Pinned counter cpu profiling */
+ if (!bp->hw.bp_target) {
+ get_bp_info(bp->cpu, type)->cpu_pinned += weight;
return;
}
/* Pinned counter task profiling */
-
- if (!enable)
- list_del(&bp->hw.bp_list);
-
- if (cpu >= 0) {
- toggle_bp_task_slot(bp, cpu, enable, type, weight);
- } else {
- for_each_online_cpu(cpu)
- toggle_bp_task_slot(bp, cpu, enable, type, weight);
- }
+ for_each_cpu(cpu, cpumask)
+ toggle_bp_task_slot(bp, cpu, type, weight);
if (enable)
list_add_tail(&bp->hw.bp_list, &bp_task_head);
+ else
+ list_del(&bp->hw.bp_list);
}
/*
@@ -259,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to a single cpu, check:
*
- * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
*
* -> If there are already non-pinned counters in this cpu, it means
* there is already a free slot for them.
@@ -270,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to every cpus, check:
*
- * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
*
* -> This is roughly the same, except we check the number of per cpu
* bp for every cpu and we keep the max one. Same for the per tasks
@@ -282,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to a single cpu, check:
*
- * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
*
- * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ * -> Same checks as before. But now the info->flexible, if any, must keep
* one register at least (or they will never be fed).
*
* - If attached to every cpus, check:
*
- * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
+ * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
static int __reserve_bp_slot(struct perf_event *bp)
{
@@ -453,7 +437,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
int old_type = bp->attr.bp_type;
int err = 0;
- perf_event_disable(bp);
+ /*
+ * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+ * will not be possible to raise IPIs that invoke __perf_event_disable.
+ * So call the function directly after making sure we are targeting the
+ * current task.
+ */
+ if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+ __perf_event_disable(bp);
+ else
+ perf_event_disable(bp);
bp->attr.bp_addr = attr->bp_addr;
bp->attr.bp_type = attr->bp_type;
@@ -507,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
void *context)
{
- struct perf_event * __percpu *cpu_events, **pevent, *bp;
- long err;
+ struct perf_event * __percpu *cpu_events, *bp;
+ long err = 0;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -517,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
get_online_cpus();
for_each_online_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
-
- *pevent = bp;
-
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
- goto fail;
+ break;
}
- }
- put_online_cpus();
-
- return cpu_events;
-fail:
- for_each_online_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
- if (IS_ERR(*pevent))
- break;
- unregister_hw_breakpoint(*pevent);
+ per_cpu(*cpu_events, cpu) = bp;
}
put_online_cpus();
- free_percpu(cpu_events);
+ if (likely(!err))
+ return cpu_events;
+
+ unregister_wide_hw_breakpoint(cpu_events);
return (void __percpu __force *)ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -553,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
{
int cpu;
- struct perf_event **pevent;
- for_each_possible_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
- unregister_hw_breakpoint(*pevent);
- }
+ for_each_possible_cpu(cpu)
+ unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
free_percpu(cpu_events);
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -581,6 +562,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
if (bp->attr.type != PERF_TYPE_BREAKPOINT)
return -ENOENT;
+ /*
+ * no branch sampling for breakpoint events
+ */
+ if (has_branch_stack(bp))
+ return -EOPNOTSUPP;
+
err = register_perf_hw_breakpoint(bp);
if (err)
return err;
@@ -595,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
if (!(flags & PERF_EF_START))
bp->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(bp)) {
+ bp->hw.last_period = bp->hw.sample_period;
+ perf_swevent_set_period(bp);
+ }
+
return arch_install_hw_breakpoint(bp);
}
@@ -613,6 +605,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
bp->hw.state = PERF_HES_STOPPED;
}
+static int hw_breakpoint_event_idx(struct perf_event *bp)
+{
+ return 0;
+}
+
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
@@ -622,11 +619,12 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
+
+ .event_idx = hw_breakpoint_event_idx,
};
int __init init_hw_breakpoint(void)
{
- unsigned int **task_bp_pinned;
int cpu, err_cpu;
int i;
@@ -635,10 +633,11 @@ int __init init_hw_breakpoint(void)
for_each_possible_cpu(cpu) {
for (i = 0; i < TYPE_MAX; i++) {
- task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
- *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
- GFP_KERNEL);
- if (!*task_bp_pinned)
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+ GFP_KERNEL);
+ if (!info->tsk_pinned)
goto err_alloc;
}
}
@@ -651,10 +650,10 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ kfree(get_bp_info(err_cpu, i)->tsk_pinned);
if (err_cpu == cpu)
break;
- for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], cpu));
}
return -ENOMEM;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90af..569b218782a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
/* Buffer handling */
@@ -15,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
- int writable; /* are we writable */
+ int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
@@ -30,6 +31,10 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
@@ -76,32 +81,73 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline void
-__output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+{ \
+ unsigned long size, written; \
+ \
+ do { \
+ size = min(handle->size, len); \
+ written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
+ \
+ len -= written; \
+ handle->addr += written; \
+ buf += written; \
+ handle->size -= written; \
+ if (!handle->size) { \
+ struct ring_buffer *rb = handle->rb; \
+ \
+ handle->page++; \
+ handle->page &= rb->nr_pages - 1; \
+ handle->addr = rb->data_pages[handle->page]; \
+ handle->size = PAGE_SIZE << page_order(rb); \
+ } \
+ } while (len && written == size); \
+ \
+ return len; \
+}
+
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
+{
+ memcpy(dst, src, n);
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
{
- do {
- unsigned long size = min_t(unsigned long, handle->size, len);
-
- memcpy(handle->addr, buf, size);
-
- len -= size;
- handle->addr += size;
- buf += size;
- handle->size -= size;
- if (!handle->size) {
- struct ring_buffer *rb = handle->rb;
-
- handle->page++;
- handle->page &= rb->nr_pages - 1;
- handle->addr = rb->data_pages[handle->page];
- handle->size = PAGE_SIZE << page_order(rb);
- }
- } while (len);
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
}
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
@@ -133,4 +179,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
recursion[rctx]--;
}
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7..146a5792b1d 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,28 +12,10 @@
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/circ_buf.h>
#include "internal.h"
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
- unsigned long offset, unsigned long head)
-{
- unsigned long mask;
-
- if (!rb->writable)
- return true;
-
- mask = perf_data_size(rb) - 1;
-
- offset = (offset - tail) & mask;
- head = (head - tail) & mask;
-
- if ((int)(head - offset) < 0)
- return false;
-
- return true;
-}
-
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
@@ -75,15 +57,37 @@ again:
goto out;
/*
- * Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the rb->head read and this
- * write.
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * if (LOAD ->data_tail) { LOAD ->data_head
+ * (A) smp_rmb() (C)
+ * STORE $data LOAD $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head STORE ->data_tail
+ * }
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * In our case (A) is a control dependency that separates the load of
+ * the ->data_tail and the stores of $data. In case ->data_tail
+ * indicates there is no room in the buffer to store $data we do not.
+ *
+ * D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
*/
+ smp_wmb(); /* B, matches C */
rb->user_page->data_head = head;
/*
- * Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read rb->head.
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
@@ -102,8 +106,7 @@ int perf_output_begin(struct perf_output_handle *handle,
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
- int have_lost;
- struct perf_sample_data sample_data;
+ int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
@@ -118,55 +121,72 @@ int perf_output_begin(struct perf_output_handle *handle,
event = event->parent;
rb = rcu_dereference(event->rb);
- if (!rb)
+ if (unlikely(!rb))
goto out;
- handle->rb = rb;
- handle->event = event;
-
- if (!rb->nr_pages)
+ if (unlikely(!rb->nr_pages))
goto out;
+ handle->rb = rb;
+ handle->event = event;
+
have_lost = local_read(&rb->lost);
- if (have_lost) {
- lost_event.header.size = sizeof(lost_event);
- perf_event_header__init_id(&lost_event.header, &sample_data,
- event);
- size += lost_event.header.size;
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
}
perf_output_get_handle(handle);
do {
- /*
- * Userspace could choose to issue a mb() before updating the
- * tail pointer. So that all reads will be completed before the
- * write is issued.
- */
tail = ACCESS_ONCE(rb->user_page->data_tail);
- smp_rmb();
offset = head = local_read(&rb->head);
- head += size;
- if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
+
+ /*
+ * The above forms a control dependency barrier separating the
+ * @tail load above from the data stores below. Since the @tail
+ * load is required to compute the branch to fail below.
+ *
+ * A, matches D; the full memory barrier userspace SHOULD issue
+ * after reading the data and before storing the new tail
+ * position.
+ *
+ * See perf_output_put_handle().
+ */
+
+ head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
- if (head - local_read(&rb->wakeup) > rb->watermark)
+ /*
+ * We rely on the implied barrier() by local_cmpxchg() to ensure
+ * none of the data stores below can be lifted up by the compiler.
+ */
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
- handle->page = offset >> (PAGE_SHIFT + page_order(rb));
- handle->page &= rb->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
- handle->addr = rb->data_pages[handle->page];
- handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+ page_shift = PAGE_SHIFT + page_order(rb);
+
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
- if (have_lost) {
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
@@ -182,10 +202,16 @@ out:
return -ENOSPC;
}
-void perf_output_copy(struct perf_output_handle *handle,
+unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
- __output_copy(handle, buf, len);
+ return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+ unsigned int len)
+{
+ return __output_skip(handle, NULL, len);
}
void perf_output_end(struct perf_output_handle *handle)
@@ -206,7 +232,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
- rb->writable = 1;
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
@@ -306,11 +334,16 @@ void rb_free(struct ring_buffer *rb)
}
#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(rb)))
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -330,10 +363,11 @@ static void rb_free_work(struct work_struct *work)
int i, nr;
rb = container_of(work, struct ring_buffer, work);
- nr = 1 << page_order(rb);
+ nr = data_page_nr(rb);
base = rb->user_page;
- for (i = 0; i < nr + 1; i++)
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
@@ -367,7 +401,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
- rb->nr_pages = 1;
+ rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 00000000000..6f3254e8c13
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1993 @@
+/*
+ * User-space Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2012
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h> /* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rmap.h> /* anon_vma_prepare */
+#include <linux/mmu_notifier.h> /* set_pte_at_notify */
+#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/ptrace.h> /* user_enable_single_step */
+#include <linux/kdebug.h> /* notifier mechanism */
+#include "../../mm/internal.h" /* munlock_vma_page */
+#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/uprobes.h>
+
+#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
+#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
+
+static struct rb_root uprobes_tree = RB_ROOT;
+/*
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
+ */
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
+
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+
+#define UPROBES_HASH_SZ 13
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+static struct percpu_rw_semaphore dup_mmap_sem;
+
+/* Have a copy of original instruction */
+#define UPROBE_COPY_INSN 0
+
+struct uprobe {
+ struct rb_node rb_node; /* node in the rb tree */
+ atomic_t ref;
+ struct rw_semaphore register_rwsem;
+ struct rw_semaphore consumer_rwsem;
+ struct list_head pending_list;
+ struct uprobe_consumer *consumers;
+ struct inode *inode; /* Also hold a ref to inode */
+ loff_t offset;
+ unsigned long flags;
+
+ /*
+ * The generic code assumes that it has two members of unknown type
+ * owned by the arch-specific code:
+ *
+ * insn - copy_insn() saves the original instruction here for
+ * arch_uprobe_analyze_insn().
+ *
+ * ixol - potentially modified instruction to execute out of
+ * line, copied to xol_area by xol_get_insn_slot().
+ */
+ struct arch_uprobe arch;
+};
+
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
+/*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
+ * On a breakpoint hit, thread contests for a slot. It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct page *page;
+
+ /*
+ * We keep the vma's vm_start rather than a pointer to the vma
+ * itself. The probed process or a naughty kernel module could make
+ * the vma go away, and we must handle that reasonably gracefully.
+ */
+ unsigned long vaddr; /* Page(s) of instruction slots */
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ * - is_register: indicates if we are in register context.
+ * - Return 1 if the specified virtual address is in an
+ * executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+
+ if (is_register)
+ flags |= VM_WRITE;
+
+ return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
+}
+
+static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
+{
+ return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+}
+
+static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+{
+ return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma: vma that holds the pte pointing to page
+ * @addr: address the old @page is mapped at
+ * @page: the cowed page we are replacing by kpage
+ * @kpage: the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int err;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = addr;
+ const unsigned long mmun_end = addr + PAGE_SIZE;
+
+ /* For try_to_free_swap() and munlock_vma_page() below */
+ lock_page(page);
+
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ err = -EAGAIN;
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto unlock;
+
+ get_page(kpage);
+ page_add_new_anon_rmap(kpage, vma, addr);
+
+ if (!PageAnon(page)) {
+ dec_mm_counter(mm, MM_FILEPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
+ }
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ pte_unmap_unlock(ptep, ptl);
+
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_page(page);
+ put_page(page);
+
+ err = 0;
+ unlock:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ unlock_page(page);
+ return err;
+}
+
+/**
+ * is_swbp_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_swbp_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
+{
+ return *insn == UPROBE_SWBP_INSN;
+}
+
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ kunmap_atomic(kaddr);
+}
+
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+{
+ uprobe_opcode_t old_opcode;
+ bool is_swbp;
+
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ is_swbp = is_swbp_insn(&old_opcode);
+
+ if (is_swbp_insn(new_opcode)) {
+ if (is_swbp) /* register: already installed? */
+ return 0;
+ } else {
+ if (!is_swbp) /* unregister: was it changed by us? */
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify is_trap_at_addr and
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
+ *
+ * uprobe_write_opcode - write the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held for write.
+ * Return 0 (success) or a negative errno.
+ */
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
+ uprobe_opcode_t opcode)
+{
+ struct page *old_page, *new_page;
+ struct vm_area_struct *vma;
+ int ret;
+
+retry:
+ /* Read the page with vaddr into memory */
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ if (ret <= 0)
+ return ret;
+
+ ret = verify_opcode(old_page, vaddr, &opcode);
+ if (ret <= 0)
+ goto put_old;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
+ ret = -ENOMEM;
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ if (!new_page)
+ goto put_old;
+
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ goto put_new;
+
+ __SetPageUptodate(new_page);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ ret = __replace_page(vma, vaddr, old_page, new_page);
+ if (ret)
+ mem_cgroup_uncharge_page(new_page);
+
+put_new:
+ page_cache_release(new_page);
+put_old:
+ put_page(old_page);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+
+/**
+ * set_swbp - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @auprobe: arch specific probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+ if (l->inode < r->inode)
+ return -1;
+
+ if (l->inode > r->inode)
+ return 1;
+
+ if (l->offset < r->offset)
+ return -1;
+
+ if (l->offset > r->offset)
+ return 1;
+
+ return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe u = { .inode = inode, .offset = offset };
+ struct rb_node *n = uprobes_tree.rb_node;
+ struct uprobe *uprobe;
+ int match;
+
+ while (n) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ match = match_uprobe(&u, uprobe);
+ if (!match) {
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+ }
+
+ if (match < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe;
+
+ spin_lock(&uprobes_treelock);
+ uprobe = __find_uprobe(inode, offset);
+ spin_unlock(&uprobes_treelock);
+
+ return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+ struct rb_node **p = &uprobes_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct uprobe *u;
+ int match;
+
+ while (*p) {
+ parent = *p;
+ u = rb_entry(parent, struct uprobe, rb_node);
+ match = match_uprobe(uprobe, u);
+ if (!match) {
+ atomic_inc(&u->ref);
+ return u;
+ }
+
+ if (match < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ }
+
+ u = NULL;
+ rb_link_node(&uprobe->rb_node, parent, p);
+ rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+ /* get access + creation ref */
+ atomic_set(&uprobe->ref, 2);
+
+ return u;
+}
+
+/*
+ * Acquire uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ * increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ * get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+ struct uprobe *u;
+
+ spin_lock(&uprobes_treelock);
+ u = __insert_uprobe(uprobe);
+ spin_unlock(&uprobes_treelock);
+
+ return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe, *cur_uprobe;
+
+ uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+ if (!uprobe)
+ return NULL;
+
+ uprobe->inode = igrab(inode);
+ uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
+ init_rwsem(&uprobe->consumer_rwsem);
+
+ /* add to uprobes_tree, sorted on inode:offset */
+ cur_uprobe = insert_uprobe(uprobe);
+ /* a uprobe exists for this inode:offset combination */
+ if (cur_uprobe) {
+ kfree(uprobe);
+ uprobe = cur_uprobe;
+ iput(inode);
+ }
+
+ return uprobe;
+}
+
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ down_write(&uprobe->consumer_rwsem);
+ uc->next = uprobe->consumers;
+ uprobe->consumers = uc;
+ up_write(&uprobe->consumer_rwsem);
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
+ * or return false.
+ */
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ struct uprobe_consumer **con;
+ bool ret = false;
+
+ down_write(&uprobe->consumer_rwsem);
+ for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+ if (*con == uc) {
+ *con = uc->next;
+ ret = true;
+ break;
+ }
+ }
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
+{
+ struct page *page;
+ /*
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
+ */
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ copy_from_page(page, offset, insn, nbytes);
+ page_cache_release(page);
+
+ return 0;
+}
+
+static int copy_insn(struct uprobe *uprobe, struct file *filp)
+{
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = &uprobe->arch.insn;
+ int size = sizeof(uprobe->arch.insn);
+ int len, err = -EIO;
+
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
+
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
+ if (err)
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
+}
+
+static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
+ struct mm_struct *mm, unsigned long vaddr)
+{
+ int ret = 0;
+
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ return ret;
+
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ goto out;
+
+ ret = copy_insn(uprobe, file);
+ if (ret)
+ goto out;
+
+ ret = -ENOTSUPP;
+ if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
+ goto out;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
+ if (ret)
+ goto out;
+
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
+ smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ set_bit(UPROBE_COPY_INSN, &uprobe->flags);
+
+ out:
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long vaddr)
+{
+ bool first_uprobe;
+ int ret;
+
+ ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
+ if (ret)
+ return ret;
+
+ /*
+ * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
+ * the task can hit this breakpoint right after __replace_page().
+ */
+ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ if (first_uprobe)
+ set_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ ret = set_swbp(&uprobe->arch, mm, vaddr);
+ if (!ret)
+ clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ else if (first_uprobe)
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ return ret;
+}
+
+static int
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ return set_orig_insn(&uprobe->arch, mm, vaddr);
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+/*
+ * There could be threads that have already hit the breakpoint. They
+ * will recheck the current insn and restart if find_uprobe() fails.
+ * See find_active_uprobe().
+ */
+static void delete_uprobe(struct uprobe *uprobe)
+{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
+ spin_lock(&uprobes_treelock);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
+ iput(uprobe->inode);
+ put_uprobe(uprobe);
+}
+
+struct map_info {
+ struct map_info *next;
+ struct mm_struct *mm;
+ unsigned long vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
+{
+ struct map_info *next = info->next;
+ kfree(info);
+ return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+{
+ unsigned long pgoff = offset >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct map_info *curr = NULL;
+ struct map_info *prev = NULL;
+ struct map_info *info;
+ int more = 0;
+
+ again:
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (!valid_vma(vma, is_register))
+ continue;
+
+ if (!prev && !more) {
+ /*
+ * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * reclaim. This is optimistic, no harm done if it fails.
+ */
+ prev = kmalloc(sizeof(struct map_info),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (prev)
+ prev->next = NULL;
+ }
+ if (!prev) {
+ more++;
+ continue;
+ }
+
+ if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ continue;
+
+ info = prev;
+ prev = prev->next;
+ info->next = curr;
+ curr = info;
+
+ info->mm = vma->vm_mm;
+ info->vaddr = offset_to_vaddr(vma, offset);
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (!more)
+ goto out;
+
+ prev = curr;
+ while (curr) {
+ mmput(curr->mm);
+ curr = curr->next;
+ }
+
+ do {
+ info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+ if (!info) {
+ curr = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ info->next = prev;
+ prev = info;
+ } while (--more);
+
+ goto again;
+ out:
+ while (prev)
+ prev = free_map_info(prev);
+ return curr;
+}
+
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
+{
+ bool is_register = !!new;
+ struct map_info *info;
+ int err = 0;
+
+ percpu_down_write(&dup_mmap_sem);
+ info = build_map_info(uprobe->inode->i_mapping,
+ uprobe->offset, is_register);
+ if (IS_ERR(info)) {
+ err = PTR_ERR(info);
+ goto out;
+ }
+
+ while (info) {
+ struct mm_struct *mm = info->mm;
+ struct vm_area_struct *vma;
+
+ if (err && is_register)
+ goto free;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, info->vaddr);
+ if (!vma || !valid_vma(vma, is_register) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ goto unlock;
+
+ if (vma->vm_start > info->vaddr ||
+ vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
+ goto unlock;
+
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
+
+ unlock:
+ up_write(&mm->mmap_sem);
+ free:
+ mmput(mm);
+ info = free_map_info(info);
+ }
+ out:
+ percpu_up_write(&dup_mmap_sem);
+ return err;
+}
+
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
+}
+
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ int err;
+
+ if (WARN_ON(!consumer_del(uprobe, uc)))
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
+ /* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
+}
+
+/*
+ * uprobe_register - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @uc: information on howto handle the probe..
+ *
+ * Apart from the access refcount, uprobe_register() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple). Creation refcount stops uprobe_unregister from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @uc for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+ int ret;
+
+ /* Uprobe must have at least one set consumer */
+ if (!uc->handler && !uc->ret_handler)
+ return -EINVAL;
+
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
+ /* Racy, just to catch the obvious mistakes */
+ if (offset > i_size_read(inode))
+ return -EINVAL;
+
+ retry:
+ uprobe = alloc_uprobe(inode, offset);
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
+ }
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ return ret;
+}
+
+/*
+ * uprobe_unregister - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return;
+
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
+
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
+ }
+ up_read(&mm->mmap_sem);
+
+ return err;
+}
+
+static struct rb_node *
+find_node_in_range(struct inode *inode, loff_t min, loff_t max)
+{
+ struct rb_node *n = uprobes_tree.rb_node;
+
+ while (n) {
+ struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
+
+ if (inode < u->inode) {
+ n = n->rb_left;
+ } else if (inode > u->inode) {
+ n = n->rb_right;
+ } else {
+ if (max < u->offset)
+ n = n->rb_left;
+ else if (min > u->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
+ }
+
+ return n;
+}
+
+/*
+ * For a given range in vma, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *head)
+{
+ loff_t min, max;
+ struct rb_node *n, *t;
+ struct uprobe *u;
+
+ INIT_LIST_HEAD(head);
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ if (n) {
+ for (t = n; t; t = rb_prev(t)) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset < min)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ for (t = n; (t = rb_next(t)); ) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset > max)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ }
+ spin_unlock(&uprobes_treelock);
+}
+
+/*
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ *
+ * Currently we ignore all errors and always return 0, the callers
+ * can't handle the failure anyway.
+ */
+int uprobe_mmap(struct vm_area_struct *vma)
+{
+ struct list_head tmp_list;
+ struct uprobe *uprobe, *u;
+ struct inode *inode;
+
+ if (no_uprobe_events() || !valid_vma(vma, true))
+ return 0;
+
+ inode = file_inode(vma->vm_file);
+ if (!inode)
+ return 0;
+
+ mutex_lock(uprobes_mmap_hash(inode));
+ build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
+ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
+ install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ }
+ put_uprobe(uprobe);
+ }
+ mutex_unlock(uprobes_mmap_hash(inode));
+
+ return 0;
+}
+
+static bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ loff_t min, max;
+ struct inode *inode;
+ struct rb_node *n;
+
+ inode = file_inode(vma->vm_file);
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ spin_unlock(&uprobes_treelock);
+
+ return !!n;
+}
+
+/*
+ * Called in context of a munmap of a vma.
+ */
+void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ if (no_uprobe_events() || !valid_vma(vma, false))
+ return;
+
+ if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
+ return;
+
+ if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
+ test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ return;
+
+ if (vma_has_uprobes(vma, start, end))
+ set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+}
+
+/* Slot allocation for XOL */
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
+{
+ int ret = -EALREADY;
+
+ down_write(&mm->mmap_sem);
+ if (mm->uprobes_state.xol_area)
+ goto fail;
+
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
+ }
+
+ ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
+ if (ret)
+ goto fail;
+
+ smp_wmb(); /* pairs with get_xol_area() */
+ mm->uprobes_state.xol_area = area;
+ fail:
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+static struct xol_area *__create_xol_area(unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
+
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
+ if (unlikely(!area))
+ goto out;
+
+ area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+ if (!area->bitmap)
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
+
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
+ set_bit(0, area->bitmap);
+ atomic_set(&area->slot_count, 1);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+
+ if (!xol_add_vma(mm, area))
+ return area;
+
+ __free_page(area->page);
+ free_bitmap:
+ kfree(area->bitmap);
+ free_area:
+ kfree(area);
+ out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
+ area = mm->uprobes_state.xol_area;
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
+}
+
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+ struct xol_area *area = mm->uprobes_state.xol_area;
+
+ if (!area)
+ return;
+
+ put_page(area->page);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
+void uprobe_start_dup_mmap(void)
+{
+ percpu_down_read(&dup_mmap_sem);
+}
+
+void uprobe_end_dup_mmap(void)
+{
+ percpu_up_read(&dup_mmap_sem);
+}
+
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ newmm->uprobes_state.xol_area = NULL;
+
+ if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
+ set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
+ set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ }
+}
+
+/*
+ * - search for a free slot.
+ */
+static unsigned long xol_take_insn_slot(struct xol_area *area)
+{
+ unsigned long slot_addr;
+ int slot_nr;
+
+ do {
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ break;
+
+ slot_nr = UINSNS_PER_PAGE;
+ continue;
+ }
+ wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+ } while (slot_nr >= UINSNS_PER_PAGE);
+
+ slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
+ atomic_inc(&area->slot_count);
+
+ return slot_addr;
+}
+
+/*
+ * xol_get_insn_slot - allocate a slot for xol.
+ * Returns the allocated slot address or 0.
+ */
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+{
+ struct xol_area *area;
+ unsigned long xol_vaddr;
+
+ area = get_xol_area();
+ if (!area)
+ return 0;
+
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
+ return 0;
+
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+
+ return xol_vaddr;
+}
+
+/*
+ * xol_free_insn_slot - If slot was earlier allocated by
+ * @xol_get_insn_slot(), make the slot available for
+ * subsequent requests.
+ */
+static void xol_free_insn_slot(struct task_struct *tsk)
+{
+ struct xol_area *area;
+ unsigned long vma_end;
+ unsigned long slot_addr;
+
+ if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ return;
+
+ slot_addr = tsk->utask->xol_vaddr;
+ if (unlikely(!slot_addr))
+ return;
+
+ area = tsk->mm->uprobes_state.xol_area;
+ vma_end = area->vaddr + PAGE_SIZE;
+ if (area->vaddr <= slot_addr && slot_addr < vma_end) {
+ unsigned long offset;
+ int slot_nr;
+
+ offset = slot_addr - area->vaddr;
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ if (slot_nr >= UINSNS_PER_PAGE)
+ return;
+
+ clear_bit(slot_nr, area->bitmap);
+ atomic_dec(&area->slot_count);
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
+
+ tsk->utask->xol_vaddr = 0;
+ }
+}
+
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
+
+ /*
+ * We probably need flush_icache_user_range() but it needs vma.
+ * This should work on most of architectures by default. If
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+ return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+ struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
+
+ if (!utask)
+ return;
+
+ if (utask->active_uprobe)
+ put_uprobe(utask->active_uprobe);
+
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
+ xol_free_insn_slot(t);
+ kfree(utask);
+ t->utask = NULL;
+}
+
+/*
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *get_utask(void)
+{
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
+}
+
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->dup_xol_addr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ t->utask->dup_xol_addr = area->vaddr;
+ init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+ task_work_add(t, &t->utask->dup_xol_work, true);
+}
+
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
+ struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
+
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
+
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep trampoline address in stack, rather keep the
+ * original return address of first caller thru all the consequent
+ * instances. This also makes breakpoint unwrapping easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
+
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep. When xol insn itself
+ * triggers the signal, restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip). This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+ struct task_struct *t = current;
+ struct uprobe_task *utask = t->utask;
+
+ if (likely(!utask || !utask->active_uprobe))
+ return false;
+
+ WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+ if (signal_pending(t)) {
+ spin_lock_irq(&t->sighand->siglock);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ spin_unlock_irq(&t->sighand->siglock);
+
+ if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+ utask->state = UTASK_SSTEP_TRAPPED;
+ set_tsk_thread_flag(t, TIF_UPROBE);
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ }
+ }
+
+ return true;
+}
+
+static void mmf_recalc_uprobes(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!valid_vma(vma, false))
+ continue;
+ /*
+ * This is not strictly accurate, we can race with
+ * uprobe_unregister() and see the already removed
+ * uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
+ */
+ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
+ return;
+ }
+
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+}
+
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct page *page;
+ uprobe_opcode_t opcode;
+ int result;
+
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+
+ result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ if (result < 0)
+ return result;
+
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ put_page(page);
+ out:
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
+}
+
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, bp_vaddr);
+ if (vma && vma->vm_start <= bp_vaddr) {
+ if (valid_vma(vma, false)) {
+ struct inode *inode = file_inode(vma->vm_file);
+ loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+
+ uprobe = find_uprobe(inode, offset);
+ }
+
+ if (!uprobe)
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
+ } else {
+ *is_swbp = -EFAULT;
+ }
+
+ if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ mmf_recalc_uprobes(mm);
+ up_read(&mm->mmap_sem);
+
+ return uprobe;
+}
+
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
+
+ remove &= rc;
+ }
+
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instance's invalidated by
+ * longjmp(), currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+ utask->depth--;
+
+ if (!chained)
+ break;
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ unsigned long bp_vaddr;
+ int uninitialized_var(is_swbp);
+
+ bp_vaddr = uprobe_get_swbp_addr(regs);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (!uprobe) {
+ if (is_swbp > 0) {
+ /* No matching uprobe; signal SIGTRAP. */
+ send_sig(SIGTRAP, current, 0);
+ } else {
+ /*
+ * Either we raced with uprobe_unregister() or we can't
+ * access this memory. The latter is only possible if
+ * another thread plays with our ->mm. In both cases
+ * we can simply restart. If this vma was unmapped we
+ * can pretend this insn was not executed yet and get
+ * the (correct) SIGSEGV after restart.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+ }
+ return;
+ }
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
+ /*
+ * TODO: move copy_insn/etc into _register and remove this hack.
+ * After we hit the bp, _unregister + _register can install the
+ * new and not-yet-analyzed uprobe at the same address, restart.
+ */
+ smp_rmb(); /* pairs with wmb() in install_breakpoint() */
+ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
+ goto out;
+
+ /* Tracing handlers use ->utask to communicate with fetch methods */
+ if (!get_utask())
+ goto out;
+
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ goto out;
+
+ handler_chain(uprobe, regs);
+
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ goto out;
+
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
+ return;
+
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+out:
+ put_uprobe(uprobe);
+}
+
+/*
+ * Perform required fix-ups and disable singlestep.
+ * Allow pending signals to take effect.
+ */
+static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ int err = 0;
+
+ uprobe = utask->active_uprobe;
+ if (utask->state == UTASK_SSTEP_ACK)
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
+ else if (utask->state == UTASK_SSTEP_TRAPPED)
+ arch_uprobe_abort_xol(&uprobe->arch, regs);
+ else
+ WARN_ON_ONCE(1);
+
+ put_uprobe(uprobe);
+ utask->active_uprobe = NULL;
+ utask->state = UTASK_RUNNING;
+ xol_free_insn_slot(current);
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* see uprobe_deny_signal() */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
+}
+
+/*
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
+ * allows the thread to return from interrupt. After that handle_swbp()
+ * sets utask->active_uprobe.
+ *
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
+ * and allows the thread to return from interrupt.
+ *
+ * While returning to userspace, thread notices the TIF_UPROBE flag and calls
+ * uprobe_notify_resume().
+ */
+void uprobe_notify_resume(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+
+ clear_thread_flag(TIF_UPROBE);
+
+ utask = current->utask;
+ if (utask && utask->active_uprobe)
+ handle_singlestep(utask, regs);
+ else
+ handle_swbp(regs);
+}
+
+/*
+ * uprobe_pre_sstep_notifier gets called from interrupt context as part of
+ * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
+ */
+int uprobe_pre_sstep_notifier(struct pt_regs *regs)
+{
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
+ return 0;
+
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+/*
+ * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
+ * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */
+int uprobe_post_sstep_notifier(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (!current->mm || !utask || !utask->active_uprobe)
+ /* task is currently not uprobed */
+ return 0;
+
+ utask->state = UTASK_SSTEP_ACK;
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+static struct notifier_block uprobe_exception_nb = {
+ .notifier_call = arch_uprobe_exception_notify,
+ .priority = INT_MAX-1, /* notified after kprobes, kgdb */
+};
+
+static int __init init_uprobes(void)
+{
+ int i;
+
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
+ mutex_init(&uprobes_mmap_mutex[i]);
+
+ if (percpu_init_rwsem(&dup_mmap_sem))
+ return -ENOMEM;
+
+ return register_die_notifier(&uprobe_exception_nb);
+}
+__initcall(init_uprobes);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae37422..83d4382f569 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
- .pers_low = 0, /* PER_LINUX personality. */
+ .pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
- return (ep);
+ return ep;
}
int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
out:
write_unlock(&exec_domains_lock);
- return (err);
+ return err;
}
+EXPORT_SYMBOL(register_exec_domain);
int
unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
write_unlock(&exec_domains_lock);
return 0;
}
+EXPORT_SYMBOL(unregister_exec_domain);
int __set_personality(unsigned int personality)
{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
return 0;
}
+EXPORT_SYMBOL(__set_personality);
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
return old;
}
-
-
-EXPORT_SYMBOL(register_exec_domain);
-EXPORT_SYMBOL(unregister_exec_domain);
-EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6..e5c4668f179 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -52,6 +52,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
+#include <linux/shm.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -73,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
__this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
+ list_del_rcu(&p->thread_node);
}
/*
@@ -84,6 +86,7 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -122,9 +125,10 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime += tsk->utime;
- sig->stime += tsk->stime;
- sig->gtime += tsk->gtime;
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -309,244 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
- write_lock_irq(&tasklist_lock);
-
- ptrace_unlink(current);
- /* Reparent to init */
- current->real_parent = current->parent = kthreadd_task;
- list_move_tail(&current->sibling, &current->real_parent->children);
-
- /* Set the exit signal to SIGCHLD so we signal init on exit */
- current->exit_signal = SIGCHLD;
-
- if (task_nice(current) < 0)
- set_user_nice(current, 0);
- /* cpus_allowed? */
- /* rt_priority? */
- /* signals? */
- memcpy(current->signal->rlim, init_task.signal->rlim,
- sizeof(current->signal->rlim));
-
- atomic_inc(&init_cred.usage);
- commit_creds(&init_cred);
- write_unlock_irq(&tasklist_lock);
-}
-
-void __set_special_pids(struct pid *pid)
-{
- struct task_struct *curr = current->group_leader;
-
- if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
-
- if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
-}
-
-static void set_special_pids(struct pid *pid)
-{
- write_lock_irq(&tasklist_lock);
- __set_special_pids(pid);
- write_unlock_irq(&tasklist_lock);
-}
-
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- /* This is only needed for daemonize()'ed kthreads */
- sigdelset(&current->blocked, sig);
- /*
- * Kernel threads handle their own signals. Let the signal code
- * know it'll be handled, so that they don't get converted to
- * SIGKILL or just silently dropped.
- */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
-/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
- */
-
-void daemonize(const char *name, ...)
-{
- va_list args;
- sigset_t blocked;
-
- va_start(args, name);
- vsnprintf(current->comm, sizeof(current->comm), name, args);
- va_end(args);
-
- /*
- * If we were started as result of loading a module, close all of the
- * user space pages. We don't need them, and if we didn't close them
- * they would be locked into memory.
- */
- exit_mm(current);
- /*
- * We don't want to have TIF_FREEZE set if the system-wide hibernation
- * or suspend transition begins right now.
- */
- current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
- if (current->nsproxy != &init_nsproxy) {
- get_nsproxy(&init_nsproxy);
- switch_task_namespaces(current, &init_nsproxy);
- }
- set_special_pids(&init_struct_pid);
- proc_clear_tty(current);
-
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* Become as one with the init task */
-
- daemonize_fs_struct();
- exit_files(current);
- current->files = init_task.files;
- atomic_inc(&current->files->count);
-
- reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
-static void close_files(struct files_struct * files)
-{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
-
- /*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock because this is the last reference to the
- * files structure. But use RCU to shut RCU-lockdep up.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- rcu_read_unlock();
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file) {
- filp_close(file, files);
- cond_resched();
- }
- }
- i++;
- set >>= 1;
- }
- }
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
- struct fdtable *fdt;
-
- if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /*
- * Free the fd and fdset arrays if we expanded them.
- * If the fdtable was embedded, pass files for freeing
- * at the end of the RCU grace period. Otherwise,
- * you can free files immediately.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- if (fdt != &files->fdtab)
- kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
- rcu_read_unlock();
- }
-}
-
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
- struct files_struct * files = tsk->files;
-
- if (files) {
- task_lock(tsk);
- tsk->files = NULL;
- task_unlock(tsk);
- put_files_struct(files);
- }
-}
-
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
@@ -589,14 +356,18 @@ retry:
}
/*
- * Search through everything else. We should not get
- * here often
+ * Search through everything else, we should not get here often.
*/
- do_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- } while_each_thread(g, c);
-
+ for_each_process(g) {
+ if (g->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ if (c->mm)
+ break;
+ }
+ }
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
@@ -628,7 +399,7 @@ assign_new_owner:
task_unlock(c);
put_task_struct(c);
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
@@ -642,6 +413,7 @@ static void exit_mm(struct task_struct * tsk)
mm_release(tsk, mm);
if (!mm)
return;
+ sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
@@ -668,7 +440,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
@@ -686,11 +458,11 @@ static void exit_mm(struct task_struct * tsk)
}
/*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ * child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
__releases(&tasklist_lock)
@@ -710,17 +482,37 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
if (unlikely(pid_ns->child_reaper == father)) {
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns))
- panic("Attempted to kill init!");
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?:
+ father->exit_code);
+ }
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
+ } else if (father->signal->has_child_subreaper) {
+ struct task_struct *reaper;
+
/*
- * We can not clear ->child_reaper or leave it alone.
- * There may by stealth EXIT_DEAD tasks on ->children,
- * forget_original_parent() must move them somewhere.
+ * Find the first ancestor marked as child_subreaper.
+ * Note that the code below checks same_thread_group(reaper,
+ * pid_ns->child_reaper). This is what we need to DTRT in a
+ * PID namespace. However we still need the check above, see
+ * http://marc.info/?l=linux-kernel&m=131385460420380
*/
- pid_ns->child_reaper = init_pid_ns.child_reaper;
+ for (reaper = father->real_parent;
+ reaper != &init_task;
+ reaper = reaper->real_parent) {
+ if (same_thread_group(reaper, pid_ns->child_reaper))
+ break;
+ if (!reaper->signal->is_child_subreaper)
+ continue;
+ thread = reaper;
+ do {
+ if (!(thread->flags & PF_EXITING))
+ return reaper;
+ } while_each_thread(reaper, thread);
+ }
}
return pid_ns->child_reaper;
@@ -743,7 +535,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (same_thread_group(p->real_parent, father))
return;
- /* We don't want people slaying init. */
+ /* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
@@ -812,31 +604,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
forget_original_parent(tsk);
- exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
- /* Let father know we died
- *
- * Thread signals are configurable, but you aren't going to use
- * that to send signals to arbitrary processes.
- * That stops right now.
- *
- * If the parent exec id doesn't match the exec id we saved
- * when we started then we know the parent has changed security
- * domain.
- *
- * If our self_exec id doesn't match our parent_exec_id then
- * we have changed execution domain as these two values started
- * the same after a fork.
- */
- if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
- (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id))
- tsk->exit_signal = SIGCHLD;
-
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -876,9 +648,9 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
- "left\n",
- current->comm, free);
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -935,8 +707,6 @@ void do_exit(long code)
schedule();
}
- exit_irq_thread();
-
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
@@ -953,7 +723,7 @@ void do_exit(long code)
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
- sync_mm_rss(tsk, tsk->mm);
+ sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
@@ -979,7 +749,10 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
- check_stack_usage();
+ if (group_dead)
+ disassociate_ctty(1);
+ exit_task_namespaces(tsk);
+ exit_task_work(tsk);
exit_thread();
/*
@@ -990,21 +763,17 @@ void do_exit(long code)
*/
perf_event_exit_task(tsk);
- cgroup_exit(tsk, 1);
-
- if (group_dead)
- disassociate_ctty(1);
+ cgroup_exit(tsk);
module_put(task_thread_info(tsk)->exec_domain->module);
- proc_exit_connector(tsk);
-
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
- ptrace_put_breakpoints(tsk);
+ flush_ptrace_hw_breakpoint(tsk);
exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
@@ -1018,7 +787,7 @@ void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held(tsk);
+ debug_check_no_locks_held();
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
@@ -1030,10 +799,14 @@ void do_exit(long code)
exit_io_context(tsk);
if (tsk->splice_pipe)
- __free_pipe_info(tsk->splice_pipe);
+ free_pipe_info(tsk->splice_pipe);
+
+ if (tsk->task_frag.page)
+ put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
@@ -1206,7 +979,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
- uid_t uid = __task_cred(p)->uid;
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
if (!likely(wo->wo_flags & WEXITED))
@@ -1228,17 +1001,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
+ traced = ptrace_reparented(p);
/*
- * Try to move the task's state to DEAD
- * only one thread is allowed to do this:
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = xchg(&p->exit_state, EXIT_DEAD);
- if (state != EXIT_ZOMBIE) {
- BUG_ON(state != EXIT_DEAD);
+ state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
- }
-
- traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
* thread_group_leader() to filter out sub-threads.
@@ -1264,17 +1033,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_times() to get times for the thread
+ * We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
- psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
@@ -1299,7 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to EXIT_DEAD.
+ * thread can reap it because we its state == DEAD/TRACE.
*/
read_unlock(&tasklist_lock);
@@ -1336,22 +1105,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (!retval)
retval = pid;
- if (traced) {
+ if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
- /*
- * If this is not a sub-thread, notify the parent.
- * If parent wants a zombie, don't release it now.
- */
- if (thread_group_leader(p) &&
- !do_notify_parent(p, p->exit_signal)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
- if (p != NULL)
+ if (state == EXIT_DEAD)
release_task(p);
return retval;
@@ -1419,7 +1185,7 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
@@ -1492,7 +1258,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
@@ -1528,7 +1294,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
- int ret = eligible_child(wo, p);
+ int ret;
+
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1546,33 +1317,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- /* dead body doesn't have much to contribute */
- if (unlikely(p->exit_state == EXIT_DEAD)) {
+ if (unlikely(p->exit_state == EXIT_TRACE)) {
/*
- * But do not ignore this task until the tracer does
- * wait_task_zombie()->do_notify_parent().
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
*/
- if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+ if (likely(!ptrace))
wo->notask_error = 0;
return 0;
}
- /* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
- * A zombie ptracee is only visible to its ptracer.
- * Notification and reaping will be cascaded to the real
- * parent when the ptracer detaches.
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
*/
- if (likely(!ptrace) && unlikely(p->ptrace)) {
- /* it will become visible, clear notask_error */
- wo->notask_error = 0;
- return 0;
- }
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
- if (!delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
/*
* Allow access to stopped/continued state via zombie by
@@ -1598,19 +1380,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
wo->notask_error = 0;
} else {
/*
- * If @p is ptraced by a task in its real parent's group,
- * hide group stop/continued state when looking at @p as
- * the real parent; otherwise, a single stop can be
- * reported twice as group and ptrace stops.
- *
- * If a ptracer wants to distinguish the two events for its
- * own children, it should create a separate process which
- * takes the role of real parent.
- */
- if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
- return 0;
-
- /*
* @p is alive and it's gonna stop, continue or exit, so
* there always is something to wait for.
*/
@@ -1809,9 +1578,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1849,8 +1615,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b824..d8a6446adbc 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
+/* Cleared by build time tools if the table is already sorted. */
+u32 __initdata __visible main_extable_sort_needed = 1;
+
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- sort_extable(__start___ex_table, __stop___ex_table);
+ if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ pr_notice("Sorting __ex_table...\n");
+ sort_extable(__start___ex_table, __stop___ex_table);
+ }
}
/* Given an address, look for it in the exception tables. */
@@ -55,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
- addr <= (unsigned long)_einittext)
+ addr < (unsigned long)_einittext)
return 1;
return 0;
}
@@ -63,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
- addr <= (unsigned long)_etext)
+ addr < (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
diff --git a/kernel/fork.c b/kernel/fork.c
index b77fd559c78..6a13c46cd87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,12 +28,15 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -47,6 +50,7 @@
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
@@ -66,6 +70,10 @@
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
+#include <linux/signalfd.h>
+#include <linux/uprobes.h>
+#include <linux/aio.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -110,32 +118,69 @@ int nr_processes(void)
return total;
}
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct_node(node) \
- kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
-# define free_task_struct(tsk) \
- kmem_cache_free(task_struct_cachep, (tsk))
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+ kmem_cache_free(task_struct_cachep, tsk);
+}
#endif
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
+
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
-#ifdef CONFIG_DEBUG_STACK_USAGE
- gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
- gfp_t mask = GFP_KERNEL;
-#endif
- struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+ kmem_cache_free(thread_info_cache, ti);
}
+
+void thread_info_cache_init(void)
+{
+ thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ THREAD_SIZE, 0, NULL);
+ BUG_ON(thread_info_cache == NULL);
+}
+# endif
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -166,9 +211,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
void free_task(struct task_struct *tsk)
{
account_kernel_stack(tsk->stack, -1);
+ arch_release_thread_info(tsk->stack);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
+ arch_release_task_struct(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -192,6 +240,8 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
+ security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -201,17 +251,11 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
void __init fork_init(unsigned long mempages)
{
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
@@ -243,7 +287,7 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC];
}
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
@@ -258,21 +302,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
int node = tsk_fork_get_node(orig);
int err;
- prepare_to_copy(orig);
-
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
ti = alloc_thread_info_node(tsk, node);
- if (!ti) {
- free_task_struct(tsk);
- return NULL;
- }
+ if (!ti)
+ goto free_tsk;
err = arch_dup_task_struct(tsk, orig);
if (err)
- goto out;
+ goto free_ti;
tsk->stack = ti;
@@ -295,13 +335,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
+ tsk->task_frag.page = NULL;
account_kernel_stack(ti, 1);
return tsk;
-out:
+free_ti:
free_thread_info(ti);
+free_tsk:
free_task_struct(tsk);
return NULL;
}
@@ -313,10 +355,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
- struct mempolicy *pol;
+ uprobe_start_dup_mmap();
down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
/*
* Not linked in yet - no deadlock potential:
*/
@@ -324,9 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_cache = NULL;
- mm->free_area_cache = oldmm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ mm->vmacache_seqnum = 0;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -345,16 +386,15 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- long pages = vma_pages(mpnt);
- mm->total_vm -= pages;
vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -pages);
+ -vma_pages(mpnt));
continue;
}
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
- if (security_vm_enough_memory(len))
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
}
@@ -363,11 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto fail_nomem;
*tmp = *mpnt;
INIT_LIST_HEAD(&tmp->anon_vma_chain);
- pol = mpol_dup(vma_policy(mpnt));
- retval = PTR_ERR(pol);
- if (IS_ERR(pol))
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
goto fail_nomem_policy;
- vma_set_policy(tmp, pol);
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
@@ -375,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
@@ -386,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mapping->i_mmap_writable++;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- vma_prio_tree_add(tmp, mpnt);
+ if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+ vma_nonlinear_insert(tmp,
+ &mapping->i_mmap_nonlinear);
+ else
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -427,9 +470,10 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ uprobe_end_dup_mmap();
return retval;
fail_nomem_anon_vma_fork:
- mpol_put(pol);
+ mpol_put(vma_policy(tmp));
fail_nomem_policy:
kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
@@ -479,7 +523,7 @@ static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
- INIT_HLIST_HEAD(&mm->ioctx_list);
+ mm->ioctx_table = NULL;
#endif
}
@@ -489,19 +533,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
- mm->flags = (current->mm) ?
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
- mm->nr_ptes = 0;
+ atomic_long_set(&mm->nr_ptes, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
+ clear_tlb_flush_pending(mm);
- if (likely(!mm_alloc_pgd(mm))) {
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
mm->def_flags = 0;
+ }
+
+ if (likely(!mm_alloc_pgd(mm))) {
mmu_notifier_mm_init(mm);
return mm;
}
@@ -510,6 +558,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
return NULL;
}
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+}
+
/*
* Allocate and initialize an mm_struct.
*/
@@ -537,9 +602,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON(mm->pmd_huge_pte);
-#endif
+ check_mm(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -552,6 +615,7 @@ void mmput(struct mm_struct *mm)
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) {
+ uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
@@ -562,7 +626,6 @@ void mmput(struct mm_struct *mm)
list_del(&mm->mmlist);
spin_unlock(&mmlist_lock);
}
- put_swap_token(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
@@ -570,26 +633,6 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
- fput(mm->exe_file);
- mm->exe_file = NULL;
- }
-
-}
-
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
if (new_exe_file)
@@ -597,15 +640,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
if (mm->exe_file)
fput(mm->exe_file);
mm->exe_file = new_exe_file;
- mm->num_exe_file_vmas = 0;
}
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of
- * VM_EXECUTABLE vmas */
+ /* We need mmap_sem to protect against races with removal of exe_file */
down_read(&mm->mmap_sem);
exe_file = mm->exe_file;
if (exe_file)
@@ -667,6 +708,38 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
return mm;
}
+static void complete_vfork_done(struct task_struct *tsk)
+{
+ struct completion *vfork;
+
+ task_lock(tsk);
+ vfork = tsk->vfork_done;
+ if (likely(vfork)) {
+ tsk->vfork_done = NULL;
+ complete(vfork);
+ }
+ task_unlock(tsk);
+}
+
+static int wait_for_vfork_done(struct task_struct *child,
+ struct completion *vfork)
+{
+ int killed;
+
+ freezer_do_not_count();
+ killed = wait_for_completion_killable(vfork);
+ freezer_count();
+
+ if (killed) {
+ task_lock(child);
+ child->vfork_done = NULL;
+ task_unlock(child);
+ }
+
+ put_task_struct(child);
+ return killed;
+}
+
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
@@ -682,8 +755,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
*/
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- struct completion *vfork_done = tsk->vfork_done;
-
/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
if (unlikely(tsk->robust_list)) {
@@ -700,20 +771,17 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
exit_pi_state_list(tsk);
#endif
+ uprobe_free_utask(tsk);
+
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
- /* notify parent sleeping on vfork() */
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
-
/*
* If we're exiting normally, clear a user-space tid field if
* requested. We leave this alone when dying by signal, to leave
* the value intact in a core dump, and to save the unnecessary
- * trouble otherwise. Userland only wants this done for a sys_exit.
+ * trouble, say, a killed vfork parent shouldn't touch this mm.
+ * Userland only wants this done for a sys_exit.
*/
if (tsk->clear_child_tid) {
if (!(tsk->flags & PF_SIGNALED) &&
@@ -728,20 +796,24 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
}
tsk->clear_child_tid = NULL;
}
+
+ /*
+ * All done, finally we can wake up parent and return this mm to him.
+ * Also kthread_stop() uses this completion for synchronization.
+ */
+ if (tsk->vfork_done)
+ complete_vfork_done(tsk);
}
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
- if (!oldmm)
- return NULL;
-
mm = allocate_mm();
if (!mm)
goto fail_nomem;
@@ -749,14 +821,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
mm_init_cpumask(mm);
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
-
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -818,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
@@ -830,10 +900,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
goto fail_nomem;
good_mm:
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
@@ -901,9 +967,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
* Share io context with parent, if CLONE_IO is set
*/
if (clone_flags & CLONE_IO) {
- tsk->io_context = ioc_task_link(ioc);
- if (unlikely(!tsk->io_context))
- return -ENOMEM;
+ ioc_task_link(ioc);
+ tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
if (unlikely(!new_ioc))
@@ -935,8 +1000,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count))
+ if (atomic_dec_and_test(&sighand->count)) {
+ signalfd_cleanup(sighand);
kmem_cache_free(sighand_cachep, sighand);
+ }
}
@@ -977,9 +1044,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
+
+ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
init_waitqueue_head(&sig->wait_chldexit);
- if (clone_flags & CLONE_NEWPID)
- sig->flags |= SIGNAL_UNKILLABLE;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
@@ -1000,25 +1070,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_rwsem(&sig->group_rwsem);
#endif
- sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+ sig->has_child_subreaper = current->signal->has_child_subreaper ||
+ current->signal->is_child_subreaper;
+
mutex_init(&sig->cred_guard_mutex);
return 0;
}
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
-{
- unsigned long new_flags = p->flags;
-
- new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
- new_flags |= PF_FORKNOEXEC;
- new_flags |= PF_STARTING;
- p->flags = new_flags;
-}
-
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@ -1030,17 +1092,19 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- plist_head_init(&p->pi_waiters);
+ p->pi_waiters = RB_ROOT;
+ p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
+ p->pi_top_task = NULL;
#endif
}
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
mm->owner = p;
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Initialize POSIX timer handling for a single task.
@@ -1055,6 +1119,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}
+static inline void
+init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
+{
+ task->pids[type].pid = pid;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1065,7 +1135,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
*/
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
@@ -1073,11 +1142,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1103,6 +1174,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
+ /*
+ * If the new process will be in a different pid or user namespace
+ * do not allow it to share a thread group or signal handlers or
+ * parent with the forking task.
+ */
+ if (clone_flags & CLONE_SIGHAND) {
+ if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
+ (task_active_pid_ns(current) !=
+ current->nsproxy->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
@@ -1113,6 +1196,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1123,8 +1207,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
- if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- p->real_cred->user != INIT_USER)
+ if (p->real_cred->user != INIT_USER &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
@@ -1145,9 +1229,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- copy_flags(clone_flags, p);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
@@ -1158,9 +1242,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = p->prev_stime = 0;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
@@ -1185,21 +1275,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_cgroup;
+ goto bad_fork_cleanup_threadgroup_lock;
}
- mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+ seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- p->hardirqs_enabled = 1;
-#else
p->hardirqs_enabled = 0;
-#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
@@ -1221,13 +1307,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(p);
+ retval = sched_fork(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
@@ -1260,22 +1352,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+ retval = copy_thread(clone_flags, stack_start, stack_size, p);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (!pid)
goto bad_fork_cleanup_io;
}
- p->pid = pid_nr(pid);
- p->tgid = p->pid;
- if (clone_flags & CLONE_THREAD)
- p->tgid = current->tgid;
-
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
@@ -1310,28 +1397,32 @@ static struct task_struct *copy_process(unsigned long clone_flags,
clear_all_latency_tracing(p);
/* ok, now we should be set up.. */
- p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
- p->pdeath_signal = 0;
- p->exit_state = 0;
+ p->pid = pid_nr(pid);
+ if (clone_flags & CLONE_THREAD) {
+ p->exit_signal = -1;
+ p->group_leader = current->group_leader;
+ p->tgid = current->tgid;
+ } else {
+ if (clone_flags & CLONE_PARENT)
+ p->exit_signal = current->group_leader->exit_signal;
+ else
+ p->exit_signal = (clone_flags & CSIGNAL);
+ p->group_leader = p;
+ p->tgid = p->pid;
+ }
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
- /*
- * Ok, make it visible to the rest of the system.
- * We dont wake it up yet.
- */
- p->group_leader = p;
+ p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
+ p->task_works = NULL;
- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
- /* Need tasklist lock for parent etc handling! */
+ /*
+ * Make it visible to the rest of the system, but dont wake it up yet.
+ * Need tasklist lock for parent etc handling!
+ */
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
@@ -1361,36 +1452,44 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_free_pid;
}
- if (clone_flags & CLONE_THREAD) {
- current->signal->nr_threads++;
- atomic_inc(&current->signal->live);
- atomic_inc(&current->signal->sigcnt);
- p->group_leader = current->group_leader;
- list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
- }
-
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
+ init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
- if (is_child_reaper(pid))
- p->nsproxy->pid_ns->child_reaper = p;
+ init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ init_task_pid(p, PIDTYPE_SID, task_session(current));
+
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
- attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
- attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ attach_pid(p, PIDTYPE_PGID);
+ attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
+ } else {
+ current->signal->nr_threads++;
+ atomic_inc(&current->signal->live);
+ atomic_inc(&current->signal->sigcnt);
+ list_add_tail_rcu(&p->thread_group,
+ &p->group_leader->thread_group);
+ list_add_tail_rcu(&p->thread_node,
+ &p->signal->thread_head);
}
- attach_pid(p, PIDTYPE_PID, pid);
+ attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
@@ -1398,6 +1497,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
+ uprobe_copy_process(p, clone_flags);
return p;
@@ -1429,11 +1529,10 @@ bad_fork_cleanup_policy:
perf_event_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_cgroup:
+bad_fork_cleanup_threadgroup_lock:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
@@ -1445,12 +1544,6 @@ fork_out:
return ERR_PTR(retval);
}
-noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
-{
- memset(regs, 0, sizeof(struct pt_regs));
- return regs;
-}
-
static inline void init_idle_pids(struct pid_link *links)
{
enum pid_type type;
@@ -1461,13 +1554,10 @@ static inline void init_idle_pids(struct pid_link *links)
}
}
-struct task_struct * __cpuinit fork_idle(int cpu)
+struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- struct pt_regs regs;
-
- task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
- &init_struct_pid, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1484,7 +1574,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
@@ -1494,27 +1583,12 @@ long do_fork(unsigned long clone_flags,
long nr;
/*
- * Do some preliminary argument and permissions checking before we
- * actually start allocating stuff
- */
- if (clone_flags & CLONE_NEWUSER) {
- if (clone_flags & CLONE_THREAD)
- return -EINVAL;
- /* hopefully this check will go away when userns support is
- * complete
- */
- if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- !capable(CAP_SETGID))
- return -EPERM;
- }
-
- /*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
- if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1526,7 +1600,7 @@ long do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, regs, stack_size,
+ p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
/*
* Do this prior waking up the new thread - the thread pointer
@@ -1534,10 +1608,12 @@ long do_fork(unsigned long clone_flags,
*/
if (!IS_ERR(p)) {
struct completion vfork;
+ struct pid *pid;
trace_sched_process_fork(current, p);
- nr = task_pid_vnr(p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
@@ -1545,34 +1621,84 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
+ get_task_struct(p);
}
- /*
- * We set PF_STARTING at creation in case tracing wants to
- * use this to distinguish a fully live task from one that
- * hasn't finished SIGSTOP raising yet. Now we clear it
- * and set the child going.
- */
- p->flags &= ~PF_STARTING;
-
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
- ptrace_event(trace, nr);
+ ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
- freezer_do_not_count();
- wait_for_completion(&vfork);
- freezer_count();
- ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL);
+}
+
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+ return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+ /* can not support in nommu mode */
+ return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int, tls_val,
+ int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#elif defined(CONFIG_CLONE_BACKWARDS3)
+SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int, stack_size,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#endif
+{
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+}
+#endif
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
@@ -1622,7 +1748,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
{
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
@@ -1689,19 +1816,35 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
- err = check_unshare_flags(unshare_flags);
- if (err)
- goto bad_unshare_out;
-
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto bad_unshare_out;
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -1715,11 +1858,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
- err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+ err = unshare_userns(unshare_flags, &new_cred);
if (err)
goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
- if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1727,10 +1874,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
exit_sem(current);
}
- if (new_nsproxy) {
+ if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
- new_nsproxy = NULL;
- }
task_lock(current);
@@ -1752,11 +1897,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
task_unlock(current);
- }
- if (new_nsproxy)
- put_nsproxy(new_nsproxy);
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
+ }
+bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed..aa6a8aadb91 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
bool pm_freezing;
bool pm_nosig_freezing;
+/*
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
@@ -33,7 +39,7 @@ static DEFINE_SPINLOCK(freezer_lock);
*/
bool freezing_slow_path(struct task_struct *p)
{
- if (p->flags & PF_NOFREEZE)
+ if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
@@ -99,9 +105,9 @@ static void fake_signal_wake_up(struct task_struct *p)
* freeze_task - send a freeze request to given task
* @p: task to send the request to
*
- * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
- * flag and either sending a fake signal to it or waking it up, depending
- * on whether it has %PF_FREEZER_NOSIG set.
+ * If @p is freezing, the freeze request is sent either by sending a fake
+ * signal (if it's not a kernel thread) or waking it up (if it's a kernel
+ * thread).
*
* RETURNS:
* %false, if @p is not freezing or already frozen; %true, otherwise
@@ -110,23 +116,28 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
+ /*
+ * This check can race with freezer_do_not_count, but worst case that
+ * will result in an extra wakeup being sent to the task. It does not
+ * race with freezer_count(), the barriers in freezer_count() and
+ * freezer_should_skip() ensure that either freezer_count() sees
+ * freezing == true in try_to_freeze() and freezes, or
+ * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
+ * normally.
+ */
+ if (freezer_should_skip(p))
+ return false;
+
spin_lock_irqsave(&freezer_lock, flags);
if (!freezing(p) || frozen(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
- if (!(p->flags & PF_KTHREAD)) {
+ if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
- /*
- * fake_signal_wake_up() goes through p's scheduler
- * lock and guarantees that TASK_STOPPED/TRACED ->
- * TASK_RUNNING transition can't race with task state
- * testing in try_to_freeze_tasks().
- */
- } else {
+ else
wake_up_state(p, TASK_INTERRUPTIBLE);
- }
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173..b632b5f3f09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,14 +59,121 @@
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
#include <asm/futex.h>
-#include "rtmutex_common.h"
+#include "locking/rtmutex_common.h"
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `-------> mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
/*
* Futex flags used to encode options to functions and preserve them across
@@ -143,11 +250,59 @@ static const struct futex_q futex_q_init = {
* waiting on a futex.
*/
struct futex_hash_bucket {
+ atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static inline void futex_get_mm(union futex_key *key)
+{
+ atomic_inc(&key->private.mm->mm_count);
+ /*
+ * Ensure futex_get_mm() implies a full barrier such that
+ * get_futex_key() implies a full barrier. This is relied upon
+ * as full barrier (B), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
/*
* We hash on the keys returned from get_futex_key (see below).
@@ -157,7 +312,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
- return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+ return &futex_queues[hash & (futex_hashsize - 1)];
}
/*
@@ -183,10 +338,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode);
+ ihold(key->shared.inode); /* implies MB (B) */
break;
case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
+ futex_get_mm(key); /* implies MB (B) */
break;
}
}
@@ -221,10 +376,11 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -246,6 +402,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
return -EINVAL;
address -= key->both.offset;
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -254,11 +413,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
- return -EFAULT;
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
return 0;
}
@@ -283,7 +440,7 @@ again:
put_page(page);
/* serialize against __split_huge_page_splitting() */
local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
page_head = compound_head(page);
/*
* page_head is valid pointer but we must pin
@@ -362,10 +519,10 @@ again:
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = page_head->index;
+ key->shared.pgoff = basepage_index(page);
}
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
out:
unlock_page(page_head);
@@ -586,27 +743,74 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
+ */
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct plist_head *head;
struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex(&this->key, key)) {
/*
- * Another waiter already exists - bump up
- * the refcount and return its pi_state:
+ * Sanity check the waiter before increasing
+ * the refcount and attaching to it.
*/
pi_state = this->pi_state;
/*
- * Userspace might have messed up non-PI and PI futexes
+ * Userspace might have messed up non-PI and
+ * PI futexes [3]
*/
if (unlikely(!pi_state))
return -EINVAL;
@@ -614,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
WARN_ON(!atomic_read(&pi_state->refcount));
/*
- * When pi_state->owner is NULL then the owner died
- * and another waiter is on the fly. pi_state->owner
- * is fixed up by the task which acquires
- * pi_state->rt_mutex.
- *
- * We do not check for pid == 0 which can happen when
- * the owner died and robust_list_exit() cleared the
- * TID.
+ * Handle the owner died case:
*/
- if (pid && pi_state->owner) {
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and
+ * wakes the topmost waiter. The task which
+ * acquires the pi_state->rt_mutex will fixup
+ * owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user
+ * space TID is not 0. Inconsistent
+ * state. [5]
+ */
+ if (pid)
+ return -EINVAL;
+ /*
+ * Take a ref on the state and
+ * return. [4]
+ */
+ goto out_state;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner
+ * has not yet executed exit_pi_state_list()
+ * or some waiter acquired the rtmutex in the
+ * pi state, but did not yet fixup the TID in
+ * user space.
+ *
+ * Take a ref on the state and return. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
/*
- * Bail out if user space manipulated the
- * futex value.
+ * If the owner died bit is not set,
+ * then the pi_state must have an
+ * owner. [7]
*/
- if (pid != task_pid_vnr(pi_state->owner))
+ if (!pi_state->owner)
return -EINVAL;
}
+ /*
+ * Bail out if user space manipulated the
+ * futex value. If pi state exists then the
+ * owner TID must be the same as the user
+ * space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+
+ out_state:
atomic_inc(&pi_state->refcount);
*ps = pi_state;
-
return 0;
}
}
/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0
+ * the new pi_state to it, but bail out when TID = 0 [1]
*/
if (!pid)
return -ESRCH;
@@ -649,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
@@ -669,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return ret;
}
+ /*
+ * No existing pi state. First waiter. [2]
+ */
pi_state = alloc_pi_state();
/*
@@ -703,9 +951,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -715,7 +963,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, ownerdied = 0;
+ int lock_taken, ret, force_take = 0;
u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
@@ -740,10 +988,18 @@ retry:
return -EDEADLK;
/*
- * Surprise - we got the lock. Just return to userspace:
+ * Surprise - we got the lock, but we do not trust user space at all.
*/
- if (unlikely(!curval))
- return 1;
+ if (unlikely(!curval)) {
+ /*
+ * We verify whether there is kernel state for this
+ * futex. If not, we can safely assume, that the 0 ->
+ * TID transition is correct. If state exists, we do
+ * not bother to fixup the user space state as it was
+ * corrupted already.
+ */
+ return futex_top_waiter(hb, key) ? -EINVAL : 1;
+ }
uval = curval;
@@ -754,17 +1010,15 @@ retry:
newval = curval | FUTEX_WAITERS;
/*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
+ * Should we force take the futex? See below.
*/
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
+ if (unlikely(force_take)) {
+ /*
+ * Keep the OWNER_DIED and the WAITERS bit and set the
+ * new TID value.
+ */
newval = (curval & ~FUTEX_TID_MASK) | vpid;
- ownerdied = 0;
+ force_take = 0;
lock_taken = 1;
}
@@ -774,7 +1028,7 @@ retry:
goto retry;
/*
- * We took the lock due to owner died take over.
+ * We took the lock due to forced take over.
*/
if (unlikely(lock_taken))
return 1;
@@ -789,20 +1043,25 @@ retry:
switch (ret) {
case -ESRCH:
/*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
+ * We failed to find an owner for this
+ * futex. So we have no pi_state to block
+ * on. This can happen in two cases:
+ *
+ * 1) The owner died
+ * 2) A stale FUTEX_WAITERS bit
+ *
+ * Re-read the futex value.
*/
if (get_futex_value_locked(&curval, uaddr))
return -EFAULT;
/*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
+ * If the owner died or we have a stale
+ * WAITERS bit the owner TID in the user space
+ * futex is 0.
*/
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
+ if (!(curval & FUTEX_TID_MASK)) {
+ force_take = 1;
goto retry;
}
default:
@@ -829,6 +1088,7 @@ static void __unqueue_futex(struct futex_q *q)
hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
}
/*
@@ -839,6 +1099,9 @@ static void wake_futex(struct futex_q *q)
{
struct task_struct *p = q->task;
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
* a non-futex wake up happens on another CPU then the task
@@ -867,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ int ret = 0;
if (!pi_state)
return -EINVAL;
@@ -890,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = this->task;
/*
- * We pass it to the next owner. (The WAITERS bit is always
- * kept enabled while there is PI state around. We must also
- * preserve the owner died bit.)
+ * We pass it to the next owner. The WAITERS bit is always
+ * kept enabled while there is PI state around. We cleanup the
+ * owner died bit, because we are the owner.
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- int ret = 0;
-
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -974,7 +1234,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
int ret;
@@ -986,10 +1245,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
goto out;
hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+ goto out_put_key;
+
spin_lock(&hb->lock);
- head = &hb->chain;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
@@ -1007,6 +1270,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
+out_put_key:
put_futex_key(&key);
out:
return ret;
@@ -1022,7 +1286,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret;
@@ -1070,10 +1333,12 @@ retry_private:
goto retry;
}
- head = &hb1->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (match_futex (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
@@ -1081,11 +1346,13 @@ retry_private:
}
if (op_ret > 0) {
- head = &hb2->chain;
-
op_ret = 0;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
if (match_futex (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++op_ret >= nr_wake2)
break;
@@ -1094,6 +1361,7 @@ retry_private:
ret += op_ret;
}
+out_unlock:
double_unlock_hb(hb1, hb2);
out_put_keys:
put_futex_key(&key2);
@@ -1121,7 +1389,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
*/
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -1174,9 +1444,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1187,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1215,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
@@ -1237,8 +1509,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1249,12 +1521,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
int drop_count = 0, task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head1;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
* requeue_pi requires a pi_state, try to allocate it now
* without any locks in case it fails.
*/
@@ -1292,10 +1569,20 @@ retry:
if (unlikely(ret != 0))
goto out_put_key1;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry_private:
+ hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -1305,6 +1592,7 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1);
if (ret)
@@ -1337,16 +1625,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1354,6 +1651,7 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
@@ -1363,6 +1661,7 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
cond_resched();
@@ -1372,8 +1671,7 @@ retry_private:
}
}
- head1 = &hb1->chain;
- plist_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (task_count - nr_wake >= nr_requeue)
break;
@@ -1383,9 +1681,13 @@ retry_private:
/*
* FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter)) {
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
ret = -EINVAL;
break;
}
@@ -1435,6 +1737,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
/*
* drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1462,17 +1765,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
struct futex_hash_bucket *hb;
hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock);
+ spin_lock(&hb->lock); /* implies MB (A) */
return hb;
}
static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
spin_unlock(&hb->lock);
+ hb_waiters_dec(hb);
}
/**
@@ -1515,8 +1830,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1686,9 +2001,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1785,7 +2100,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- schedule();
+ freezable_schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -1803,8 +2118,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -1842,7 +2157,7 @@ retry_private:
ret = get_futex_value_locked(&uval, uaddr);
if (ret) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = get_user(uval, uaddr);
if (ret)
@@ -1856,7 +2171,7 @@ retry_private:
}
if (uval != val) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = -EWOULDBLOCK;
}
@@ -2004,7 +2319,7 @@ retry_private:
* Task is exiting and we just wait for the
* exit to complete.
*/
- queue_unlock(&q, hb);
+ queue_unlock(hb);
put_futex_key(&q.key);
cond_resched();
goto retry;
@@ -2056,7 +2371,7 @@ retry_private:
goto out_put_key;
out_unlock_put_key:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
out_put_key:
put_futex_key(&q.key);
@@ -2066,7 +2381,7 @@ out:
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
@@ -2088,7 +2403,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
u32 uval, vpid = task_pid_vnr(current);
int ret;
@@ -2112,9 +2426,10 @@ retry:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
- * anyone else up:
+ * anyone else up. We only try this if neither the waiters nor
+ * the owner died bit are set.
*/
- if (!(uval & FUTEX_OWNER_DIED) &&
+ if (!(uval & ~FUTEX_TID_MASK) &&
cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
@@ -2128,9 +2443,7 @@ retry:
* Ok, other tasks may need to be woken up - check waiters
* and do the wakeup if necessary:
*/
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -2146,11 +2459,9 @@ retry:
/*
* No waiters - kernel unlocks the futex:
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
- }
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
out_unlock:
spin_unlock(&hb->lock);
@@ -2182,9 +2493,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2207,6 +2518,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* Unqueue the futex_q and determine which it was.
*/
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2226,18 +2538,17 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2255,8 +2566,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2271,6 +2582,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
if (!bitset)
return -EINVAL;
@@ -2289,6 +2603,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* code while we sleep on uaddr.
*/
debug_rt_mutex_init_waiter(&rt_waiter);
+ RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&rt_waiter.tree_entry);
rt_waiter.task = NULL;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2307,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (ret)
goto out_key2;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (match_futex(&q.key, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -2342,7 +2667,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
* the pi_state.
*/
- WARN_ON(!&q.pi_state);
+ WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2369,7 +2694,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
- if (rt_mutex_owner(pi_mutex) == current)
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
@@ -2443,40 +2768,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
{
struct robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- /* If victim is in different user_ns, then uids are not
- comparable, so we must have CAP_SYS_PTRACE */
- if (cred->user->user_ns != pcred->user->user_ns) {
- if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
- goto ok;
- }
- /* If victim is in same user_ns, then uids are comparable */
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
-ok:
- head = p->robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
@@ -2628,7 +2942,7 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2955,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
}
switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_UNLOCK_PI:
+ case FUTEX_TRYLOCK_PI:
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, flags, val, timeout, val3);
- break;
+ return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, flags, val, val3);
- break;
+ return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- break;
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
- break;
+ return futex_lock_pi(uaddr, flags, val, timeout, 0);
case FUTEX_UNLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, flags);
- break;
+ return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
- break;
+ return futex_lock_pi(uaddr, flags, 0, timeout, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- break;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- break;
- default:
- ret = -ENOSYS;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
}
- return ret;
+ return -ENOSYS;
}
@@ -2720,10 +3029,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
- int i;
/*
* This will fail and we want it. Some arch implementations do
@@ -2737,8 +3046,31 @@ static int __init futex_init(void)
*/
if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ futex_detect_cmpxchg();
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c