aboutsummaryrefslogtreecommitdiff
path: root/arch/tile/kernel
diff options
context:
space:
mode:
authorChris Metcalf <cmetcalf@tilera.com>2013-09-16 14:41:21 -0400
committerChris Metcalf <cmetcalf@tilera.com>2013-09-16 15:47:32 -0400
commite823acc0a9e36a5645cdf0c57fa5f738b51bb999 (patch)
treed009fadb93f9867ec691ea3590d152d7982251d1 /arch/tile/kernel
parent88e2692a4dff0d1f80e9761ade3db2ea59206c1e (diff)
tile: remove stray blank space
The compat sys_llseek() definition addition added a bogus space on an otherwise-blank line. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Diffstat (limited to 'arch/tile/kernel')
-rw-r--r--arch/tile/kernel/compat.c2
1 files changed, 1 insertions, 1 deletions
diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c
index ed378416b86..49120843ff9 100644
--- a/arch/tile/kernel/compat.c
+++ b/arch/tile/kernel/compat.c
@@ -84,7 +84,7 @@ COMPAT_SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned int, offset_high,
{
return sys_llseek(fd, offset_high, offset_low, result, origin);
}
-
+
/* Provide the compat syscall number to call mapping. */
#undef __SYSCALL
#define __SYSCALL(nr, call) [nr] = (call),
'mode'>-rw-r--r--kernel/audit_tree.c459
-rw-r--r--kernel/audit_watch.c519
-rw-r--r--kernel/auditfilter.c1186
-rw-r--r--kernel/auditsc.c1670
-rw-r--r--kernel/backtracetest.c18
-rw-r--r--kernel/bounds.c10
-rw-r--r--kernel/capability.c211
-rw-r--r--kernel/cgroup.c6413
-rw-r--r--kernel/cgroup_debug.c107
-rw-r--r--kernel/cgroup_freezer.c611
-rw-r--r--kernel/compat.c705
-rw-r--r--kernel/configs.c7
-rw-r--r--kernel/context_tracking.c216
-rw-r--r--kernel/cpu.c467
-rw-r--r--kernel/cpu_pm.c233
-rw-r--r--kernel/cpuset.c2114
-rw-r--r--kernel/crash_dump.c45
-rw-r--r--kernel/cred.c556
-rw-r--r--kernel/debug/Makefile6
-rw-r--r--kernel/debug/debug_core.c1067
-rw-r--r--kernel/debug/debug_core.h85
-rw-r--r--kernel/debug/gdbstub.c1145
-rw-r--r--kernel/debug/kdb/.gitignore1
-rw-r--r--kernel/debug/kdb/Makefile25
-rw-r--r--kernel/debug/kdb/kdb_bp.c553
-rw-r--r--kernel/debug/kdb/kdb_bt.c210
-rw-r--r--kernel/debug/kdb/kdb_cmds31
-rw-r--r--kernel/debug/kdb/kdb_debugger.c182
-rw-r--r--kernel/debug/kdb/kdb_io.c852
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c263
-rw-r--r--kernel/debug/kdb/kdb_main.c2879
-rw-r--r--kernel/debug/kdb/kdb_private.h260
-rw-r--r--kernel/debug/kdb/kdb_support.c927
-rw-r--r--kernel/delayacct.c17
-rw-r--r--kernel/dma-coherent.c176
-rw-r--r--kernel/dma.c3
-rw-r--r--kernel/elfcore.c24
-rw-r--r--kernel/events/Makefile9
-rw-r--r--kernel/events/callchain.c209
-rw-r--r--kernel/events/core.c8168
-rw-r--r--kernel/events/hw_breakpoint.c662
-rw-r--r--kernel/events/internal.h198
-rw-r--r--kernel/events/ring_buffer.c417
-rw-r--r--kernel/events/uprobes.c1993
-rw-r--r--kernel/exec_domain.c75
-rw-r--r--kernel/exit.c1425
-rw-r--r--kernel/extable.c76
-rw-r--r--kernel/fork.c1334
-rw-r--r--kernel/freezer.c220
-rw-r--r--kernel/futex.c2497
-rw-r--r--kernel/futex_compat.c61
-rw-r--r--kernel/gcov/Kconfig79
-rw-r--r--kernel/gcov/Makefile33
-rw-r--r--kernel/gcov/base.c158
-rw-r--r--kernel/gcov/fs.c792
-rw-r--r--kernel/gcov/gcc_3_4.c562
-rw-r--r--kernel/gcov/gcc_4_7.c565
-rw-r--r--kernel/gcov/gcov.h85
-rw-r--r--kernel/groups.c271
-rw-r--r--kernel/hrtimer.c902
-rw-r--r--kernel/hung_task.c251
-rw-r--r--kernel/irq/Kconfig85
-rw-r--r--kernel/irq/Makefile6
-rw-r--r--kernel/irq/autoprobe.c77
-rw-r--r--kernel/irq/chip.c879
-rw-r--r--kernel/irq/debug.h45
-rw-r--r--kernel/irq/devres.c67
-rw-r--r--kernel/irq/dummychip.c61
-rw-r--r--kernel/irq/generic-chip.c591
-rw-r--r--kernel/irq/handle.c552
-rw-r--r--kernel/irq/internals.h205
-rw-r--r--kernel/irq/irqdesc.c554
-rw-r--r--kernel/irq/irqdomain.c711
-rw-r--r--kernel/irq/manage.c1488
-rw-r--r--kernel/irq/migration.c56
-rw-r--r--kernel/irq/numa_migrate.c119
-rw-r--r--kernel/irq/pm.c130
-rw-r--r--kernel/irq/proc.c286
-rw-r--r--kernel/irq/resend.c35
-rw-r--r--kernel/irq/settings.h156
-rw-r--r--kernel/irq/spurious.c317
-rw-r--r--kernel/irq_work.c205
-rw-r--r--kernel/itimer.c179
-rw-r--r--kernel/jump_label.c460
-rw-r--r--kernel/kallsyms.c290
-rw-r--r--kernel/kcmp.c197
-rw-r--r--kernel/kexec.c514
-rw-r--r--kernel/kfifo.c197
-rw-r--r--kernel/kgdb.c1736
-rw-r--r--kernel/kmod.c599
-rw-r--r--kernel/kprobes.c1857
-rw-r--r--kernel/ksysfs.c78
-rw-r--r--kernel/kthread.c638
-rw-r--r--kernel/latencytop.c64
-rw-r--r--kernel/lockdep_internals.h97
-rw-r--r--kernel/locking/Makefile28
-rw-r--r--kernel/locking/lglock.c89
-rw-r--r--kernel/locking/lockdep.c (renamed from kernel/lockdep.c)2275
-rw-r--r--kernel/locking/lockdep_internals.h170
-rw-r--r--kernel/locking/lockdep_proc.c (renamed from kernel/lockdep_proc.c)257
-rw-r--r--kernel/locking/lockdep_states.h9
-rw-r--r--kernel/locking/locktorture.c454
-rw-r--r--kernel/locking/mcs_spinlock.c210
-rw-r--r--kernel/locking/mcs_spinlock.h130
-rw-r--r--kernel/locking/mutex-debug.c (renamed from kernel/mutex-debug.c)32
-rw-r--r--kernel/locking/mutex-debug.h (renamed from kernel/mutex-debug.h)30
-rw-r--r--kernel/locking/mutex.c930
-rw-r--r--kernel/locking/mutex.h (renamed from kernel/mutex.h)22
-rw-r--r--kernel/locking/percpu-rwsem.c165
-rw-r--r--kernel/locking/qrwlock.c133
-rw-r--r--kernel/locking/rtmutex-debug.c (renamed from kernel/rtmutex-debug.c)84
-rw-r--r--kernel/locking/rtmutex-debug.h (renamed from kernel/rtmutex-debug.h)5
-rw-r--r--kernel/locking/rtmutex-tester.c (renamed from kernel/rtmutex-tester.c)75
-rw-r--r--kernel/locking/rtmutex.c1373
-rw-r--r--kernel/locking/rtmutex.h (renamed from kernel/rtmutex.h)5
-rw-r--r--kernel/locking/rtmutex_common.h (renamed from kernel/rtmutex_common.h)47
-rw-r--r--kernel/locking/rwsem-spinlock.c296
-rw-r--r--kernel/locking/rwsem-xadd.c513
-rw-r--r--kernel/locking/rwsem.c (renamed from kernel/rwsem.c)46
-rw-r--r--kernel/locking/semaphore.c (renamed from kernel/semaphore.c)40
-rw-r--r--kernel/locking/spinlock.c399
-rw-r--r--kernel/locking/spinlock_debug.c302
-rw-r--r--kernel/marker.c930
-rw-r--r--kernel/module-internal.h (renamed from kernel/cred-internals.h)15
-rw-r--r--kernel/module.c3035
-rw-r--r--kernel/module_signing.c250
-rw-r--r--kernel/mutex.c387
-rw-r--r--kernel/notifier.c59
-rw-r--r--kernel/ns_cgroup.c104
-rw-r--r--kernel/nsproxy.c161
-rw-r--r--kernel/padata.c1110
-rw-r--r--kernel/panic.c338
-rw-r--r--kernel/params.c542
-rw-r--r--kernel/pid.c263
-rw-r--r--kernel/pid_namespace.c260
-rw-r--r--kernel/pm_qos_params.c423
-rw-r--r--kernel/posix-cpu-timers.c1354
-rw-r--r--kernel/posix-timers.c517
-rw-r--r--kernel/power/Kconfig308
-rw-r--r--kernel/power/Makefile14
-rw-r--r--kernel/power/autosleep.c128
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/console.c168
-rw-r--r--kernel/power/disk.c926
-rw-r--r--kernel/power/hibernate.c1168
-rw-r--r--kernel/power/main.c912
-rw-r--r--kernel/power/power.h125
-rw-r--r--kernel/power/poweroff.c8
-rw-r--r--kernel/power/process.c226
-rw-r--r--kernel/power/qos.c601
-rw-r--r--kernel/power/snapshot.c733
-rw-r--r--kernel/power/suspend.c443
-rw-r--r--kernel/power/suspend_test.c185
-rw-r--r--kernel/power/swap.c1375
-rw-r--r--kernel/power/swsusp.c386
-rw-r--r--kernel/power/user.c277
-rw-r--r--kernel/power/wakelock.c268
-rw-r--r--kernel/printk.c1302
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/braille.c49
-rw-r--r--kernel/printk/braille.h48
-rw-r--r--kernel/printk/console_cmdline.h14
-rw-r--r--kernel/printk/printk.c3009
-rw-r--r--kernel/profile.c176
-rw-r--r--kernel/ptrace.c975
-rw-r--r--kernel/range.c163
-rw-r--r--kernel/rcu/Makefile6
-rw-r--r--kernel/rcu/rcu.h134
-rw-r--r--kernel/rcu/rcutorture.c1707
-rw-r--r--kernel/rcu/srcu.c693
-rw-r--r--kernel/rcu/tiny.c384
-rw-r--r--kernel/rcu/tiny_plugin.h174
-rw-r--r--kernel/rcu/tree.c3744
-rw-r--r--kernel/rcu/tree.h600
-rw-r--r--kernel/rcu/tree_plugin.h2854
-rw-r--r--kernel/rcu/tree_trace.c501
-rw-r--r--kernel/rcu/update.c352
-rw-r--r--kernel/rcuclassic.c788
-rw-r--r--kernel/rcupdate.c189
-rw-r--r--kernel/rcupreempt.c1505
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c1227
-rw-r--r--kernel/rcutree.c1532
-rw-r--r--kernel/rcutree_trace.c271
-rw-r--r--kernel/reboot.c433
-rw-r--r--kernel/relay.c96
-rw-r--r--kernel/res_counter.c146
-rw-r--r--kernel/resource.c647
-rw-r--r--kernel/rtmutex.c1006
-rw-r--r--kernel/sched.c10022
-rw-r--r--kernel/sched/Makefile21
-rw-r--r--kernel/sched/auto_group.c260
-rw-r--r--kernel/sched/auto_group.h64
-rw-r--r--kernel/sched/clock.c422
-rw-r--r--kernel/sched/completion.c299
-rw-r--r--kernel/sched/core.c8096
-rw-r--r--kernel/sched/cpuacct.c283
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.c229
-rw-r--r--kernel/sched/cpudeadline.h33
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)155
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)13
-rw-r--r--kernel/sched/cputime.c838
-rw-r--r--kernel/sched/deadline.c1676
-rw-r--r--kernel/sched/debug.c665
-rw-r--r--kernel/sched/fair.c7815
-rw-r--r--kernel/sched/features.h85
-rw-r--r--kernel/sched/idle.c273
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)80
-rw-r--r--kernel/sched/proc.c591
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)1141
-rw-r--r--kernel/sched/sched.h1549
-rw-r--r--kernel/sched/stats.c145
-rw-r--r--kernel/sched/stats.h267
-rw-r--r--kernel/sched/stop_task.c131
-rw-r--r--kernel/sched/wait.c (renamed from kernel/wait.c)236
-rw-r--r--kernel/sched_clock.c270
-rw-r--r--kernel/sched_debug.c509
-rw-r--r--kernel/sched_fair.c1835
-rw-r--r--kernel/sched_features.h16
-rw-r--r--kernel/sched_stats.h375
-rw-r--r--kernel/seccomp.c473
-rw-r--r--kernel/signal.c2309
-rw-r--r--kernel/smp.c810
-rw-r--r--kernel/smpboot.c313
-rw-r--r--kernel/smpboot.h20
-rw-r--r--kernel/softirq.c671
-rw-r--r--kernel/softlockup.c378
-rw-r--r--kernel/spinlock.c455
-rw-r--r--kernel/srcu.c257
-rw-r--r--kernel/stacktrace.c14
-rw-r--r--kernel/stop_machine.c709
-rw-r--r--kernel/sys.c1932
-rw-r--r--kernel/sys_ni.c44
-rw-r--r--kernel/sysctl.c2776
-rw-r--r--kernel/sysctl_binary.c1493
-rw-r--r--kernel/sysctl_check.c1545
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c105
-rw-r--r--kernel/task_work.c128
-rw-r--r--kernel/taskstats.c331
-rw-r--r--kernel/test_kprobes.c14
-rw-r--r--kernel/time.c123
-rw-r--r--kernel/time/Kconfig199
-rw-r--r--kernel/time/Makefile10
-rw-r--r--kernel/time/alarmtimer.c861
-rw-r--r--kernel/time/clockevents.c613
-rw-r--r--kernel/time/clocksource.c966
-rw-r--r--kernel/time/jiffies.c72
-rw-r--r--kernel/time/ntp.c704
-rw-r--r--kernel/time/ntp_internal.h12
-rw-r--r--kernel/time/posix-clock.c446
-rw-r--r--kernel/time/sched_clock.c217
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c106
-rw-r--r--kernel/time/tick-broadcast.c591
-rw-r--r--kernel/time/tick-common.c267
-rw-r--r--kernel/time/tick-internal.h52
-rw-r--r--kernel/time/tick-oneshot.c69
-rw-r--r--kernel/time/tick-sched.c964
-rw-r--r--kernel/time/timecompare.c191
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/time/timekeeping.c1649
-rw-r--r--kernel/time/timekeeping_debug.c74
-rw-r--r--kernel/time/timekeeping_internal.h14
-rw-r--r--kernel/time/timer_list.c154
-rw-r--r--kernel/time/timer_stats.c52
-rw-r--r--kernel/timeconst.bc108
-rw-r--r--kernel/timeconst.pl378
-rw-r--r--kernel/timer.c877
-rw-r--r--kernel/torture.c733
-rw-r--r--kernel/trace/Kconfig486
-rw-r--r--kernel/trace/Makefile39
-rw-r--r--kernel/trace/blktrace.c1838
-rw-r--r--kernel/trace/ftrace.c5003
-rw-r--r--kernel/trace/power-traces.c17
-rw-r--r--kernel/trace/ring_buffer.c4139
-rw-r--r--kernel/trace/ring_buffer_benchmark.c488
-rw-r--r--kernel/trace/rpm-traces.c20
-rw-r--r--kernel/trace/trace.c7305
-rw-r--r--kernel/trace/trace.h1258
-rw-r--r--kernel/trace/trace_benchmark.c198
-rw-r--r--kernel/trace/trace_benchmark.h41
-rw-r--r--kernel/trace/trace_boot.c186
-rw-r--r--kernel/trace/trace_branch.c315
-rw-r--r--kernel/trace/trace_clock.c137
-rw-r--r--kernel/trace/trace_entries.h324
-rw-r--r--kernel/trace/trace_event_perf.c370
-rw-r--r--kernel/trace/trace_events.c2728
-rw-r--r--kernel/trace/trace_events_filter.c2449
-rw-r--r--kernel/trace/trace_events_filter_test.h50
-rw-r--r--kernel/trace/trace_events_trigger.c1437
-rw-r--r--kernel/trace/trace_export.c197
-rw-r--r--kernel/trace/trace_functions.c589
-rw-r--r--kernel/trace/trace_functions_graph.c1482
-rw-r--r--kernel/trace/trace_hw_branches.c195
-rw-r--r--kernel/trace/trace_irqsoff.c471
-rw-r--r--kernel/trace/trace_kdb.c135
-rw-r--r--kernel/trace/trace_kprobe.c1496
-rw-r--r--kernel/trace/trace_mmiotrace.c74
-rw-r--r--kernel/trace/trace_nop.c10
-rw-r--r--kernel/trace/trace_output.c1543
-rw-r--r--kernel/trace/trace_output.h55
-rw-r--r--kernel/trace/trace_power.c179
-rw-r--r--kernel/trace/trace_printk.c368
-rw-r--r--kernel/trace/trace_probe.c726
-rw-r--r--kernel/trace/trace_probe.h400
-rw-r--r--kernel/trace/trace_sched_switch.c155
-rw-r--r--kernel/trace/trace_sched_wakeup.c620
-rw-r--r--kernel/trace/trace_selftest.c812
-rw-r--r--kernel/trace/trace_selftest_dynamic.c6
-rw-r--r--kernel/trace/trace_stack.c302
-rw-r--r--kernel/trace/trace_stat.c359
-rw-r--r--kernel/trace/trace_stat.h33
-rw-r--r--kernel/trace/trace_syscalls.c762
-rw-r--r--kernel/trace/trace_sysprof.c334
-rw-r--r--kernel/trace/trace_uprobe.c1340
-rw-r--r--kernel/tracepoint.c668
-rw-r--r--kernel/tsacct.c76
-rw-r--r--kernel/uid16.c106
-rw-r--r--kernel/up.c71
-rw-r--r--kernel/user-return-notifier.c44
-rw-r--r--kernel/user.c385
-rw-r--r--kernel/user_namespace.c911
-rw-r--r--kernel/utsname.c88
-rw-r--r--kernel/utsname_sysctl.c67
-rw-r--r--kernel/watchdog.c648
-rw-r--r--kernel/workqueue.c5160
-rw-r--r--kernel/workqueue_internal.h74
337 files changed, 164955 insertions, 61335 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f43..790d83c7d16 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,5 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b0..2a202a84675 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
default 1000 if HZ_1000
config SCHED_HRTICK
- def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+ def_bool HIGH_RES_TIMERS
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 00000000000..76768ee812b
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,239 @@
+#
+# The ARCH_INLINE foo is necessary because select ignores "depends on"
+#
+config ARCH_INLINE_SPIN_TRYLOCK
+ bool
+
+config ARCH_INLINE_SPIN_TRYLOCK_BH
+ bool
+
+config ARCH_INLINE_SPIN_LOCK
+ bool
+
+config ARCH_INLINE_SPIN_LOCK_BH
+ bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQ
+ bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQSAVE
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK_BH
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQ
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+ bool
+
+
+config ARCH_INLINE_READ_TRYLOCK
+ bool
+
+config ARCH_INLINE_READ_LOCK
+ bool
+
+config ARCH_INLINE_READ_LOCK_BH
+ bool
+
+config ARCH_INLINE_READ_LOCK_IRQ
+ bool
+
+config ARCH_INLINE_READ_LOCK_IRQSAVE
+ bool
+
+config ARCH_INLINE_READ_UNLOCK
+ bool
+
+config ARCH_INLINE_READ_UNLOCK_BH
+ bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQ
+ bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+ bool
+
+
+config ARCH_INLINE_WRITE_TRYLOCK
+ bool
+
+config ARCH_INLINE_WRITE_LOCK
+ bool
+
+config ARCH_INLINE_WRITE_LOCK_BH
+ bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQ
+ bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQSAVE
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK_BH
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQ
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ bool
+
+config UNINLINE_SPIN_UNLOCK
+ bool
+
+#
+# lock_* functions are inlined when:
+# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
+#
+# trylock_* functions are inlined when:
+# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+# unlock and unlock_irq functions are inlined when:
+# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+# or
+# - DEBUG_SPINLOCK=n and PREEMPT=n
+#
+# unlock_bh and unlock_irqrestore functions are inlined when:
+# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+
+if !DEBUG_SPINLOCK
+
+config INLINE_SPIN_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK
+
+config INLINE_SPIN_TRYLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK_BH
+
+config INLINE_SPIN_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+
+config INLINE_SPIN_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
+
+config INLINE_SPIN_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
+
+config INLINE_SPIN_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
+
+config INLINE_SPIN_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_BH
+
+config INLINE_SPIN_UNLOCK_IRQ
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+
+config INLINE_SPIN_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+
+
+config INLINE_READ_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_READ_TRYLOCK
+
+config INLINE_READ_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+
+config INLINE_READ_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
+
+config INLINE_READ_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
+
+config INLINE_READ_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
+
+config INLINE_READ_UNLOCK
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+
+config INLINE_READ_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_BH
+
+config INLINE_READ_UNLOCK_IRQ
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+
+config INLINE_READ_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+
+
+config INLINE_WRITE_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_WRITE_TRYLOCK
+
+config INLINE_WRITE_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+
+config INLINE_WRITE_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
+
+config INLINE_WRITE_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
+
+config INLINE_WRITE_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
+
+config INLINE_WRITE_UNLOCK
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+
+config INLINE_WRITE_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_BH
+
+config INLINE_WRITE_UNLOCK_IRQ
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+
+config INLINE_WRITE_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
+
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
+
+config MUTEX_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b35..3f9c97419f0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,8 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
+ select PREEMPT_COUNT
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -52,3 +54,5 @@ config PREEMPT
endchoice
+config PREEMPT_COUNT
+ bool \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3ba55..f2a8b6246ce 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,125 +2,222 @@
# Makefile for the linux kernel.
#
-obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
- sysctl.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
- async.o
+ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
+ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ extable.o params.o posix-timers.o \
+ kthread.o sys_ni.o posix-cpu-timers.o \
+ hrtimer.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+ async.o range.o groups.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
endif
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
+obj-y += sched/
+obj-y += locking/
+obj-y += power/
+obj-y += printk/
+obj-y += irq/
+obj-y += rcu/
+
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
-obj-$(CONFIG_LOCKDEP) += lockdep.o
-ifeq ($(CONFIG_PROC_FS),y)
-obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
-endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
-obj-$(CONFIG_PM) += power/
-obj-$(CONFIG_FREEZER) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
-obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_KGDB) += debug/
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
-obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
-obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
-obj-$(CONFIG_SMP) += sched_cpupri.o
-
-ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
-# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-# needed for x86 only. Why this used to be enabled for all architectures is beyond
-# me. I suspect most platforms don't need this, but until we know that for sure
-# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
-# to get a correct value for the wait-channel (WCHAN in ps). --davidm
-CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
-endif
+obj-$(CONFIG_TRACE_CLOCK) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-$(CONFIG_CPU_PM) += cpu_pm.o
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-quiet_cmd_ikconfiggz = IKCFG $@
- cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call if_changed,ikconfiggz)
+ $(call filechk,ikconfiggz)
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
+###############################################################################
+#
+# Roll all the X.509 certificates that we can find together and pull them into
+# the kernel so that they get loaded into the system trusted keyring during
+# boot.
+#
+# We look in the source root and the build root for all files whose name ends
+# in ".x509". Unfortunately, this will generate duplicate filenames, so we
+# have make canonicalise the pathnames and then sort them to discard the
+# duplicates.
+#
+###############################################################################
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+ $(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
+
+ifeq ($(X509_CERTIFICATES),)
+$(warning *** No X.509 certificates found ***)
+endif
+
+ifneq ($(wildcard $(obj)/.x509.list),)
+ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
+$(info X.509 certificate list changed)
+$(shell rm $(obj)/.x509.list)
+endif
+endif
+
+kernel/system_certificates.o: $(obj)/x509_certificate_list
+
+quiet_cmd_x509certs = CERTS $@
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+
+targets += $(obj)/x509_certificate_list
+$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
+ $(call if_changed,x509certs)
+
+targets += $(obj)/.x509.list
+$(obj)/.x509.list:
+ @echo $(X509_CERTIFICATES) >$@
+endif
+
+clean-files := x509_certificate_list .x509.list
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+ifndef CONFIG_MODULE_SIG_HASH
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and uses a hardware random"
+ @echo "### number generator if one is available."
+ @echo "###"
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv 2>&1
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "O = Magrathea"
+ @echo >>x509.genkey "CN = Glacier signing key"
+ @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa3156416..808a86ff229 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
* the cache line to have the data after getting the lock.
*/
struct bsd_acct_struct {
- volatile int active;
- volatile int needcheck;
+ int active;
+ unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct timer_list timer;
struct list_head list;
};
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
static LIST_HEAD(acct_list);
/*
- * Called whenever the timer says to check the free space.
- */
-static void acct_timeout(unsigned long x)
-{
- struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
- acct->needcheck = 1;
-}
-
-/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,23 +102,23 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
struct kstatfs sbuf;
int res;
int act;
- sector_t resume;
- sector_t suspend;
+ u64 resume;
+ u64 suspend;
spin_lock(&acct_lock);
res = acct->active;
- if (!file || !acct->needcheck)
+ if (!file || time_is_before_jiffies(acct->needcheck))
goto out;
spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(file->f_path.dentry, &sbuf))
+ if (vfs_statfs(&file->f_path, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
- sector_div(suspend, 100);
- sector_div(resume, 100);
+ do_div(suspend, 100);
+ do_div(resume, 100);
if (sbuf.f_bavail <= suspend)
act = -1;
@@ -144,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
- res = act>0;
+ res = act > 0;
goto out;
}
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
}
}
- del_timer(&acct->timer);
- acct->needcheck = 0;
- acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct->timer);
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
res = acct->active;
out:
spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
if (acct->file) {
old_acct = acct->file;
old_ns = acct->ns;
- del_timer(&acct->timer);
acct->active = 0;
- acct->needcheck = 0;
acct->file = NULL;
acct->ns = NULL;
list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
if (file) {
acct->file = file;
acct->ns = ns;
- acct->needcheck = 0;
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
acct->active = 1;
list_add(&acct->list, &acct_list);
- /* It's been deleted if it was used before so this is safe */
- setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
- acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct->timer);
}
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
@@ -212,19 +193,19 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
}
}
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
{
struct file *file;
- int error;
+ struct vfsmount *mnt;
struct pid_namespace *ns;
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -243,24 +224,18 @@ static int acct_on(char *name)
}
}
- error = security_acct(file);
- if (error) {
- kfree(acct);
- filp_close(file, NULL);
- return error;
- }
-
spin_lock(&acct_lock);
if (ns->bacct == NULL) {
ns->bacct = acct;
acct = NULL;
}
- mnt_pin(file->f_path.mnt);
+ mnt = file->f_path.mnt;
+ mnt_pin(mnt);
acct_file_reopen(ns->bacct, file, ns);
spin_unlock(&acct_lock);
- mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+ mntput(mnt); /* it's pinned, now give up active reference */
kfree(acct);
return 0;
@@ -279,15 +254,15 @@ static int acct_on(char *name)
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
- int error;
+ int error = 0;
if (!capable(CAP_SYS_PACCT))
return -EPERM;
if (name) {
- char *tmp = getname(name);
+ struct filename *tmp = getname(name);
if (IS_ERR(tmp))
- return (PTR_ERR(tmp));
+ return PTR_ERR(tmp);
error = acct_on(tmp);
putname(tmp);
} else {
@@ -297,13 +272,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (acct == NULL)
return 0;
- error = security_acct(NULL);
- if (!error) {
- spin_lock(&acct_lock);
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
- }
+ spin_lock(&acct_lock);
+ acct_file_reopen(acct, NULL, NULL);
+ spin_unlock(&acct_lock);
}
+
return error;
}
@@ -342,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
spin_lock(&acct_lock);
restart:
list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+ if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
@@ -351,17 +324,17 @@ restart:
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct;
+ struct bsd_acct_struct *acct = ns->bacct;
- spin_lock(&acct_lock);
- acct = ns->bacct;
- if (acct != NULL) {
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
+ if (acct == NULL)
+ return;
- kfree(acct);
- }
+ spin_lock(&acct_lock);
+ if (acct->file != NULL)
+ acct_file_reopen(acct, NULL, NULL);
spin_unlock(&acct_lock);
+
+ kfree(acct);
}
/*
@@ -489,19 +462,23 @@ static void do_acct_process(struct bsd_acct_struct *acct,
u64 run_time;
struct timespec uptime;
struct tty_struct *tty;
+ const struct cred *orig_cred;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct, file))
- return;
+ goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset((caddr_t)&ac, 0, sizeof(acct_t));
+ memset(&ac, 0, sizeof(acct_t));
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -530,7 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
/* we really need to bite the bullet and change layout */
- current_uid_gid(&ac.ac_uid, &ac.ac_gid);
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
@@ -562,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
@@ -576,16 +560,9 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
-}
-
-/**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
- memset(pacct, 0, sizeof(struct pacct_struct));
- pacct->ac_utime = pacct->ac_stime = cputime_zero;
+ file_end_write(file);
+out:
+ revert_creds(orig_cred);
}
/**
@@ -596,6 +573,7 @@ void acct_init_pacct(struct pacct_struct *pacct)
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -623,8 +601,9 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
- pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index f565891f2c9..61f023ce022 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,96 +49,74 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
-#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
+#include <linux/export.h>
#include <linux/wait.h>
#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <asm/atomic.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "workqueue_internal.h"
static async_cookie_t next_cookie = 1;
-#define MAX_THREADS 256
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
-static int async_enabled = 0;
-
struct async_entry {
- struct list_head list;
- async_cookie_t cookie;
- async_func_ptr *func;
- void *data;
- struct list_head *running;
+ struct list_head domain_list;
+ struct list_head global_list;
+ struct work_struct work;
+ async_cookie_t cookie;
+ async_func_t func;
+ void *data;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
-static DECLARE_WAIT_QUEUE_HEAD(async_new);
static atomic_t entry_count;
-static atomic_t thread_count;
-
-extern int initcall_debug;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct list_head *running)
-{
- struct async_entry *entry;
- if (!list_empty(running)) {
- entry = list_first_entry(running,
- struct async_entry, list);
- return entry->cookie;
- } else if (!list_empty(&async_pending)) {
- entry = list_first_entry(&async_pending,
- struct async_entry, list);
- return entry->cookie;
- } else {
- /* nothing in progress... next_cookie is "infinity" */
- return next_cookie;
- }
-
-}
-
-static async_cookie_t lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
+ struct list_head *pending;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
- async_cookie_t ret;
spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
+
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
+
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
+
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
+
/*
* pick the first pending entry and run it
*/
-static void run_one_entry(void)
+static void async_run_entry_fn(struct work_struct *work)
{
+ struct async_entry *entry =
+ container_of(work, struct async_entry, work);
unsigned long flags;
- struct async_entry *entry;
- ktime_t calltime, delta, rettime;
+ ktime_t uninitialized_var(calltime), delta, rettime;
- /* 1) pick one task from the pending queue */
-
- spin_lock_irqsave(&async_lock, flags);
- if (list_empty(&async_pending))
- goto out;
- entry = list_first_entry(&async_pending, struct async_entry, list);
-
- /* 2) move it to the running queue */
- list_move_tail(&entry->list, entry->running);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 3) run it (and print duration)*/
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
+ printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ (long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
@@ -146,37 +124,32 @@ static void run_one_entry(void)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+ printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
}
- /* 4) remove it from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 5) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 6) wake up any waiters. */
+ /* 4) wake up any waiters */
wake_up(&async_done);
- return;
-
-out:
- spin_unlock_irqrestore(&async_lock, flags);
}
-
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
-
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -185,59 +158,74 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
- if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
- ptr(data, newcookie);
+ func(data, newcookie);
return newcookie;
}
- entry->func = ptr;
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- wake_up(&async_new);
+
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
+ /* schedule for execution */
+ queue_work(system_unbound_wq, &entry->work);
+
return newcookie;
}
/**
* async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -248,51 +236,66 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- do {
- async_synchronize_cookie(next_cookie);
- } while (!list_empty(&async_running) || !list_empty(&async_pending));
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+ spin_lock_irq(&async_lock);
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
+ domain->registered = 0;
+ spin_unlock_irq(&async_lock);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by @domain have been done.
*/
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, list);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie,
- struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t starttime, delta, endtime;
+ ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("async_waiting @ %i\n", task_pid_nr(current));
+ printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk("async_continuing @ %i after %lli usec\n",
+ printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
@@ -308,99 +311,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-
-static int async_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int ret = HZ;
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * check the list head without lock.. false positives
- * are dealt with inside run_one_entry() while holding
- * the lock.
- */
- rmb();
- if (!list_empty(&async_pending))
- run_one_entry();
- else
- ret = schedule_timeout(HZ);
-
- if (ret == 0) {
- /*
- * we timed out, this means we as thread are redundant.
- * we sign off and die, but we to avoid any races there
- * is a last-straw check to see if work snuck in.
- */
- atomic_dec(&thread_count);
- wmb(); /* manager must see our departure first */
- if (list_empty(&async_pending))
- break;
- /*
- * woops work came in between us timing out and us
- * signing off; we need to stay alive and keep working.
- */
- atomic_inc(&thread_count);
- }
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int async_manager_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int tc, ec;
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- tc = atomic_read(&thread_count);
- rmb();
- ec = atomic_read(&entry_count);
-
- while (tc < ec && tc < MAX_THREADS) {
- if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
- tc))) {
- msleep(100);
- continue;
- }
- atomic_inc(&thread_count);
- tc++;
- }
-
- schedule();
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int __init async_init(void)
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
{
- if (async_enabled)
- if (IS_ERR(kthread_run(async_manager_thread, NULL,
- "async/mgr")))
- async_enabled = 0;
- return 0;
-}
+ struct worker *worker = current_wq_worker();
-static int __init setup_async(char *str)
-{
- async_enabled = 1;
- return 1;
+ return worker && worker->current_func == async_run_entry_fn;
}
-
-__setup("fastboot", setup_async);
-
-
-core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea3131..3ef2e0e797e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,23 +41,31 @@
* Example user-space utilities: http://people.redhat.com/sgrubb/audit/
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
-#include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
#include <linux/audit.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/inotify.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
#include <linux/freezer.h>
#include <linux/tty.h>
+#include <linux/pid_namespace.h>
+#include <net/netns/generic.h>
#include "audit.h"
@@ -71,35 +79,39 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-int audit_enabled;
-int audit_ever_enabled;
+u32 audit_enabled;
+u32 audit_ever_enabled;
+
+EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static int audit_default;
+static u32 audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
-static int audit_failure = AUDIT_FAIL_PRINTK;
+static u32 audit_failure = AUDIT_FAIL_PRINTK;
/*
* If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
*/
int audit_pid;
-static int audit_nlk_pid;
+static __u32 audit_nlk_portid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
-static int audit_rate_limit;
+static u32 audit_rate_limit;
-/* Number of outstanding audit_buffers allowed. */
-static int audit_backlog_limit = 64;
-static int audit_backlog_wait_time = 60 * HZ;
-static int audit_backlog_wait_overflow = 0;
+/* Number of outstanding audit_buffers allowed.
+ * When set to zero, this means unlimited. */
+static u32 audit_backlog_limit = 64;
+#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
-uid_t audit_sig_uid = -1;
+kuid_t audit_sig_uid = INVALID_UID;
pid_t audit_sig_pid = -1;
u32 audit_sig_sid = 0;
@@ -114,9 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
+int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -135,8 +145,19 @@ static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
+ .mask = -1,
+ .features = 0,
+ .lock = 0,};
+
+static char *audit_feature_names[2] = {
+ "only_unset_loginuid",
+ "loginuid_immutable",
+};
+
+
/* Serialize requests from userspace. */
-static DEFINE_MUTEX(audit_cmd_mutex);
+DEFINE_MUTEX(audit_cmd_mutex);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -160,27 +181,27 @@ struct audit_buffer {
};
struct audit_reply {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff *skb;
};
-static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_pid = pid;
+ nlh->nlmsg_pid = portid;
}
}
void audit_panic(const char *message)
{
- switch (audit_failure)
- {
+ switch (audit_failure) {
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
if (printk_ratelimit())
- printk(KERN_ERR "audit: %s\n", message);
+ pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
/* test audit_pid since printk is always losey, why bother? */
@@ -251,9 +272,7 @@ void audit_log_lost(const char *message)
if (print) {
if (printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_lost=%d audit_rate_limit=%d "
- "audit_backlog_limit=%d\n",
+ pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
atomic_read(&audit_lost),
audit_rate_limit,
audit_backlog_limit);
@@ -261,39 +280,29 @@ void audit_log_lost(const char *message)
}
}
-static int audit_log_config_change(char *function_name, int new, int old,
- uid_t loginuid, u32 sessionid, u32 sid,
+static int audit_log_config_change(char *function_name, u32 new, u32 old,
int allow_changes)
{
struct audit_buffer *ab;
int rc = 0;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
- old, loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
-
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc) {
- audit_log_format(ab, " sid=%u", sid);
- allow_changes = 0; /* Something weird, deny request */
- } else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (unlikely(!ab))
+ return rc;
+ audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_session_info(ab);
+ rc = audit_log_task_context(ab);
+ if (rc)
+ allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
-static int audit_do_config_change(char *function_name, int *to_change,
- int new, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
- int allow_changes, rc = 0, old = *to_change;
+ int allow_changes, rc = 0;
+ u32 old = *to_change;
/* check if we are locked */
if (audit_enabled == AUDIT_LOCKED)
@@ -302,8 +311,7 @@ static int audit_do_config_change(char *function_name, int *to_change,
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
- rc = audit_log_config_change(function_name, new, old, loginuid,
- sessionid, sid, allow_changes);
+ rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
@@ -317,44 +325,43 @@ static int audit_do_config_change(char *function_name, int *to_change,
return rc;
}
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_rate_limit(u32 limit)
+{
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
+}
+
+static int audit_set_backlog_limit(u32 limit)
{
- return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_backlog_wait_time(u32 timeout)
{
- return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_wait_time",
+ &audit_backlog_wait_time, timeout);
}
-static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(u32 state)
{
int rc;
if (state < AUDIT_OFF || state > AUDIT_LOCKED)
return -EINVAL;
- rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
- loginuid, sessionid, sid);
-
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
-static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(u32 state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
- return audit_do_config_change("audit_failure", &audit_failure, state,
- loginuid, sessionid, sid);
+ return audit_do_config_change("audit_failure", &audit_failure, state);
}
/*
@@ -369,189 +376,222 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
static void audit_hold_skb(struct sk_buff *skb)
{
if (audit_default &&
- skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+ (!audit_backlog_limit ||
+ skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
skb_queue_tail(&audit_skb_hold_queue, skb);
else
kfree_skb(skb);
}
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ char *data = nlmsg_data(nlh);
+
+ if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit())
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
+ else
+ audit_log_lost("printk limit exceeded");
+ }
+
+ audit_hold_skb(skb);
+}
+
static void kauditd_send_skb(struct sk_buff *skb)
{
int err;
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
- BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
- printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd dissapeared\n");
- audit_pid = 0;
+ BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+ if (audit_pid) {
+ pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd disappeared");
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
} else
/* drop the extra reference if sent ok */
- kfree_skb(skb);
+ consume_skb(skb);
}
-static int kauditd_thread(void *dummy)
+/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, GFP_KERNEL);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+}
+
+/*
+ * flush_hold_queue - empty the hold queue if auditd appears
+ *
+ * If auditd just started, drain the queue of messages already
+ * sent to syslog/printk. Remember loss here is ok. We already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * If you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what i'm thinking today.
+ */
+static void flush_hold_queue(void)
{
struct sk_buff *skb;
+ if (!audit_default || !audit_pid)
+ return;
+
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (likely(!skb))
+ return;
+
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+
+ /*
+ * if auditd just disappeared but we
+ * dequeued an skb we need to drop ref
+ */
+ if (skb)
+ consume_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
set_freezable();
while (!kthread_should_stop()) {
- /*
- * if auditd just started drain the queue of messages already
- * sent to syslog/printk. remember loss here is ok. we already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
- *
- * if you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
- */
- if (audit_default && audit_pid) {
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (unlikely(skb)) {
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
- }
- }
+ struct sk_buff *skb;
+ DECLARE_WAITQUEUE(wait, current);
+
+ flush_hold_queue();
skb = skb_dequeue(&audit_skb_queue);
- wake_up(&audit_backlog_wait);
+
if (skb) {
+ if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
- else {
- if (printk_ratelimit())
- printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
- else
- audit_log_lost("printk limit exceeded\n");
-
- audit_hold_skb(skb);
- }
- } else {
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
-
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
+ else
+ audit_printk_skb(skb);
+ continue;
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ if (!skb_queue_len(&audit_skb_queue)) {
+ try_to_freeze();
+ schedule();
}
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
}
return 0;
}
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
-{
- struct task_struct *tsk;
- int err;
-
- read_lock(&tasklist_lock);
- tsk = find_task_by_vpid(pid);
- err = -ESRCH;
- if (!tsk)
- goto out;
- err = 0;
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (!tsk->signal->audit_tty)
- err = -EPERM;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (err)
- goto out;
-
- tty_audit_push_task(tsk, loginuid, sessionid);
-out:
- read_unlock(&tasklist_lock);
- return err;
-}
-
int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
- int pid = dest->pid;
struct sk_buff *skb;
+ struct net *net = dest->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(audit_sock, skb, pid, 0);
+ netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
}
-#ifdef CONFIG_AUDIT_TREE
-static int prune_tree_thread(void *unused)
-{
- mutex_lock(&audit_cmd_mutex);
- audit_prune_trees();
- mutex_unlock(&audit_cmd_mutex);
- return 0;
-}
-
-void audit_schedule_prune(void)
-{
- kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
-}
-#endif
-
-struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
- int multi, void *payload, int size)
+struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
- int len = NLMSG_SPACE(size);
void *data;
int flags = multi ? NLM_F_MULTI : 0;
int t = done ? NLMSG_DONE : type;
- skb = alloc_skb(len, GFP_KERNEL);
+ skb = nlmsg_new(size, GFP_KERNEL);
if (!skb)
return NULL;
- nlh = NLMSG_PUT(skb, pid, seq, t, size);
- nlh->nlmsg_flags = flags;
- data = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ if (!nlh)
+ goto out_kfree_skb;
+ data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_PUT */
- if (skb)
- kfree_skb(skb);
+out_kfree_skb:
+ kfree_skb(skb);
return NULL;
}
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
+ struct net *net = reply->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
- * @pid: process id to send reply to
+ * @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
@@ -559,12 +599,14 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-void audit_send_reply(int pid, int seq, int type, int done, int multi,
- void *payload, int size)
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -573,11 +615,12 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
if (!reply)
return;
- skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
+ skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
if (!skb)
goto out;
- reply->pid = pid;
+ reply->net = get_net(net);
+ reply->portid = portid;
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -596,27 +639,49 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
+ /* Only support initial user namespace for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
+ if (current_user_ns() != &init_user_ns)
+ return -ECONNREFUSED;
+
switch (msg_type) {
- case AUDIT_GET:
case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- case AUDIT_SET:
case AUDIT_ADD:
- case AUDIT_ADD_RULE:
case AUDIT_DEL:
+ return -EOPNOTSUPP;
+ case AUDIT_GET:
+ case AUDIT_SET:
+ case AUDIT_GET_FEATURE:
+ case AUDIT_SET_FEATURE:
+ case AUDIT_LIST_RULES:
+ case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if ((task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -626,45 +691,125 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
- u32 pid, u32 uid, uid_t auid, u32 ses,
- u32 sid)
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
- char *ctx = NULL;
- u32 len;
+ uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
- if (!audit_enabled) {
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return rc;
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
- audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
- pid, uid, auid, ses);
- if (sid) {
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc)
- audit_log_format(*ab, " ssid=%u", sid);
- else {
- audit_log_format(*ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
+ if (unlikely(!*ab))
+ return rc;
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_session_info(*ab);
+ audit_log_task_context(*ab);
+
+ return rc;
+}
+
+int is_audit_feature_set(int i)
+{
+ return af.features & AUDIT_FEATURE_TO_MASK(i);
+}
+
+
+static int audit_get_feature(struct sk_buff *skb)
+{
+ u32 seq;
+
+ seq = nlmsg_hdr(skb)->nlmsg_seq;
+
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
+
+ return 0;
+}
+
+static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
+ u32 old_lock, u32 new_lock, int res)
+{
+ struct audit_buffer *ab;
+
+ if (audit_enabled == AUDIT_OFF)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_feature_names[which], !!old_feature, !!new_feature,
+ !!old_lock, !!new_lock, res);
+ audit_log_end(ab);
+}
+
+static int audit_set_feature(struct sk_buff *skb)
+{
+ struct audit_features *uaf;
+ int i;
+
+ BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
+ uaf = nlmsg_data(nlmsg_hdr(skb));
+
+ /* if there is ever a version 2 we should handle that here */
+
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
+
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+ old_lock = af.lock & feature;
+
+ /* are we changing a locked feature? */
+ if (old_lock && (new_feature != old_feature)) {
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 0);
+ return -EPERM;
}
}
+ /* nothing invalid, do the changes */
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
- return rc;
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ old_lock = af.lock & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+
+ if (new_feature != old_feature)
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 1);
+
+ if (new_feature)
+ af.features |= feature;
+ else
+ af.features &= ~feature;
+ af.lock |= new_lock;
+ }
+
+ return 0;
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- u32 uid, pid, seq, sid;
+ u32 seq;
void *data;
- struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
- uid_t loginuid; /* loginuid of sender */
- u32 sessionid;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
@@ -675,70 +820,90 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
- pid = NETLINK_CREDS(skb)->pid;
- uid = NETLINK_CREDS(skb)->uid;
- loginuid = NETLINK_CB(skb).loginuid;
- sessionid = NETLINK_CB(skb).sessionid;
- sid = NETLINK_CB(skb).sid;
seq = nlh->nlmsg_seq;
- data = NLMSG_DATA(nlh);
+ data = nlmsg_data(nlh);
switch (msg_type) {
- case AUDIT_GET:
- status_set.enabled = audit_enabled;
- status_set.failure = audit_failure;
- status_set.pid = audit_pid;
- status_set.rate_limit = audit_rate_limit;
- status_set.backlog_limit = audit_backlog_limit;
- status_set.lost = atomic_read(&audit_lost);
- status_set.backlog = skb_queue_len(&audit_skb_queue);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
- &status_set, sizeof(status_set));
+ case AUDIT_GET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
+ s.pid = audit_pid;
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_skb_queue);
+ s.version = AUDIT_VERSION_LATEST;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
- case AUDIT_SET:
- if (nlh->nlmsg_len < sizeof(struct audit_status))
- return -EINVAL;
- status_get = (struct audit_status *)data;
- if (status_get->mask & AUDIT_STATUS_ENABLED) {
- err = audit_set_enabled(status_get->enabled,
- loginuid, sessionid, sid);
+ }
+ case AUDIT_SET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ if (s.mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(s.enabled);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_FAILURE) {
- err = audit_set_failure(status_get->failure,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(s.failure);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_PID) {
- int new_pid = status_get->pid;
+ if (s.mask & AUDIT_STATUS_PID) {
+ int new_pid = s.pid;
+ if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+ return -EACCES;
if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid,
- audit_pid, loginuid,
- sessionid, sid, 1);
-
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_pid = new_pid;
- audit_nlk_pid = NETLINK_CB(skb).pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
+ audit_sock = skb->sk;
+ }
+ if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
+ err = audit_set_rate_limit(s.rate_limit);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
+ err = audit_set_backlog_limit(s.backlog_limit);
+ if (err < 0)
+ return err;
}
- if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
- err = audit_set_rate_limit(status_get->rate_limit,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
+ if (sizeof(s) > (size_t)nlh->nlmsg_len)
+ return -EINVAL;
+ if (s.backlog_wait_time < 0 ||
+ s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
+ return -EINVAL;
+ err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
- err = audit_set_backlog_limit(status_get->backlog_limit,
- loginuid, sessionid, sid);
+ break;
+ }
+ case AUDIT_GET_FEATURE:
+ err = audit_get_feature(skb);
+ if (err)
+ return err;
+ break;
+ case AUDIT_SET_FEATURE:
+ err = audit_set_feature(skb);
+ if (err)
+ return err;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -746,76 +911,54 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(&NETLINK_CB(skb));
- if (err == 1) {
+ err = audit_filter_user(msg_type);
+ if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = audit_prepare_user_tty(pid, loginuid,
- sessionid);
+ err = tty_audit_push_current();
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type, pid, uid,
- loginuid, sessionid, sid);
-
+ mutex_unlock(&audit_cmd_mutex);
+ audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
- audit_log_format(ab, " msg='%.1024s'",
+ audit_log_format(ab, " msg='%.*s'",
+ AUDIT_MESSAGE_TEXT_MAX,
(char *)data);
else {
int size;
- audit_log_format(ab, " msg=");
+ audit_log_format(ab, " data=");
size = nlmsg_len(nlh);
+ if (size > 0 &&
+ ((unsigned char *)data)[size - 1] == '\0')
+ size--;
audit_log_n_untrustedstring(ab, data, size);
}
- audit_set_pid(ab, pid);
+ audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
+ mutex_lock(&audit_cmd_mutex);
}
break;
- case AUDIT_ADD:
- case AUDIT_DEL:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule))
- return -EINVAL;
- if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
- audit_log_end(ab);
- return -EPERM;
- }
- /* fallthrough */
- case AUDIT_LIST:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
- break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
audit_log_end(ab);
return -EPERM;
}
- /* fallthrough */
+ err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh));
+ break;
case AUDIT_LIST_RULES:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
+ err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
-
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -845,8 +988,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
@@ -859,59 +1001,68 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
case AUDIT_SIGNAL_INFO:
- err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
- if (err)
- return err;
+ len = 0;
+ if (audit_sig_sid) {
+ err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+ if (err)
+ return err;
+ }
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
if (!sig_data) {
- security_release_secctx(ctx, len);
+ if (audit_sig_sid)
+ security_release_secctx(ctx, len);
return -ENOMEM;
}
- sig_data->uid = audit_sig_uid;
+ sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
- memcpy(sig_data->ctx, ctx, len);
- security_release_secctx(ctx, len);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ if (audit_sig_sid) {
+ memcpy(sig_data->ctx, ctx, len);
+ security_release_secctx(ctx, len);
+ }
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk;
-
- read_lock(&tasklist_lock);
- tsk = find_task_by_vpid(pid);
- if (!tsk)
- err = -ESRCH;
- else {
- spin_lock_irq(&tsk->sighand->siglock);
- s.enabled = tsk->signal->audit_tty != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
- }
- read_unlock(&tasklist_lock);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
- &s, sizeof(s));
+ struct task_struct *tsk = current;
+
+ spin_lock(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty;
+ s.log_passwd = tsk->signal->audit_tty_log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
- struct audit_tty_status *s;
- struct task_struct *tsk;
-
- if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
- return -EINVAL;
- s = data;
- if (s->enabled != 0 && s->enabled != 1)
- return -EINVAL;
- read_lock(&tasklist_lock);
- tsk = find_task_by_vpid(pid);
- if (!tsk)
- err = -ESRCH;
- else {
- spin_lock_irq(&tsk->sighand->siglock);
- tsk->signal->audit_tty = s->enabled != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
+ struct audit_tty_status s, old;
+ struct task_struct *tsk = current;
+ struct audit_buffer *ab;
+
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ /* check if new data is valid */
+ if ((s.enabled != 0 && s.enabled != 1) ||
+ (s.log_passwd != 0 && s.log_passwd != 1))
+ err = -EINVAL;
+
+ spin_lock(&tsk->sighand->siglock);
+ old.enabled = tsk->signal->audit_tty;
+ old.log_passwd = tsk->signal->audit_tty_log_passwd;
+ if (!err) {
+ tsk->signal->audit_tty = s.enabled;
+ tsk->signal->audit_tty_log_passwd = s.log_passwd;
}
- read_unlock(&tasklist_lock);
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
+ " old-log_passwd=%d new-log_passwd=%d res=%d",
+ old.enabled, s.enabled, old.log_passwd,
+ s.log_passwd, !err);
+ audit_log_end(ab);
break;
}
default:
@@ -923,28 +1074,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/*
- * Get message from skb (based on rtnetlink_rcv_skb). Each message is
- * processed by audit_receive_msg. Malformed skbs with wrong length are
- * discarded silently.
+ * Get message from skb. Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
*/
static void audit_receive_skb(struct sk_buff *skb)
{
- int err;
- struct nlmsghdr *nlh;
- u32 rlen;
+ struct nlmsghdr *nlh;
+ /*
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
+ * if the nlmsg_len was not aligned
+ */
+ int len;
+ int err;
- while (skb->len >= NLMSG_SPACE(0)) {
- nlh = nlmsg_hdr(skb);
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- if ((err = audit_receive_msg(skb, nlh))) {
+ nlh = nlmsg_hdr(skb);
+ len = skb->len;
+
+ while (nlmsg_ok(nlh, len)) {
+ err = audit_receive_msg(skb, nlh);
+ /* if err or if this message says it wants a response */
+ if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- } else if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
+
+ nlh = nlmsg_next(nlh, &len);
}
}
@@ -956,12 +1108,55 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
-#ifdef CONFIG_AUDITSYSCALL
-static const struct inotify_operations audit_inotify_ops = {
- .handle_event = audit_handle_ievent,
- .destroy_watch = audit_free_parent,
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
+static int __net_init audit_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
+ };
+
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->nlsk == NULL) {
+ audit_panic("cannot initialize netlink socket in namespace");
+ return -ENOMEM;
+ }
+ aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ return 0;
+}
+
+static void __net_exit audit_net_exit(struct net *net)
+{
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+ if (sock == audit_sock) {
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
+
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
+ synchronize_net();
+ netlink_kernel_release(sock);
+}
+
+static struct pernet_operations audit_net_ops __net_initdata = {
+ .init = audit_net_init,
+ .exit = audit_net_exit,
+ .id = &audit_net_id,
+ .size = sizeof(struct audit_net),
};
-#endif
/* Initialize audit support at boot time. */
static int __init audit_init(void)
@@ -971,14 +1166,9 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
- audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
- audit_receive, NULL, THIS_MODULE);
- if (!audit_sock)
- audit_panic("cannot initialize netlink socket");
- else
- audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ pr_info("initializing netlink subsys (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ register_pernet_subsys(&audit_net_ops);
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
@@ -988,12 +1178,6 @@ static int __init audit_init(void)
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
-#ifdef CONFIG_AUDITSYSCALL
- audit_ih = inotify_init(&audit_inotify_ops);
- if (IS_ERR(audit_ih))
- audit_panic("cannot initialize inotify handle");
-#endif
-
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
@@ -1008,22 +1192,32 @@ static int __init audit_enable(char *str)
if (!audit_default)
audit_initialized = AUDIT_DISABLED;
- printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+ pr_info("%s\n", audit_default ?
+ "enabled (after initialization)" : "disabled (until reboot)");
- if (audit_initialized == AUDIT_INITIALIZED) {
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
- } else if (audit_initialized == AUDIT_UNINITIALIZED) {
- printk(" (after initialization)");
- } else {
- printk(" (until reboot)");
+ return 1;
+}
+__setup("audit=", audit_enable);
+
+/* Process kernel command-line parameter at boot time.
+ * audit_backlog_limit=<n> */
+static int __init audit_backlog_limit_set(char *str)
+{
+ u32 audit_backlog_limit_arg;
+
+ pr_info("audit_backlog_limit: ");
+ if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
+ pr_cont("using default of %u, unable to parse %s\n",
+ audit_backlog_limit, str);
+ return 1;
}
- printk("\n");
+
+ audit_backlog_limit = audit_backlog_limit_arg;
+ pr_cont("%d\n", audit_backlog_limit);
return 1;
}
-
-__setup("audit=", audit_enable);
+__setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
@@ -1067,18 +1261,22 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
goto err;
}
- ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
+ ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
- ab->ctx = ctx;
- ab->gfp_mask = gfp_mask;
- nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
- nlh->nlmsg_type = type;
- nlh->nlmsg_flags = 0;
- nlh->nlmsg_pid = 0;
- nlh->nlmsg_seq = 0;
+ nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+ if (!nlh)
+ goto out_kfree_skb;
+
return ab;
+
+out_kfree_skb:
+ kfree_skb(ab->skb);
+ ab->skb = NULL;
err:
audit_buffer_free(ab);
return NULL;
@@ -1127,12 +1325,24 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
-/* Obtain an audit buffer. This routine does locking to obtain the
- * audit buffer, but then no locking is required for calls to
- * audit_log_*format. If the tsk is a task that is currently in a
- * syscall, then the syscall is marked as auditable and an audit record
- * will be written at syscall exit. If there is no associated task, tsk
- * should be NULL. */
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static long wait_for_auditd(long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ sleep_time = schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+
+ return sleep_time;
+}
/**
* audit_log_start - obtain an audit buffer
@@ -1155,7 +1365,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
struct audit_buffer *ab = NULL;
struct timespec t;
unsigned int uninitialized_var(serial);
- int reserve;
+ int reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
unsigned long timeout_start = jiffies;
if (audit_initialized != AUDIT_INITIALIZED)
@@ -1164,42 +1375,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT)
- reserve = 0;
- else
- reserve = 5; /* Allow atomic callers to go up to five
- entries over the normal backlog limit */
+ if (gfp_mask & __GFP_WAIT) {
+ if (audit_pid && audit_pid == current->pid)
+ gfp_mask &= ~__GFP_WAIT;
+ else
+ reserve = 0;
+ }
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
- && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
-
- /* Wait for auditd to drain the queue a little */
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
- continue;
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ long sleep_time;
+
+ sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+ if (sleep_time > 0) {
+ sleep_time = wait_for_auditd(sleep_time);
+ if (sleep_time > 0)
+ continue;
+ }
}
if (audit_rate_check() && printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_backlog=%d > "
- "audit_backlog_limit=%d\n",
- skb_queue_len(&audit_skb_queue),
- audit_backlog_limit);
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_skb_queue),
+ audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
audit_backlog_wait_time = audit_backlog_wait_overflow;
wake_up(&audit_backlog_wait);
return NULL;
}
+ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
@@ -1270,12 +1476,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
avail = audit_expand(ab,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
- goto out;
+ goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
- va_end(args2);
if (len > 0)
skb_put(skb, len);
+out_va_end:
+ va_end(args2);
out:
return;
}
@@ -1316,7 +1523,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
int i, avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
- static const unsigned char *hex = "0123456789ABCDEF";
if (!ab)
return;
@@ -1334,10 +1540,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
}
ptr = skb_tail_pointer(skb);
- for (i=0; i<len; i++) {
- *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
- *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
- }
+ for (i = 0; i < len; i++)
+ ptr = hex_byte_pack_upper(ptr, buf[i]);
*ptr = 0;
skb_put(skb, len << 1); /* new string is twice the old string */
}
@@ -1382,7 +1586,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
int audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
- for (p = string; p < (const unsigned char *)string + len && *p; p++) {
+ for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
return 1;
}
@@ -1427,36 +1631,324 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
- struct path *path)
+ const struct path *path)
{
char *p, *pathname;
if (prefix)
- audit_log_format(ab, " %s", prefix);
+ audit_log_format(ab, "%s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_format(ab, "<no memory>");
+ audit_log_string(ab, "<no_memory>");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_format(ab, "<too long>");
+ audit_log_string(ab, "<too_long>");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
}
+void audit_log_session_info(struct audit_buffer *ab)
+{
+ unsigned int sessionid = audit_get_sessionid(current);
+ uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+
+ audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+}
+
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+ audit_log_format(ab, " key=");
+ if (key)
+ audit_log_untrustedstring(ab, key);
+ else
+ audit_log_format(ab, "(null)");
+}
+
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i) {
+ audit_log_format(ab, "%08x",
+ cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ }
+}
+
+void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ kernel_cap_t *perm = &name->fcap.permitted;
+ kernel_cap_t *inh = &name->fcap.inheritable;
+ int log = 0;
+
+ if (!cap_isclear(*perm)) {
+ audit_log_cap(ab, "cap_fp", perm);
+ log = 1;
+ }
+ if (!cap_isclear(*inh)) {
+ audit_log_cap(ab, "cap_fi", inh);
+ log = 1;
+ }
+
+ if (log)
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
+}
+
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+void audit_log_name(struct audit_context *context, struct audit_names *n,
+ struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ audit_log_format(ab, " nametype=");
+ switch(n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, "NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, "PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, "DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, "CREATE");
+ break;
+ default:
+ audit_log_format(ab, "UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return 0;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return 0;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return error;
+}
+EXPORT_SYMBOL(audit_log_task_context);
+
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ const struct cred *cred;
+ char name[sizeof(tsk->comm)];
+ struct mm_struct *mm = tsk->mm;
+ char *tty;
+
+ if (!ab)
+ return;
+
+ /* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
+ from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ tty, audit_get_sessionid(tsk));
+
+ get_task_comm(name, tsk);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
+ audit_log_task_context(ab);
+}
+EXPORT_SYMBOL(audit_log_task_info);
+
+/**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+ struct audit_buffer *ab;
+ struct audit_names *name;
+
+ name = kzalloc(sizeof(*name), GFP_NOFS);
+ if (!name)
+ return;
+
+ /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_ANOM_LINK);
+ if (!ab)
+ goto out;
+ audit_log_format(ab, "op=%s", operation);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " res=0");
+ audit_log_end(ab);
+
+ /* Generate AUDIT_PATH record with object. */
+ name->type = AUDIT_TYPE_NORMAL;
+ audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+ audit_log_name(current->audit_context, name, link, 0, NULL);
+out:
+ kfree(name);
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is placed on a queue and a tasklet is scheduled to
- * remove them from the queue outside the irq context. May be called in
- * any context.
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1466,21 +1958,25 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
+ kauditd_send_multicast_skb(ab->skb);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
+ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
wake_up_interruptible(&kauditd_wait);
} else {
- if (nlh->nlmsg_type != AUDIT_EOE) {
- if (printk_ratelimit()) {
- printk(KERN_NOTICE "type=%d %s\n",
- nlh->nlmsg_type,
- ab->skb->data + NLMSG_SPACE(0));
- } else
- audit_log_lost("printk limit exceeded\n");
- }
- audit_hold_skb(ab->skb);
+ audit_printk_skb(ab->skb);
}
ab->skb = NULL;
}
@@ -1514,6 +2010,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing leak of internal secid
+ * to userspace. If secid cannot be converted audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+ u32 len;
+ char *secctx;
+
+ if (security_secid_to_secctx(secid, &secctx, &len)) {
+ audit_panic("Cannot convert secid to context");
+ } else {
+ audit_log_format(ab, " obj=%s", secctx);
+ security_release_secctx(secctx, len);
+ }
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661..7bb65730c89 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
+#include <uapi/linux/mqueue.h>
/* 0 = no checking
1 = put_count checking
@@ -29,6 +30,11 @@
*/
#define AUDIT_DEBUG 0
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
+
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
@@ -36,12 +42,8 @@ enum audit_state {
AUDIT_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
- * but don't necessarily fill it in at
- * syscall entry time (i.e., filter
- * instead). */
AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
- * and always fill it in at syscall
+ * and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
@@ -53,18 +55,7 @@ enum audit_state {
};
/* Rule lists */
-struct audit_parent;
-
-struct audit_watch {
- atomic_t count; /* reference count */
- char *path; /* insertion path */
- dev_t dev; /* associated superblock device */
- unsigned long ino; /* associated inode number */
- struct audit_parent *parent; /* associated parent */
- struct list_head wlist; /* entry in parent->watches list */
- struct list_head rules; /* associated rules */
-};
-
+struct audit_watch;
struct audit_tree;
struct audit_chunk;
@@ -74,10 +65,167 @@ struct audit_entry {
struct audit_krule rule;
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
-extern int audit_ever_enabled;
+struct audit_cap_data {
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+ union {
+ unsigned int fE; /* effective bit of file cap */
+ kernel_cap_t effective; /* effective set of process */
+ };
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
+struct audit_names {
+ struct list_head list; /* audit_context->names_list */
+
+ struct filename *name;
+ int name_len; /* number of chars to log */
+ bool hidden; /* don't log this record */
+ bool name_put; /* call __putname()? */
+
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ unsigned char type; /* record type */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit.
+ */
+ bool should_free;
+};
+
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state, current_state;
+ unsigned int serial; /* serial number for record */
+ int major; /* syscall number */
+ struct timespec ctime; /* time of syscall entry */
+ unsigned long argv[4]; /* syscall arguments */
+ long return_code;/* syscall return code */
+ u64 prio;
+ int return_valid; /* return code is valid */
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* struct audit_names->list anchor */
+ char *filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+ struct sockaddr_storage *sockaddr;
+ size_t sockaddr_len;
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ kuid_t target_auid;
+ kuid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ struct list_head killed_trees;
+ int tree_count;
+
+ int type;
+ union {
+ struct {
+ int nargs;
+ long args[6];
+ } socketcall;
+ struct {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ u32 osid;
+ int has_perm;
+ uid_t perm_uid;
+ gid_t perm_gid;
+ umode_t perm_mode;
+ unsigned long qbytes;
+ } ipc;
+ struct {
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+ } mq_getsetattr;
+ struct {
+ mqd_t mqdes;
+ int sigev_signo;
+ } mq_notify;
+ struct {
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+ } mq_sendrecv;
+ struct {
+ int oflag;
+ umode_t mode;
+ struct mq_attr attr;
+ } mq_open;
+ struct {
+ pid_t pid;
+ struct audit_cap_data cap;
+ } capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
+ struct {
+ int argc;
+ } execve;
+ };
+ int fds[2];
+ struct audit_proctitle proctitle;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
#endif
+};
+
+extern u32 audit_ever_enabled;
+
+extern void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ const struct inode *inode);
+extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap);
+extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
+extern void audit_log_name(struct audit_context *context,
+ struct audit_names *n, struct path *path,
+ int record_num, int *call_panic);
extern int audit_pid;
@@ -89,38 +237,60 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen);
-extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
- int done, int multi,
- void *payload, int size);
-extern void audit_send_reply(int pid, int seq, int type,
- int done, int multi,
- void *payload, int size);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
+ int done, int multi,
+ const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff_head q;
};
int audit_send_list(void *);
-struct inotify_watch;
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
+struct audit_net {
+ struct sock *nlsk;
+};
-extern void audit_free_parent(struct inotify_watch *);
-extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
- const char *, struct inode *);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
+/* audit watch functions */
+#ifdef CONFIG_AUDIT_WATCH
+extern void audit_put_watch(struct audit_watch *watch);
+extern void audit_get_watch(struct audit_watch *watch);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
+extern char *audit_watch_path(struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
+
+#endif /* CONFIG_AUDIT_WATCH */
+
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +300,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
-extern void audit_schedule_prune(void);
-extern void audit_prune_trees(void);
extern const char *audit_tree_path(struct audit_tree *);
extern void audit_put_tree(struct audit_tree *);
+extern void audit_kill_trees(struct list_head *);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
@@ -142,12 +311,13 @@ extern void audit_put_tree(struct audit_tree *);
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
+#define audit_kill_trees(list) BUG()
#endif
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
@@ -160,7 +330,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
return 0;
}
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
+
+extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545b8db..135944a7b28 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,7 +1,9 @@
#include "audit.h"
-#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
struct audit_tree;
struct audit_chunk;
@@ -20,7 +22,7 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct inotify_watch watch;
+ struct fsnotify_mark mark;
struct list_head trees; /* with root here */
int dead;
int count;
@@ -57,7 +59,7 @@ static LIST_HEAD(prune_list);
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
* of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
@@ -66,7 +68,7 @@ static LIST_HEAD(prune_list);
* that makes a difference. Some.
*/
-static struct inotify_handle *rtree_ih;
+static struct fsnotify_group *audit_tree_group;
static struct audit_tree *alloc_tree(const char *s)
{
@@ -91,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
atomic_inc(&tree->count);
}
-static void __put_tree(struct rcu_head *rcu)
-{
- struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
- kfree(tree);
-}
-
static inline void put_tree(struct audit_tree *tree)
{
if (atomic_dec_and_test(&tree->count))
- call_rcu(&tree->head, __put_tree);
+ kfree_rcu(tree, head);
}
/* to avoid bringing the entire thing in audit.h */
@@ -109,29 +105,6 @@ const char *audit_tree_path(struct audit_tree *tree)
return tree->pathname;
}
-static struct audit_chunk *alloc_chunk(int count)
-{
- struct audit_chunk *chunk;
- size_t size;
- int i;
-
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- INIT_LIST_HEAD(&chunk->hash);
- INIT_LIST_HEAD(&chunk->trees);
- chunk->count = count;
- atomic_long_set(&chunk->refs, 1);
- for (i = 0; i < count; i++) {
- INIT_LIST_HEAD(&chunk->owners[i].list);
- chunk->owners[i].index = i;
- }
- inotify_init_watch(&chunk->watch);
- return chunk;
-}
-
static void free_chunk(struct audit_chunk *chunk)
{
int i;
@@ -155,6 +128,35 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+{
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+ struct audit_chunk *chunk;
+ size_t size;
+ int i;
+
+ size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+ chunk = kzalloc(size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+ }
+ fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ return chunk;
+}
+
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -165,10 +167,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock is held by caller */
+/* hash_lock & entry->lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- struct list_head *list = chunk_hash(chunk->watch.inode);
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct list_head *list;
+
+ if (!entry->i.inode)
+ return;
+ list = chunk_hash(entry->i.inode);
list_add_rcu(&chunk->hash, list);
}
@@ -179,7 +186,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- if (p->watch.inode == inode) {
+ /* mark.inode may have gone NULL, but who cares? */
+ if (p->mark.i.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -208,38 +216,24 @@ static struct audit_chunk *find_chunk(struct node *p)
static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
- struct audit_chunk *new;
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct audit_chunk *new = NULL;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
- if (!pin_inotify_watch(&chunk->watch)) {
- /*
- * Filesystem is shutting down; all watches are getting
- * evicted, just take it off the node list for this
- * tree and let the eviction logics take care of the
- * rest.
- */
- owner = p->owner;
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
- return;
- }
+ fsnotify_get_mark(entry);
spin_unlock(&hash_lock);
- /*
- * pin_inotify_watch() succeeded, so the watch won't go away
- * from under us.
- */
- mutex_lock(&chunk->watch.inode->inotify_mutex);
- if (chunk->dead) {
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ if (size)
+ new = alloc_chunk(size);
+
+ spin_lock(&entry->lock);
+ if (chunk->dead || !entry->i.inode) {
+ spin_unlock(&entry->lock);
+ if (new)
+ free_chunk(new);
goto out;
}
@@ -254,17 +248,17 @@ static void untag_chunk(struct node *p)
list_del_init(&p->list);
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
- new = alloc_chunk(size);
if (!new)
goto Fallback;
- if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
- free_chunk(new);
+
+ fsnotify_duplicate_mark(&new->mark, entry);
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
+ fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -276,7 +270,7 @@ static void untag_chunk(struct node *p)
owner->root = NULL;
}
- for (i = j = 0; i < size; i++, j++) {
+ for (i = j = 0; j <= size; i++, j++) {
struct audit_tree *s;
if (&chunk->owners[j] == p) {
list_del_init(&p->list);
@@ -289,16 +283,16 @@ static void untag_chunk(struct node *p)
if (!s) /* result of earlier fallback */
continue;
get_tree(s);
- list_replace_init(&chunk->owners[i].list, &new->owners[j].list);
+ list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
}
list_replace_rcu(&chunk->hash, &new->hash);
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
Fallback:
@@ -312,31 +306,33 @@ Fallback:
p->owner = NULL;
put_tree(owner);
spin_unlock(&hash_lock);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ spin_unlock(&entry->lock);
out:
- unpin_inotify_watch(&chunk->watch);
+ fsnotify_put_mark(entry);
spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
+ struct fsnotify_mark *entry;
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk)
return -ENOMEM;
- if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
- free_chunk(chunk);
+ entry = &chunk->mark;
+ if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+ fsnotify_put_mark(entry);
return -ENOSPC;
}
- mutex_lock(&inode->inotify_mutex);
+ spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(entry);
return 0;
}
chunk->owners[0].index = (1U << 31);
@@ -349,52 +345,77 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
}
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&inode->inotify_mutex);
+ spin_unlock(&entry->lock);
+ fsnotify_put_mark(entry); /* drop initial reference */
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct inotify_watch *watch;
+ struct fsnotify_mark *old_entry, *chunk_entry;
struct audit_tree *owner;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
- if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+ old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+ if (!old_entry)
return create_chunk(inode, tree);
- old = container_of(watch, struct audit_chunk, watch);
+ old = container_of(old_entry, struct audit_chunk, mark);
/* are we already there? */
spin_lock(&hash_lock);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- put_inotify_watch(watch);
+ fsnotify_put_mark(old_entry);
return 0;
}
}
spin_unlock(&hash_lock);
chunk = alloc_chunk(old->count + 1);
- if (!chunk)
+ if (!chunk) {
+ fsnotify_put_mark(old_entry);
return -ENOMEM;
+ }
- mutex_lock(&inode->inotify_mutex);
- if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
- mutex_unlock(&inode->inotify_mutex);
+ chunk_entry = &chunk->mark;
+
+ spin_lock(&old_entry->lock);
+ if (!old_entry->i.inode) {
+ /* old_entry is being shot, lets just lie */
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(old_entry);
free_chunk(chunk);
+ return -ENOENT;
+ }
+
+ fsnotify_duplicate_mark(chunk_entry, old_entry);
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
return -ENOSPC;
}
+
+ /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+ spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
+
+ /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
+
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
return 0;
}
list_replace_init(&old->trees, &chunk->trees);
@@ -420,17 +441,34 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
list_add(&tree->same_root, &chunk->trees);
}
spin_unlock(&hash_lock);
- inotify_evict_watch(&old->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
+ fsnotify_put_mark(chunk_entry); /* drop initial reference */
+ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
return 0;
}
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
- struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
@@ -438,16 +476,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=remove rule dir=");
- audit_log_untrustedstring(ab, rule->tree->pathname);
- if (rule->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab, rule->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
- audit_log_format(ab, " list=%d res=1", rule->listnr);
- audit_log_end(ab);
+ audit_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
@@ -517,6 +546,8 @@ static void trim_marked(struct audit_tree *tree)
}
}
+static void audit_schedule_prune(void);
+
/* called with audit_filter_mutex */
int audit_remove_tree_rule(struct audit_krule *rule)
{
@@ -542,6 +573,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
return 0;
}
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+ return mnt->mnt_root->d_inode == arg;
+}
+
void audit_trim_trees(void)
{
struct list_head cursor;
@@ -553,7 +589,6 @@ void audit_trim_trees(void)
struct path path;
struct vfsmount *root_mnt;
struct node *node;
- struct list_head list;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -566,53 +601,31 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(path.mnt, path.dentry);
+ root_mnt = collect_mounts(&path);
path_put(&path);
- if (!root_mnt)
+ if (IS_ERR(root_mnt))
goto skip_it;
- list_add_tail(&list, &root_mnt->mnt_list);
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
- struct inode *inode = chunk->watch.inode;
- struct vfsmount *mnt;
+ /* this could be NULL if the watch is dying else where... */
+ struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
- list_for_each_entry(mnt, &list, mnt_list) {
- if (mnt->mnt_root->d_inode == inode) {
- node->index &= ~(1U<<31);
- break;
- }
- }
+ if (iterate_mounts(compare_root, inode, root_mnt))
+ node->index &= ~(1U<<31);
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
- list_del_init(&list);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
}
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
- struct path *path)
-{
- if (mnt != path->mnt) {
- for (;;) {
- if (mnt->mnt_parent == mnt)
- return 0;
- if (mnt->mnt_parent == path->mnt)
- break;
- mnt = mnt->mnt_parent;
- }
- dentry = mnt->mnt_mountpoint;
- }
- return is_subdir(dentry, path->dentry);
-}
-
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
@@ -632,15 +645,20 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+ return tag_chunk(mnt->mnt_root->d_inode, arg);
+}
+
/* called with audit_filter_mutex */
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
- struct vfsmount *mnt, *p;
- struct list_head list;
+ struct vfsmount *mnt;
int err;
+ rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
@@ -658,22 +676,15 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(path.mnt, path.dentry);
+ mnt = collect_mounts(&path);
path_put(&path);
- if (!mnt) {
- err = -ENOMEM;
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto Err;
}
- list_add_tail(&list, &mnt->mnt_list);
get_tree(tree);
- list_for_each_entry(p, &list, mnt_list) {
- err = tag_chunk(p->mnt_root->d_inode, tree);
- if (err)
- break;
- }
-
- list_del(&list);
+ err = iterate_mounts(tag_mount, tree, mnt);
drop_collected_mounts(mnt);
if (!err) {
@@ -708,34 +719,23 @@ int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
- struct path path;
+ struct path path1, path2;
struct vfsmount *tagged;
- struct list_head list;
- struct vfsmount *mnt;
- struct dentry *dentry;
int err;
- err = kern_path(new, 0, &path);
+ err = kern_path(new, 0, &path2);
if (err)
return err;
- tagged = collect_mounts(path.mnt, path.dentry);
- path_put(&path);
- if (!tagged)
- return -ENOMEM;
+ tagged = collect_mounts(&path2);
+ path_put(&path2);
+ if (IS_ERR(tagged))
+ return PTR_ERR(tagged);
- err = kern_path(old, 0, &path);
+ err = kern_path(old, 0, &path1);
if (err) {
drop_collected_mounts(tagged);
return err;
}
- mnt = mntget(path.mnt);
- dentry = dget(path.dentry);
- path_put(&path);
-
- if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
- follow_up(&mnt, &dentry);
-
- list_add_tail(&list, &tagged->mnt_list);
mutex_lock(&audit_filter_mutex);
list_add(&barrier, &tree_list);
@@ -743,7 +743,7 @@ int audit_tag_tree(char *old, char *new)
while (cursor.next != &tree_list) {
struct audit_tree *tree;
- struct vfsmount *p;
+ int good_one = 0;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
@@ -751,30 +751,19 @@ int audit_tag_tree(char *old, char *new)
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
- err = kern_path(tree->pathname, 0, &path);
- if (err) {
- put_tree(tree);
- mutex_lock(&audit_filter_mutex);
- continue;
+ err = kern_path(tree->pathname, 0, &path2);
+ if (!err) {
+ good_one = path_is_under(&path1, &path2);
+ path_put(&path2);
}
- spin_lock(&vfsmount_lock);
- if (!is_under(mnt, dentry, &path)) {
- spin_unlock(&vfsmount_lock);
- path_put(&path);
+ if (!good_one) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
- spin_unlock(&vfsmount_lock);
- path_put(&path);
-
- list_for_each_entry(p, &list, mnt_list) {
- failed = tag_chunk(p->mnt_root->d_inode, tree);
- if (failed)
- break;
- }
+ failed = iterate_mounts(tag_mount, tree, tagged);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -815,20 +804,19 @@ int audit_tag_tree(char *old, char *new)
}
list_del(&barrier);
list_del(&cursor);
- list_del(&list);
mutex_unlock(&audit_filter_mutex);
- dput(dentry);
- mntput(mnt);
+ path_put(&path1);
drop_collected_mounts(tagged);
return failed;
}
/*
* That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread, with audit_cmd_mutex held.
+ * Runs from a separate thread.
*/
-void audit_prune_trees(void)
+static int prune_tree_thread(void *unused)
{
+ mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
@@ -845,16 +833,51 @@ void audit_prune_trees(void)
}
mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+ return 0;
+}
+
+static void audit_schedule_prune(void)
+{
+ kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+}
+
+/*
+ * ... and that one is done if evict_chunk() decides to delay until the end
+ * of syscall. Runs synchronously.
+ */
+void audit_kill_trees(struct list_head *list)
+{
+ mutex_lock(&audit_cmd_mutex);
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(list->next, struct audit_tree, list);
+ kill_rules(victim);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
}
/*
* Here comes the stuff asynchronous to auditctl operations
*/
-/* inode->inotify_mutex is locked */
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
+ struct list_head *postponed = audit_killed_trees();
+ int need_prune = 0;
int n;
if (chunk->dead)
@@ -870,47 +893,59 @@ static void evict_chunk(struct audit_chunk *chunk)
owner->root = NULL;
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
- kill_rules(owner);
- list_move(&owner->list, &prune_list);
- audit_schedule_prune();
+ if (!postponed) {
+ kill_rules(owner);
+ list_move(&owner->list, &prune_list);
+ need_prune = 1;
+ } else {
+ list_move(&owner->list, postponed);
+ }
spin_lock(&hash_lock);
}
list_del_rcu(&chunk->hash);
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
+ if (need_prune)
+ audit_schedule_prune();
mutex_unlock(&audit_filter_mutex);
}
-static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-
- if (mask & IN_IGNORED) {
- evict_chunk(chunk);
- put_inotify_watch(watch);
- }
+ return 0;
}
-static void destroy_watch(struct inotify_watch *watch)
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
- call_rcu(&chunk->head, __put_chunk);
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+
+ evict_chunk(chunk);
+
+ /*
+ * We are guaranteed to have at least one reference to the mark from
+ * either the inode or the caller of fsnotify_destroy_mark().
+ */
+ BUG_ON(atomic_read(&entry->refcnt) < 1);
}
-static const struct inotify_operations rtree_inotify_ops = {
- .handle_event = handle_event,
- .destroy_watch = destroy_watch,
+static const struct fsnotify_ops audit_tree_ops = {
+ .handle_event = audit_tree_handle_event,
+ .freeing_mark = audit_tree_freeing_mark,
};
static int __init audit_tree_init(void)
{
int i;
- rtree_ih = inotify_init(&rtree_inotify_ops);
- if (IS_ERR(rtree_ih))
- audit_panic("cannot initialize inotify handle for rectree watches");
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ if (IS_ERR(audit_tree_group))
+ audit_panic("cannot initialize fsnotify group for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 00000000000..70b4554d2fb
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,519 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
+ * event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * audit_remove_watch(). Additionally, an audit_watch may exist
+ * temporarily to assist in searching existing filter data. Each
+ * audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_watch {
+ atomic_t count; /* reference count */
+ dev_t dev; /* associated superblock device */
+ char *path; /* insertion path */
+ unsigned long ino; /* associated inode number */
+ struct audit_parent *parent; /* associated parent */
+ struct list_head wlist; /* entry in parent->watches list */
+ struct list_head rules; /* anchor for krule->rlist */
+};
+
+struct audit_parent {
+ struct list_head watches; /* anchor for audit_watch->wlist */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_watch_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+static void audit_free_parent(struct audit_parent *parent)
+{
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
+
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(entry, struct audit_parent, mark);
+ audit_free_parent(parent);
+}
+
+static void audit_get_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode. If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+ struct audit_parent *parent = NULL;
+ struct fsnotify_mark *entry;
+
+ entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ if (entry)
+ parent = container_of(entry, struct audit_parent, mark);
+
+ return parent;
+}
+
+void audit_get_watch(struct audit_watch *watch)
+{
+ atomic_inc(&watch->count);
+}
+
+void audit_put_watch(struct audit_watch *watch)
+{
+ if (atomic_dec_and_test(&watch->count)) {
+ WARN_ON(watch->parent);
+ WARN_ON(!list_empty(&watch->rules));
+ kfree(watch->path);
+ kfree(watch);
+ }
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+ list_del(&watch->wlist);
+ audit_put_parent(watch->parent);
+ watch->parent = NULL;
+ audit_put_watch(watch); /* match initial get */
+}
+
+char *audit_watch_path(struct audit_watch *watch)
+{
+ return watch->path;
+}
+
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
+{
+ return (watch->ino != (unsigned long)-1) &&
+ (watch->ino == ino) &&
+ (watch->dev == dev);
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct path *path)
+{
+ struct inode *inode = path->dentry->d_inode;
+ struct audit_parent *parent;
+ int ret;
+
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (unlikely(!parent))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&parent->watches);
+
+ fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ parent->mark.mask = AUDIT_FS_WATCH;
+ ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ if (ret < 0) {
+ audit_free_parent(parent);
+ return ERR_PTR(ret);
+ }
+
+ return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+ struct audit_watch *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (unlikely(!watch))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&watch->rules);
+ atomic_set(&watch->count, 1);
+ watch->path = path;
+ watch->dev = (dev_t)-1;
+ watch->ino = (unsigned long)-1;
+
+ return watch;
+}
+
+/* Translate a watch string to kernel respresentation. */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+ struct audit_watch *watch;
+
+ if (!audit_watch_group)
+ return -EOPNOTSUPP;
+
+ if (path[0] != '/' || path[len-1] == '/' ||
+ krule->listnr != AUDIT_FILTER_EXIT ||
+ op != Audit_equal ||
+ krule->inode_f || krule->watch || krule->tree)
+ return -EINVAL;
+
+ watch = audit_init_watch(path);
+ if (IS_ERR(watch))
+ return PTR_ERR(watch);
+
+ audit_get_watch(watch);
+ krule->watch = watch;
+
+ return 0;
+}
+
+/* Duplicate the given audit watch. The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+ char *path;
+ struct audit_watch *new;
+
+ path = kstrdup(old->path, GFP_KERNEL);
+ if (unlikely(!path))
+ return ERR_PTR(-ENOMEM);
+
+ new = audit_init_watch(path);
+ if (IS_ERR(new)) {
+ kfree(path);
+ goto out;
+ }
+
+ new->dev = old->dev;
+ new->ino = old->ino;
+ audit_get_parent(old->parent);
+ new->parent = old->parent;
+
+out:
+ return new;
+}
+
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "auid=%u ses=%u op=",
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ audit_get_sessionid(current));
+ audit_log_string(ab, op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, w->path);
+ audit_log_key(ab, r->filterkey);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
+ }
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+ const char *dname, dev_t dev,
+ unsigned long ino, unsigned invalidating)
+{
+ struct audit_watch *owatch, *nwatch, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *oentry, *nentry;
+
+ mutex_lock(&audit_filter_mutex);
+ /* Run all of the watches on this parent looking for the one that
+ * matches the given dname */
+ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
+ continue;
+
+ /* If the update involves invalidating rules, do the inode-based
+ * filtering now, so we don't omit records. */
+ if (invalidating && !audit_dummy_context())
+ audit_filter_inodes(current, current->audit_context);
+
+ /* updating ino will likely change which audit_hash_list we
+ * are on so we need a new watch for the new list */
+ nwatch = audit_dupe_watch(owatch);
+ if (IS_ERR(nwatch)) {
+ mutex_unlock(&audit_filter_mutex);
+ audit_panic("error updating watch, skipping");
+ return;
+ }
+ nwatch->dev = dev;
+ nwatch->ino = ino;
+
+ list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+ oentry = container_of(r, struct audit_entry, rule);
+ list_del(&oentry->rule.rlist);
+ list_del_rcu(&oentry->list);
+
+ nentry = audit_dupe_rule(&oentry->rule);
+ if (IS_ERR(nentry)) {
+ list_del(&oentry->rule.list);
+ audit_panic("error updating watch, removing");
+ } else {
+ int h = audit_hash_ino((u32)ino);
+
+ /*
+ * nentry->rule.watch == oentry->rule.watch so
+ * we must drop that reference and set it to our
+ * new watch.
+ */
+ audit_put_watch(nentry->rule.watch);
+ audit_get_watch(nwatch);
+ nentry->rule.watch = nwatch;
+ list_add(&nentry->rule.rlist, &nwatch->rules);
+ list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+ list_replace(&oentry->rule.list,
+ &nentry->rule.list);
+ }
+
+ audit_watch_log_rule_change(r, owatch, "updated rules");
+
+ call_rcu(&oentry->rcu, audit_free_rule_rcu);
+ }
+
+ audit_remove_watch(owatch);
+ goto add_watch_to_parent; /* event applies to a single watch */
+ }
+ mutex_unlock(&audit_filter_mutex);
+ return;
+
+add_watch_to_parent:
+ list_add(&nwatch->wlist, &parent->watches);
+ mutex_unlock(&audit_filter_mutex);
+ return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+ struct audit_watch *w, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *e;
+
+ mutex_lock(&audit_filter_mutex);
+ list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+ list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+ e = container_of(r, struct audit_entry, rule);
+ audit_watch_log_rule_change(r, w, "remove rule");
+ list_del(&r->rlist);
+ list_del(&r->list);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+ }
+ audit_remove_watch(w);
+ }
+ mutex_unlock(&audit_filter_mutex);
+
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
+{
+ struct dentry *d = kern_path_locked(watch->path, parent);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ mutex_unlock(&parent->dentry->d_inode->i_mutex);
+ if (d->d_inode) {
+ /* update watch filter fields */
+ watch->dev = d->d_inode->i_sb->s_dev;
+ watch->ino = d->d_inode->i_ino;
+ }
+ dput(d);
+ return 0;
+}
+
+/* Associate the given rule with an existing parent.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+ struct audit_parent *parent)
+{
+ struct audit_watch *w, *watch = krule->watch;
+ int watch_found = 0;
+
+ BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
+ list_for_each_entry(w, &parent->watches, wlist) {
+ if (strcmp(watch->path, w->path))
+ continue;
+
+ watch_found = 1;
+
+ /* put krule's and initial refs to temporary watch */
+ audit_put_watch(watch);
+ audit_put_watch(watch);
+
+ audit_get_watch(w);
+ krule->watch = watch = w;
+ break;
+ }
+
+ if (!watch_found) {
+ audit_get_parent(parent);
+ watch->parent = parent;
+
+ list_add(&watch->wlist, &parent->watches);
+ }
+ list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
+{
+ struct audit_watch *watch = krule->watch;
+ struct audit_parent *parent;
+ struct path parent_path;
+ int h, ret = 0;
+
+ mutex_unlock(&audit_filter_mutex);
+
+ /* Avoid calling path_lookup under audit_filter_mutex. */
+ ret = audit_get_nd(watch, &parent_path);
+
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+
+ if (ret)
+ return ret;
+
+ /* either find an old parent or attach a new one */
+ parent = audit_find_parent(parent_path.dentry->d_inode);
+ if (!parent) {
+ parent = audit_init_parent(&parent_path);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ goto error;
+ }
+ }
+
+ audit_add_to_parent(krule, parent);
+
+ /* match get in audit_find_parent or audit_init_parent */
+ audit_put_parent(parent);
+
+ h = audit_hash_ino((u32)watch->ino);
+ *list = &audit_inode_hash[h];
+error:
+ path_put(&parent_path);
+ return ret;
+}
+
+void audit_remove_watch_rule(struct audit_krule *krule)
+{
+ struct audit_watch *watch = krule->watch;
+ struct audit_parent *parent = watch->parent;
+
+ list_del(&krule->rlist);
+
+ if (list_empty(&watch->rules)) {
+ audit_remove_watch(watch);
+
+ if (list_empty(&parent->watches)) {
+ audit_get_parent(parent);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
+ audit_put_parent(parent);
+ }
+ }
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname, u32 cookie)
+{
+ struct inode *inode;
+ struct audit_parent *parent;
+
+ parent = container_of(inode_mark, struct audit_parent, mark);
+
+ BUG_ON(group != audit_watch_group);
+
+ switch (data_type) {
+ case (FSNOTIFY_EVENT_PATH):
+ inode = ((struct path *)data)->dentry->d_inode;
+ break;
+ case (FSNOTIFY_EVENT_INODE):
+ inode = (struct inode *)data;
+ break;
+ default:
+ BUG();
+ inode = NULL;
+ break;
+ };
+
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+ else if (mask & (FS_DELETE|FS_MOVED_FROM))
+ audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ audit_remove_parent_watches(parent);
+
+ return 0;
+}
+
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+ .handle_event = audit_watch_handle_event,
+};
+
+static int __init audit_watch_init(void)
+{
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ if (IS_ERR(audit_watch_group)) {
+ audit_watch_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
+ return 0;
+}
+device_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d121d9..8e9bc9c3dbb 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -27,8 +29,10 @@
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
-#include <linux/inotify.h>
+#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -44,36 +48,6 @@
* be written directly provided audit_filter_mutex is held.
*/
-/*
- * Reference counting:
- *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
- * event. Each audit_watch holds a reference to its associated parent.
- *
- * audit_watch: if added to lists, lifetime is from audit_init_watch() to
- * audit_remove_watch(). Additionally, an audit_watch may exist
- * temporarily to assist in searching existing filter data. Each
- * audit_krule holds a reference to its associated watch.
- */
-
-struct audit_parent {
- struct list_head ilist; /* entry in inotify registration list */
- struct list_head watches; /* associated watches */
- struct inotify_watch wdata; /* inotify watch data */
- unsigned flags; /* status flags */
-};
-
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
-
/* Audit filter lists, defined in <linux/audit.h> */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,56 +71,22 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
DEFINE_MUTEX(audit_filter_mutex);
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
-
-void audit_free_parent(struct inotify_watch *i_watch)
-{
- struct audit_parent *parent;
-
- parent = container_of(i_watch, struct audit_parent, wdata);
- WARN_ON(!list_empty(&parent->watches));
- kfree(parent);
-}
-
-static inline void audit_get_watch(struct audit_watch *watch)
-{
- atomic_inc(&watch->count);
-}
-
-static void audit_put_watch(struct audit_watch *watch)
-{
- if (atomic_dec_and_test(&watch->count)) {
- WARN_ON(watch->parent);
- WARN_ON(!list_empty(&watch->rules));
- kfree(watch->path);
- kfree(watch);
- }
-}
-
-static void audit_remove_watch(struct audit_watch *watch)
-{
- list_del(&watch->wlist);
- put_inotify_watch(&watch->parent->wdata);
- watch->parent = NULL;
- audit_put_watch(watch); /* match initial get */
-}
-
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
+ struct audit_krule *erule = &e->rule;
/* some rules don't have associated watches */
- if (e->rule.watch)
- audit_put_watch(e->rule.watch);
- if (e->rule.fields)
- for (i = 0; i < e->rule.field_count; i++) {
- struct audit_field *f = &e->rule.fields[i];
+ if (erule->watch)
+ audit_put_watch(erule->watch);
+ if (erule->fields)
+ for (i = 0; i < erule->field_count; i++) {
+ struct audit_field *f = &erule->fields[i];
kfree(f->lsm_str);
security_audit_rule_free(f->lsm_rule);
}
- kfree(e->rule.fields);
- kfree(e->rule.filterkey);
+ kfree(erule->fields);
+ kfree(erule->filterkey);
kfree(e);
}
@@ -156,50 +96,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
audit_free_rule(e);
}
-/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
-{
- struct audit_parent *parent;
- s32 wd;
-
- parent = kzalloc(sizeof(*parent), GFP_KERNEL);
- if (unlikely(!parent))
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&parent->watches);
- parent->flags = 0;
-
- inotify_init_watch(&parent->wdata);
- /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
- get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata,
- ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
- if (wd < 0) {
- audit_free_parent(&parent->wdata);
- return ERR_PTR(wd);
- }
-
- return parent;
-}
-
-/* Initialize a watch entry. */
-static struct audit_watch *audit_init_watch(char *path)
-{
- struct audit_watch *watch;
-
- watch = kzalloc(sizeof(*watch), GFP_KERNEL);
- if (unlikely(!watch))
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&watch->rules);
- atomic_set(&watch->count, 1);
- watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
-
- return watch;
-}
-
/* Initialize an audit filterlist entry. */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
@@ -260,31 +156,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
return 0;
}
-/* Translate a watch string to kernel respresentation. */
-static int audit_to_watch(struct audit_krule *krule, char *path, int len,
- u32 op)
-{
- struct audit_watch *watch;
-
- if (!audit_ih)
- return -EOPNOTSUPP;
-
- if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
- op != Audit_equal ||
- krule->inode_f || krule->watch || krule->tree)
- return -EINVAL;
-
- watch = audit_init_watch(path);
- if (IS_ERR(watch))
- return PTR_ERR(watch);
-
- audit_get_watch(watch);
- krule->watch = watch;
-
- return 0;
-}
-
static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
@@ -357,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
#endif
/* Common user-space to kernel rule translation. */
-static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
@@ -368,17 +239,19 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
switch(listnr) {
default:
goto exit_err;
- case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
+ if (rule->action == AUDIT_ALWAYS)
+ goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_TASK:
#endif
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
- printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -441,103 +314,84 @@ static u32 audit_to_op(u32 op)
return n;
}
-
-/* Translate struct audit_rule to kernel's rule respresentation.
- * Exists for backward compatibility with userspace. */
-static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+/* check if an audit field is valid */
+static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- struct audit_entry *entry;
- int err = 0;
- int i;
-
- entry = audit_to_entry_common(rule);
- if (IS_ERR(entry))
- goto exit_nofree;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &entry->rule.fields[i];
- u32 n;
-
- n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
-
- /* Support for legacy operators where
- * AUDIT_NEGATE bit signifies != and otherwise assumes == */
- if (n & AUDIT_NEGATE)
- f->op = Audit_not_equal;
- else if (!n)
- f->op = Audit_equal;
- else
- f->op = audit_to_op(n);
-
- entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
-
- f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
- f->val = rule->values[i];
-
- err = -EINVAL;
- if (f->op == Audit_bad)
- goto exit_free;
-
- switch(f->type) {
- default:
- goto exit_free;
- case AUDIT_PID:
- case AUDIT_UID:
- case AUDIT_EUID:
- case AUDIT_SUID:
- case AUDIT_FSUID:
- case AUDIT_GID:
- case AUDIT_EGID:
- case AUDIT_SGID:
- case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- /* bit ops are only useful on syscall args */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
- break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- break;
- /* arch is only allowed to be = or != */
- case AUDIT_ARCH:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
- goto exit_free;
- entry->rule.arch_f = f;
- break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
- goto exit_free;
- break;
- case AUDIT_INODE:
- err = audit_to_inode(&entry->rule, f);
- if (err)
- goto exit_free;
- break;
- }
- }
-
- if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
- entry->rule.inode_f = NULL;
-
-exit_nofree:
- return entry;
+ switch(f->type) {
+ case AUDIT_MSGTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ entry->rule.listnr != AUDIT_FILTER_USER)
+ return -EINVAL;
+ break;
+ };
-exit_free:
- audit_free_rule(entry);
- return ERR_PTR(err);
+ switch(f->type) {
+ default:
+ return -EINVAL;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ case AUDIT_PID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_INODE:
+ /* bit ops are only useful on syscall args */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ return -EINVAL;
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_WATCH:
+ case AUDIT_DIR:
+ case AUDIT_FILTERKEY:
+ break;
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ /* FALL THROUGH */
+ case AUDIT_ARCH:
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ return -EINVAL;
+ break;
+ case AUDIT_FILETYPE:
+ if (f->val & ~S_IFMT)
+ return -EINVAL;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
+ return -EINVAL;
+ break;
+ };
+ return 0;
}
/* Translate struct audit_rule_data to kernel's rule respresentation. */
@@ -551,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
int i;
char *str;
- entry = audit_to_entry_common((struct audit_rule *)data);
+ entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
@@ -568,30 +422,54 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->type = data->fields[i];
f->val = data->values[i];
+ f->uid = INVALID_UID;
+ f->gid = INVALID_GID;
f->lsm_str = NULL;
f->lsm_rule = NULL;
- switch(f->type) {
- case AUDIT_PID:
+
+ /* Support legacy tests for a valid loginuid */
+ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ f->type = AUDIT_LOGINUID_SET;
+ f->val = 0;
+ }
+
+ if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
+ struct pid *pid;
+ rcu_read_lock();
+ pid = find_vpid(f->val);
+ if (!pid) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto exit_free;
+ }
+ f->val = pid_nr(pid);
+ rcu_read_unlock();
+ }
+
+ err = audit_field_valid(entry, f);
+ if (err)
+ goto exit_free;
+
+ err = -EINVAL;
+ switch (f->type) {
+ case AUDIT_LOGINUID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
+ case AUDIT_OBJ_UID:
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
+ case AUDIT_OBJ_GID:
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -616,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM "
- "\'%s\' is invalid\n", str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
err = 0;
}
if (err) {
@@ -655,7 +533,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
break;
case AUDIT_FILTERKEY:
- err = -EINVAL;
if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f->val);
@@ -664,16 +541,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
- goto exit_free;
- break;
- default:
- goto exit_free;
}
}
@@ -684,6 +551,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -699,36 +570,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule.
- * Exists for backward compatibility with userspace. */
-static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
-{
- struct audit_rule *rule;
- int i;
-
- rule = kzalloc(sizeof(*rule), GFP_KERNEL);
- if (unlikely(!rule))
- return NULL;
-
- rule->flags = krule->flags | krule->listnr;
- rule->action = krule->action;
- rule->field_count = krule->field_count;
- for (i = 0; i < rule->field_count; i++) {
- rule->values[i] = krule->fields[i].val;
- rule->fields[i] = krule->fields[i].type;
-
- if (krule->vers_ops == 1) {
- if (krule->fields[i].op == Audit_not_equal)
- rule->fields[i] |= AUDIT_NEGATE;
- } else {
- rule->fields[i] |= audit_ops[krule->fields[i].op];
- }
- }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
-
- return rule;
-}
-
/* Translate kernel rule respresentation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
@@ -766,7 +607,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
break;
case AUDIT_WATCH:
data->buflen += data->values[i] =
- audit_pack_string(&bufp, krule->watch->path);
+ audit_pack_string(&bufp,
+ audit_watch_path(krule->watch));
break;
case AUDIT_DIR:
data->buflen += data->values[i] =
@@ -818,7 +660,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 1;
break;
case AUDIT_WATCH:
- if (strcmp(a->watch->path, b->watch->path))
+ if (strcmp(audit_watch_path(a->watch),
+ audit_watch_path(b->watch)))
return 1;
break;
case AUDIT_DIR:
@@ -831,6 +674,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+ return 1;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -844,32 +704,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 0;
}
-/* Duplicate the given audit watch. The new watch's rules list is initialized
- * to an empty list and wlist is undefined. */
-static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
-{
- char *path;
- struct audit_watch *new;
-
- path = kstrdup(old->path, GFP_KERNEL);
- if (unlikely(!path))
- return ERR_PTR(-ENOMEM);
-
- new = audit_init_watch(path);
- if (IS_ERR(new)) {
- kfree(path);
- goto out;
- }
-
- new->dev = old->dev;
- new->ino = old->ino;
- get_inotify_watch(&old->parent->wdata);
- new->parent = old->parent;
-
-out:
- return new;
-}
-
/* Duplicate LSM field information. The lsm_rule is opaque, so must be
* re-initialized. */
static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -890,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM \'%s\' is "
- "invalid\n", df->lsm_str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
ret = 0;
}
@@ -904,8 +738,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -927,8 +760,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
new->prio = old->prio;
new->buflen = old->buflen;
new->inode_f = old->inode_f;
- new->watch = NULL;
new->field_count = old->field_count;
+
/*
* note that we are OK with not refcounting here; audit_match_tree()
* never dereferences tree and we can't get false positives there
@@ -969,145 +802,14 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
}
}
- if (watch) {
- audit_get_watch(watch);
- new->watch = watch;
+ if (old->watch) {
+ audit_get_watch(old->watch);
+ new->watch = old->watch;
}
return entry;
}
-/* Update inode info in audit rules based on filesystem event. */
-static void audit_update_watch(struct audit_parent *parent,
- const char *dname, dev_t dev,
- unsigned long ino, unsigned invalidating)
-{
- struct audit_watch *owatch, *nwatch, *nextw;
- struct audit_krule *r, *nextr;
- struct audit_entry *oentry, *nentry;
-
- mutex_lock(&audit_filter_mutex);
- list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
- continue;
-
- /* If the update involves invalidating rules, do the inode-based
- * filtering now, so we don't omit records. */
- if (invalidating && current->audit_context)
- audit_filter_inodes(current, current->audit_context);
-
- nwatch = audit_dupe_watch(owatch);
- if (IS_ERR(nwatch)) {
- mutex_unlock(&audit_filter_mutex);
- audit_panic("error updating watch, skipping");
- return;
- }
- nwatch->dev = dev;
- nwatch->ino = ino;
-
- list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
-
- oentry = container_of(r, struct audit_entry, rule);
- list_del(&oentry->rule.rlist);
- list_del_rcu(&oentry->list);
-
- nentry = audit_dupe_rule(&oentry->rule, nwatch);
- if (IS_ERR(nentry)) {
- list_del(&oentry->rule.list);
- audit_panic("error updating watch, removing");
- } else {
- int h = audit_hash_ino((u32)ino);
- list_add(&nentry->rule.rlist, &nwatch->rules);
- list_add_rcu(&nentry->list, &audit_inode_hash[h]);
- list_replace(&oentry->rule.list,
- &nentry->rule.list);
- }
-
- call_rcu(&oentry->rcu, audit_free_rule_rcu);
- }
-
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL,
- AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "auid=%u ses=%u",
- audit_get_loginuid(current),
- audit_get_sessionid(current));
- audit_log_format(ab,
- " op=updated rules specifying path=");
- audit_log_untrustedstring(ab, owatch->path);
- audit_log_format(ab, " with dev=%u ino=%lu\n",
- dev, ino);
- audit_log_format(ab, " list=%d res=1", r->listnr);
- audit_log_end(ab);
- }
- audit_remove_watch(owatch);
- goto add_watch_to_parent; /* event applies to a single watch */
- }
- mutex_unlock(&audit_filter_mutex);
- return;
-
-add_watch_to_parent:
- list_add(&nwatch->wlist, &parent->watches);
- mutex_unlock(&audit_filter_mutex);
- return;
-}
-
-/* Remove all watches & rules associated with a parent that is going away. */
-static void audit_remove_parent_watches(struct audit_parent *parent)
-{
- struct audit_watch *w, *nextw;
- struct audit_krule *r, *nextr;
- struct audit_entry *e;
-
- mutex_lock(&audit_filter_mutex);
- parent->flags |= AUDIT_PARENT_INVALID;
- list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
- list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
- e = container_of(r, struct audit_entry, rule);
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL,
- AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "auid=%u ses=%u",
- audit_get_loginuid(current),
- audit_get_sessionid(current));
- audit_log_format(ab, " op=remove rule path=");
- audit_log_untrustedstring(ab, w->path);
- if (r->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab,
- r->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
- audit_log_format(ab, " list=%d res=1",
- r->listnr);
- audit_log_end(ab);
- }
- list_del(&r->rlist);
- list_del(&r->list);
- list_del_rcu(&e->list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
- }
- audit_remove_watch(w);
- }
- mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-static void audit_inotify_unregister(struct list_head *in_list)
-{
- struct audit_parent *p, *n;
-
- list_for_each_entry_safe(p, n, in_list, ilist) {
- list_del(&p->ilist);
- inotify_rm_watch(audit_ih, &p->wdata);
- /* the unpin matching the pin in audit_do_del_rule() */
- unpin_inotify_watch(&p->wdata);
- }
-}
-
/* Find an existing audit rule.
* Caller must hold audit_filter_mutex to prevent stale rule data. */
static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +847,6 @@ out:
return found;
}
-/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp,
- struct nameidata **ndw)
-{
- struct nameidata *ndparent, *ndwatch;
- int err;
-
- ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
- if (unlikely(!ndparent))
- return -ENOMEM;
-
- ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
- if (unlikely(!ndwatch)) {
- kfree(ndparent);
- return -ENOMEM;
- }
-
- err = path_lookup(path, LOOKUP_PARENT, ndparent);
- if (err) {
- kfree(ndparent);
- kfree(ndwatch);
- return err;
- }
-
- err = path_lookup(path, 0, ndwatch);
- if (err) {
- kfree(ndwatch);
- ndwatch = NULL;
- }
-
- *ndp = ndparent;
- *ndw = ndwatch;
-
- return 0;
-}
-
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
- if (ndp) {
- path_put(&ndp->path);
- kfree(ndp);
- }
- if (ndw) {
- path_put(&ndw->path);
- kfree(ndw);
- }
-}
-
-/* Associate the given rule with an existing parent inotify_watch.
- * Caller must hold audit_filter_mutex. */
-static void audit_add_to_parent(struct audit_krule *krule,
- struct audit_parent *parent)
-{
- struct audit_watch *w, *watch = krule->watch;
- int watch_found = 0;
-
- list_for_each_entry(w, &parent->watches, wlist) {
- if (strcmp(watch->path, w->path))
- continue;
-
- watch_found = 1;
-
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
- audit_put_watch(watch);
-
- audit_get_watch(w);
- krule->watch = watch = w;
- break;
- }
-
- if (!watch_found) {
- get_inotify_watch(&parent->wdata);
- watch->parent = parent;
-
- list_add(&watch->wlist, &parent->watches);
- }
- list_add(&krule->rlist, &watch->rules);
-}
-
-/* Find a matching watch entry, or add this one.
- * Caller must hold audit_filter_mutex. */
-static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
- struct nameidata *ndw)
-{
- struct audit_watch *watch = krule->watch;
- struct inotify_watch *i_watch;
- struct audit_parent *parent;
- int ret = 0;
-
- /* update watch filter fields */
- if (ndw) {
- watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->path.dentry->d_inode->i_ino;
- }
-
- /* The audit_filter_mutex must not be held during inotify calls because
- * we hold it during inotify event callback processing. If an existing
- * inotify watch is found, inotify_find_watch() grabs a reference before
- * returning.
- */
- mutex_unlock(&audit_filter_mutex);
-
- if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
- &i_watch) < 0) {
- parent = audit_init_parent(ndp);
- if (IS_ERR(parent)) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
- return PTR_ERR(parent);
- }
- } else
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- mutex_lock(&audit_filter_mutex);
-
- /* parent was moved before we took audit_filter_mutex */
- if (parent->flags & AUDIT_PARENT_INVALID)
- ret = -ENOENT;
- else
- audit_add_to_parent(krule, parent);
-
- /* match get in audit_init_parent or inotify_find_watch */
- put_inotify_watch(&parent->wdata);
- return ret;
-}
-
static u64 prio_low = ~0ULL/2;
static u64 prio_high = ~0ULL/2 - 1;
@@ -1282,9 +856,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_entry *e;
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
- struct nameidata *ndp = NULL, *ndw = NULL;
struct list_head *list;
- int h, err;
+ int err;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -1296,8 +869,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
- mutex_unlock(&audit_filter_mutex);
if (e) {
+ mutex_unlock(&audit_filter_mutex);
err = -EEXIST;
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
@@ -1305,23 +878,19 @@ static inline int audit_add_rule(struct audit_entry *entry)
goto error;
}
- /* Avoid calling path_lookup under audit_filter_mutex. */
- if (watch) {
- err = audit_get_nd(watch->path, &ndp, &ndw);
- if (err)
- goto error;
- }
-
- mutex_lock(&audit_filter_mutex);
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
- err = audit_add_watch(&entry->rule, ndp, ndw);
+ err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
+ /*
+ * normally audit_add_tree_rule() will free it
+ * on failure
+ */
+ if (tree)
+ audit_put_tree(tree);
goto error;
}
- h = audit_hash_ino((u32)watch->ino);
- list = &audit_inode_hash[h];
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
@@ -1358,11 +927,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- audit_put_nd(ndp, ndw); /* NULL args OK */
return 0;
error:
- audit_put_nd(ndp, ndw); /* NULL args OK */
if (watch)
audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
@@ -1372,10 +939,9 @@ error:
static inline int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+ struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- LIST_HEAD(inotify_list);
int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -1394,29 +960,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
goto out;
}
- watch = e->rule.watch;
- if (watch) {
- struct audit_parent *parent = watch->parent;
-
- list_del(&e->rule.rlist);
-
- if (list_empty(&watch->rules)) {
- audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- /* Put parent on the inotify un-registration
- * list. Grab a reference before releasing
- * audit_filter_mutex, to be released in
- * audit_inotify_unregister().
- * If filesystem is going away, just leave
- * the sucker alone, eviction will take
- * care of it.
- */
- if (pin_inotify_watch(&parent->wdata))
- list_add(&parent->ilist, &inotify_list);
- }
- }
- }
+ if (e->rule.watch)
+ audit_remove_watch_rule(&e->rule);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
@@ -1434,49 +979,17 @@ static inline int audit_del_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- if (!list_empty(&inotify_list))
- audit_inotify_unregister(&inotify_list);
-
out:
- if (tmp_watch)
- audit_put_watch(tmp_watch); /* match initial get */
+ if (watch)
+ audit_put_watch(watch); /* match initial get */
if (tree)
audit_put_tree(tree); /* that's the temporary one */
return ret;
}
-/* List rules using struct audit_rule. Exists for backward
- * compatibility with userspace. */
-static void audit_list(int pid, int seq, struct sk_buff_head *q)
-{
- struct sk_buff *skb;
- struct audit_krule *r;
- int i;
-
- /* This is a blocking read, so use audit_filter_mutex instead of rcu
- * iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
- list_for_each_entry(r, &audit_rules_list[i], list) {
- struct audit_rule *rule;
-
- rule = audit_krule_to_rule(r);
- if (unlikely(!rule))
- break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
- rule, sizeof(*rule));
- if (skb)
- skb_queue_tail(q, skb);
- kfree(rule);
- }
- }
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
- if (skb)
- skb_queue_tail(q, skb);
-}
-
/* List rules using struct audit_rule_data. */
-static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
@@ -1491,24 +1004,25 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
- data, sizeof(*data) + data->buflen);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
+ 0, 1, data,
+ sizeof(*data) + data->buflen);
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
/* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
- char *action, struct audit_krule *rule,
- int res)
+static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
+ uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+ unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1516,104 +1030,47 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
- audit_log_format(ab, " op=%s rule key=", action);
- if (rule->filterkey)
- audit_log_untrustedstring(ab, rule->filterkey);
- else
- audit_log_format(ab, "(null)");
+ audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " op=");
+ audit_log_string(ab, action);
+ audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
audit_log_end(ab);
}
/**
- * audit_receive_filter - apply all rules to the specified message type
+ * audit_rule_change - apply all rules to the specified message type
* @type: audit message type
- * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
+ * @portid: target port id for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
*/
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
- size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+int audit_rule_change(int type, __u32 portid, int seq, void *data,
+ size_t datasz)
{
- struct task_struct *tsk;
- struct audit_netlink_list *dest;
int err = 0;
struct audit_entry *entry;
switch (type) {
- case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- /* We can't just spew out the rules here because we might fill
- * the available socket buffer space and deadlock waiting for
- * auditctl to read from it... which isn't ever going to
- * happen if we're actually running in the context of auditctl
- * trying to _send_ the stuff */
-
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
- if (!dest)
- return -ENOMEM;
- dest->pid = pid;
- skb_queue_head_init(&dest->q);
-
- mutex_lock(&audit_filter_mutex);
- if (type == AUDIT_LIST)
- audit_list(pid, seq, &dest->q);
- else
- audit_list_rules(pid, seq, &dest->q);
- mutex_unlock(&audit_filter_mutex);
-
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
- if (IS_ERR(tsk)) {
- skb_queue_purge(&dest->q);
- kfree(dest);
- err = PTR_ERR(tsk);
- }
- break;
- case AUDIT_ADD:
case AUDIT_ADD_RULE:
- if (type == AUDIT_ADD)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add",
- &entry->rule, !err);
-
+ audit_log_rule_change("add rule", &entry->rule, !err);
if (err)
audit_free_rule(entry);
break;
- case AUDIT_DEL:
case AUDIT_DEL_RULE:
- if (type == AUDIT_DEL)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove",
- &entry->rule, !err);
-
+ audit_log_rule_change("remove rule", &entry->rule, !err);
audit_free_rule(entry);
break;
default:
@@ -1623,6 +1080,46 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return err;
}
+/**
+ * audit_list_rules_send - list the audit rules
+ * @request_skb: skb of request we are replying to (used to target the reply)
+ * @seq: netlink audit message sequence (serial) number
+ */
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
+{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct task_struct *tsk;
+ struct audit_netlink_list *dest;
+ int err = 0;
+
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest->net = get_net(net);
+ dest->portid = portid;
+ skb_queue_head_init(&dest->q);
+
+ mutex_lock(&audit_filter_mutex);
+ audit_list_rules(portid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+
+ return err;
+}
+
int audit_comparator(u32 left, u32 op, u32 right)
{
switch (op) {
@@ -1648,66 +1145,155 @@ int audit_comparator(u32 left, u32 op, u32 right)
}
}
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen)
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
- int dlen, plen;
- const char *p;
+ switch (op) {
+ case Audit_equal:
+ return uid_eq(left, right);
+ case Audit_not_equal:
+ return !uid_eq(left, right);
+ case Audit_lt:
+ return uid_lt(left, right);
+ case Audit_le:
+ return uid_lte(left, right);
+ case Audit_gt:
+ return uid_gt(left, right);
+ case Audit_ge:
+ return uid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
- if (!dname || !path)
- return 1;
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return gid_eq(left, right);
+ case Audit_not_equal:
+ return !gid_eq(left, right);
+ case Audit_lt:
+ return gid_lt(left, right);
+ case Audit_le:
+ return gid_lte(left, right);
+ case Audit_gt:
+ return gid_gt(left, right);
+ case Audit_ge:
+ return gid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+ int plen;
+ const char *p;
- dlen = strlen(dname);
plen = strlen(path);
- if (plen < dlen)
- return 1;
+
+ if (plen == 0)
+ return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
- /* find last path component */
- p = p - dlen + 1;
- if (p < path)
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
return 1;
- else if (p > path) {
- if (*--p != '/')
- return 1;
- else
- p++;
- }
- /* return length of path's directory component */
- if (dirlen)
- *dirlen = p - path;
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
- struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule, int type,
enum audit_state *state)
{
int i;
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ pid_t pid;
int result = 0;
+ u32 sid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(cb->creds.pid, f->op, f->val);
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
- result = audit_comparator(cb->creds.uid, f->op, f->val);
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cb->creds.gid, f->op, f->val);
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(cb->loginuid, f->op, f->val);
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(type, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
+ f->type,
+ f->op,
+ f->lsm_rule,
+ NULL);
+ }
break;
}
@@ -1721,23 +1307,26 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
return 1;
}
-int audit_filter_user(struct netlink_skb_parms *cb)
+int audit_filter_user(int type)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
- int ret = 1;
+ int rc, ret;
+
+ ret = 1; /* Audit by default */
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(cb, &e->rule, &state)) {
- if (state == AUDIT_DISABLED)
+ rc = audit_filter_user_rules(&e->rule, type, &state);
+ if (rc) {
+ if (rc > 0 && state == AUDIT_DISABLED)
ret = 0;
break;
}
}
rcu_read_unlock();
- return ret; /* Audit by default */
+ return ret;
}
int audit_filter_type(int type)
@@ -1772,30 +1361,23 @@ static int update_lsm_rule(struct audit_krule *r)
{
struct audit_entry *entry = container_of(r, struct audit_entry, rule);
struct audit_entry *nentry;
- struct audit_watch *watch;
- struct audit_tree *tree;
int err = 0;
if (!security_audit_rule_known(r))
return 0;
- watch = r->watch;
- tree = r->tree;
- nentry = audit_dupe_rule(r, watch);
+ nentry = audit_dupe_rule(r);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
err = PTR_ERR(nentry);
audit_panic("error updating LSM filters");
- if (watch)
+ if (r->watch)
list_del(&r->rlist);
list_del_rcu(&entry->list);
list_del(&r->list);
} else {
- if (watch) {
- list_add(&nentry->rule.rlist, &watch->rules);
- list_del(&r->rlist);
- } else if (tree)
+ if (r->watch || r->tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
list_replace_rcu(&entry->list, &nentry->list);
list_replace(&r->list, &nentry->rule.list);
@@ -1829,27 +1411,3 @@ int audit_update_lsm_rules(void)
return err;
}
-
-/* Update watch data in audit rules based on inotify events. */
-void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
-{
- struct audit_parent *parent;
-
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev,
- inode->i_ino, 0);
- else if (mask & (IN_DELETE|IN_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
- /* inotify automatically removes the watch and sends IN_IGNORED */
- else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
- audit_remove_parent_watches(parent);
- /* inotify does not remove the watch, so remove it manually */
- else if(mask & IN_MOVE_SELF) {
- audit_remove_parent_watches(parent);
- inotify_remove_watch_locked(audit_ih, i_watch);
- } else if (mask & IN_IGNORED)
- put_inotify_watch(i_watch);
-}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c28..21eae3c05ec 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,13 +42,16 @@
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/socket.h>
#include <linux/mqueue.h>
@@ -64,56 +67,30 @@
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/inotify.h>
#include <linux/capability.h>
+#include <linux/fs_struct.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
#include "audit.h"
-/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES 20
-
-/* Indicates that audit should log the full pathname. */
-#define AUDIT_NAME_FULL -1
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
/* number of audit rules */
int audit_n_rules;
/* determines whether we collect data for signals sent */
int audit_signals;
-struct audit_cap_data {
- kernel_cap_t permitted;
- kernel_cap_t inheritable;
- union {
- unsigned int fE; /* effective bit of a file capability */
- kernel_cap_t effective; /* effective set of a process */
- };
-};
-
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
- *
- * Further, in fs/namei.c:path_lookup() we store the inode and device. */
-struct audit_names {
- const char *name;
- int name_len; /* number of name's characters to log */
- unsigned name_put; /* call __putname() for this name */
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- uid_t uid;
- gid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
-};
-
struct audit_aux_data {
struct audit_aux_data *next;
int type;
@@ -124,18 +101,11 @@ struct audit_aux_data {
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
-struct audit_aux_data_execve {
- struct audit_aux_data d;
- int argc;
- int envc;
- struct mm_struct *mm;
-};
-
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
- uid_t target_auid[AUDIT_AUX_PIDS];
- uid_t target_uid[AUDIT_AUX_PIDS];
+ kuid_t target_auid[AUDIT_AUX_PIDS];
+ kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -150,105 +120,11 @@ struct audit_aux_data_bprm_fcaps {
struct audit_cap_data new_pcap;
};
-struct audit_aux_data_capset {
- struct audit_aux_data d;
- pid_t pid;
- struct audit_cap_data cap;
-};
-
struct audit_tree_refs {
struct audit_tree_refs *next;
struct audit_chunk *c[31];
};
-/* The per-task audit context. */
-struct audit_context {
- int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
- enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
- struct timespec ctime; /* time of syscall entry */
- int major; /* syscall number */
- unsigned long argv[4]; /* syscall arguments */
- int return_valid; /* return code is valid */
- long return_code;/* syscall return code */
- u64 prio;
- int name_count;
- struct audit_names names[AUDIT_NAMES];
- char * filterkey; /* key for rule that triggered record */
- struct path pwd;
- struct audit_context *previous; /* For nested syscalls */
- struct audit_aux_data *aux;
- struct audit_aux_data *aux_pids;
- struct sockaddr_storage *sockaddr;
- size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
- uid_t uid, euid, suid, fsuid;
- gid_t gid, egid, sgid, fsgid;
- unsigned long personality;
- int arch;
-
- pid_t target_pid;
- uid_t target_auid;
- uid_t target_uid;
- unsigned int target_sessionid;
- u32 target_sid;
- char target_comm[TASK_COMM_LEN];
-
- struct audit_tree_refs *trees, *first_trees;
- int tree_count;
-
- int type;
- union {
- struct {
- int nargs;
- long args[6];
- } socketcall;
- struct {
- uid_t uid;
- gid_t gid;
- mode_t mode;
- u32 osid;
- int has_perm;
- uid_t perm_uid;
- gid_t perm_gid;
- mode_t perm_mode;
- unsigned long qbytes;
- } ipc;
- struct {
- mqd_t mqdes;
- struct mq_attr mqstat;
- } mq_getsetattr;
- struct {
- mqd_t mqdes;
- int sigev_signo;
- } mq_notify;
- struct {
- mqd_t mqdes;
- size_t msg_len;
- unsigned int msg_prio;
- struct timespec abs_timeout;
- } mq_sendrecv;
- struct {
- int oflag;
- mode_t mode;
- struct mq_attr attr;
- } mq_open;
- struct {
- pid_t pid;
- struct audit_cap_data cap;
- } capset;
- };
- int fds[2];
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
-};
-
-#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
static inline int open_arg(int flags, int mask)
{
int n = ACC_MODE(flags);
@@ -300,21 +176,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
}
}
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
{
- unsigned index = which & ~S_IFMT;
- mode_t mode = which & S_IFMT;
+ struct audit_names *n;
+ umode_t mode = (umode_t)val;
if (unlikely(!ctx))
return 0;
- if (index >= ctx->name_count)
- return 0;
- if (ctx->names[index].ino == -1)
- return 0;
- if ((ctx->names[index].mode ^ mode) & S_IFMT)
- return 0;
- return 1;
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if ((n->ino != -1) &&
+ ((n->mode & S_IFMT) == mode))
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -328,6 +204,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
*/
#ifdef CONFIG_AUDIT_TREE
+static void audit_set_auditable(struct audit_context *ctx)
+{
+ if (!ctx->prio) {
+ ctx->prio = 1;
+ ctx->current_state = AUDIT_RECORD_CONTEXT;
+ }
+}
+
static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
@@ -428,57 +312,202 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
+static int audit_compare_uid(kuid_t uid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_uid_comparator(uid, f->op, name->uid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_uid_comparator(uid, f->op, n->uid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_compare_gid(kgid_t gid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_gid_comparator(gid, f->op, name->gid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_gid_comparator(gid, f->op, n->gid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+ const struct cred *cred,
+ struct audit_field *f,
+ struct audit_context *ctx,
+ struct audit_names *name)
+{
+ switch (f->val) {
+ /* process to file object comparisons */
+ case AUDIT_COMPARE_UID_TO_OBJ_UID:
+ return audit_compare_uid(cred->uid, name, f, ctx);
+ case AUDIT_COMPARE_GID_TO_OBJ_GID:
+ return audit_compare_gid(cred->gid, name, f, ctx);
+ case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->euid, name, f, ctx);
+ case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->egid, name, f, ctx);
+ case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+ return audit_compare_uid(tsk->loginuid, name, f, ctx);
+ case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->suid, name, f, ctx);
+ case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->sgid, name, f, ctx);
+ case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->fsuid, name, f, ctx);
+ case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->fsgid, name, f, ctx);
+ /* uid comparisons */
+ case AUDIT_COMPARE_UID_TO_AUID:
+ return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+ case AUDIT_COMPARE_UID_TO_EUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->euid);
+ case AUDIT_COMPARE_UID_TO_SUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->suid);
+ case AUDIT_COMPARE_UID_TO_FSUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
+ /* auid comparisons */
+ case AUDIT_COMPARE_AUID_TO_EUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+ case AUDIT_COMPARE_AUID_TO_SUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+ case AUDIT_COMPARE_AUID_TO_FSUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+ /* euid comparisons */
+ case AUDIT_COMPARE_EUID_TO_SUID:
+ return audit_uid_comparator(cred->euid, f->op, cred->suid);
+ case AUDIT_COMPARE_EUID_TO_FSUID:
+ return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
+ /* suid comparisons */
+ case AUDIT_COMPARE_SUID_TO_FSUID:
+ return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
+ /* gid comparisons */
+ case AUDIT_COMPARE_GID_TO_EGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->egid);
+ case AUDIT_COMPARE_GID_TO_SGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->sgid);
+ case AUDIT_COMPARE_GID_TO_FSGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
+ /* egid comparisons */
+ case AUDIT_COMPARE_EGID_TO_SGID:
+ return audit_gid_comparator(cred->egid, f->op, cred->sgid);
+ case AUDIT_COMPARE_EGID_TO_FSGID:
+ return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
+ /* sgid comparison */
+ case AUDIT_COMPARE_SGID_TO_FSGID:
+ return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
+ default:
+ WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
+ return 0;
+ }
+ return 0;
+}
+
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
- * otherwise. */
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time. This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
struct audit_names *name,
- enum audit_state *state)
+ enum audit_state *state,
+ bool task_creation)
{
- const struct cred *cred = get_task_cred(tsk);
- int i, j, need_sid = 1;
+ const struct cred *cred;
+ int i, need_sid = 1;
u32 sid;
+ cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ struct audit_names *n;
int result = 0;
+ pid_t pid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(tsk->pid, f->op, f->val);
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
- ctx->ppid = sys_getppid();
+ ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
case AUDIT_UID:
- result = audit_comparator(cred->uid, f->op, f->val);
+ result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
case AUDIT_EUID:
- result = audit_comparator(cred->euid, f->op, f->val);
+ result = audit_uid_comparator(cred->euid, f->op, f->uid);
break;
case AUDIT_SUID:
- result = audit_comparator(cred->suid, f->op, f->val);
+ result = audit_uid_comparator(cred->suid, f->op, f->uid);
break;
case AUDIT_FSUID:
- result = audit_comparator(cred->fsuid, f->op, f->val);
+ result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cred->gid, f->op, f->val);
+ result = audit_gid_comparator(cred->gid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_group_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_group_p(f->gid);
+ }
break;
case AUDIT_EGID:
- result = audit_comparator(cred->egid, f->op, f->val);
+ result = audit_gid_comparator(cred->egid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_egroup_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_egroup_p(f->gid);
+ }
break;
case AUDIT_SGID:
- result = audit_comparator(cred->sgid, f->op, f->val);
+ result = audit_gid_comparator(cred->sgid, f->op, f->gid);
break;
case AUDIT_FSGID:
- result = audit_comparator(cred->fsgid, f->op, f->val);
+ result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
@@ -501,12 +530,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMAJOR:
- if (name)
- result = audit_comparator(MAJOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -514,12 +545,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMINOR:
- if (name)
- result = audit_comparator(MINOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+ audit_comparator(MINOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+ audit_comparator(MINOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -528,10 +561,34 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_INODE:
if (name)
- result = (name->ino == f->val);
+ result = audit_comparator(name->ino, f->op, f->val);
else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->ino, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_UID:
+ if (name) {
+ result = audit_uid_comparator(name->uid, f->op, f->uid);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_uid_comparator(n->uid, f->op, f->uid)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_GID:
+ if (name) {
+ result = audit_gid_comparator(name->gid, f->op, f->gid);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_gid_comparator(n->gid, f->op, f->gid)) {
++result;
break;
}
@@ -539,9 +596,8 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name && rule->watch->ino != (unsigned long)-1)
- result = (name->dev == rule->watch->dev &&
- name->ino == rule->watch->ino);
+ if (name)
+ result = audit_watch_compare(rule->watch, name->ino, name->dev);
break;
case AUDIT_DIR:
if (ctx)
@@ -550,7 +606,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID:
result = 0;
if (ctx)
- result = audit_comparator(tsk->loginuid, f->op, f->val);
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
@@ -587,11 +646,10 @@ static int audit_filter_rules(struct task_struct *tsk,
name->osid, f->type, f->op,
f->lsm_rule, ctx);
} else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (security_audit_rule_match(
- ctx->names[j].osid,
- f->type, f->op,
- f->lsm_rule, ctx)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (security_audit_rule_match(n->osid, f->type,
+ f->op, f->lsm_rule,
+ ctx)) {
++result;
break;
}
@@ -623,12 +681,12 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
break;
+ case AUDIT_FIELD_COMPARE:
+ result = audit_field_compare(tsk, cred, f, ctx, name);
+ break;
}
-
- if (!result) {
- put_cred(cred);
+ if (!result)
return 0;
- }
}
if (ctx) {
@@ -644,7 +702,6 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
- put_cred(cred);
return 1;
}
@@ -659,7 +716,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
- if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+ &state, true)) {
if (state == AUDIT_RECORD_CONTEXT)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
@@ -670,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
return AUDIT_BUILD_CONTEXT;
}
+static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+{
+ int word, bit;
+
+ if (val > 0xffffffff)
+ return false;
+
+ word = AUDIT_WORD(val);
+ if (word >= AUDIT_BITMASK_SIZE)
+ return false;
+
+ bit = AUDIT_BIT(val);
+
+ return rule->mask[word] & bit;
+}
+
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
@@ -687,13 +761,10 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
-
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state)) {
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return state;
@@ -704,58 +775,61 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_BUILD_CONTEXT;
}
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Given an audit_name check the inode hash table to see if they match.
+ * Called holding the rcu read lock to protect the use of audit_inode_hash
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+ struct audit_names *n,
+ struct audit_context *ctx) {
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (list_empty(list))
+ return 0;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, ctx->major) &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
* collected during syscall processing. We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
* Regarding audit_state, same rules apply as for audit_filter_syscall().
*/
void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
- int i;
- struct audit_entry *e;
- enum audit_state state;
+ struct audit_names *n;
if (audit_pid && tsk->tgid == audit_pid)
return;
rcu_read_lock();
- for (i = 0; i < ctx->name_count; i++) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
- struct audit_names *n = &ctx->names[i];
- int h = audit_hash_ino((u32)n->ino);
- struct list_head *list = &audit_inode_hash[h];
-
- if (list_empty(list))
- continue;
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_filter_inode_name(tsk, n, ctx))
+ break;
}
rcu_read_unlock();
}
-static void audit_set_auditable(struct audit_context *ctx)
-{
- if (!ctx->prio) {
- ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
- }
-}
-
-static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
int return_valid,
- int return_code)
+ long return_code)
{
struct audit_context *context = tsk->audit_context;
- if (likely(!context))
+ if (!context)
return NULL;
context->return_valid = return_valid;
@@ -786,23 +860,30 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
return context;
}
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
static inline void audit_free_names(struct audit_context *context)
{
- int i;
+ struct audit_names *n, *next;
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d"
- " ino_count=%d [NOT freeing]\n",
- __FILE__, __LINE__,
+ int i = 0;
+
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d ino_count=%d"
+ " [NOT freeing]\n", __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
- for (i = 0; i < context->name_count; i++) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
+ list_for_each_entry(n, &context->names_list, list) {
+ pr_err("names[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -813,9 +894,12 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count = 0;
#endif
- for (i = 0; i < context->name_count; i++) {
- if (context->names[i].name && context->names[i].name_put)
- __putname(context->names[i].name);
+ list_for_each_entry_safe(n, next, &context->names_list, list) {
+ list_del(&n->list);
+ if (n->name && n->name_put)
+ final_putname(n->name);
+ if (n->should_free)
+ kfree(n);
}
context->name_count = 0;
path_put(&context->pwd);
@@ -837,21 +921,17 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
-static inline void audit_zero_context(struct audit_context *context,
- enum audit_state state)
-{
- memset(context, 0, sizeof(*context));
- context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
- if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
return NULL;
- audit_zero_context(context, state);
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ INIT_LIST_HEAD(&context->killed_trees);
+ INIT_LIST_HEAD(&context->names_list);
return context;
}
@@ -874,8 +954,10 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED) {
+ clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
return 0;
+ }
if (!(context = audit_alloc_context(state))) {
kfree(key);
@@ -891,91 +973,18 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
- struct audit_context *previous;
- int count = 0;
-
- do {
- previous = context->previous;
- if (previous || (count && count < 10)) {
- ++count;
- printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
- " freeing multiple contexts (%d)\n",
- context->serial, context->major,
- context->name_count, count);
- }
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- free_tree_refs(context);
- audit_free_aux(context);
- kfree(context->filterkey);
- kfree(context->sockaddr);
- kfree(context);
- context = previous;
- } while (context);
- if (count >= 10)
- printk(KERN_ERR "audit: freed %d contexts\n", count);
-}
-
-void audit_log_task_context(struct audit_buffer *ab)
-{
- char *ctx = NULL;
- unsigned len;
- int error;
- u32 sid;
-
- security_task_getsecid(current, &sid);
- if (!sid)
- return;
-
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
- return;
- }
-
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- return;
-
-error_path:
- audit_panic("error in audit_log_task_context");
- return;
-}
-
-EXPORT_SYMBOL(audit_log_task_context);
-
-static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
-{
- char name[sizeof(tsk->comm)];
- struct mm_struct *mm = tsk->mm;
- struct vm_area_struct *vma;
-
- /* tsk == current */
-
- get_task_comm(name, tsk);
- audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, name);
-
- if (mm) {
- down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) &&
- vma->vm_file) {
- audit_log_d_path(ab, "exe=",
- &vma->vm_file->f_path);
- break;
- }
- vma = vma->vm_next;
- }
- up_read(&mm->mmap_sem);
- }
- audit_log_task_context(ab);
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ free_tree_refs(context);
+ audit_free_aux(context);
+ kfree(context->filterkey);
+ kfree(context->sockaddr);
+ audit_proctitle_free(context);
+ kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- uid_t auid, uid_t uid, unsigned int sessionid,
+ kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
{
struct audit_buffer *ab;
@@ -987,14 +996,17 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
if (!ab)
return rc;
- audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
- uid, sessionid);
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid), sessionid);
+ if (sid) {
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
}
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
@@ -1006,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
/*
* to_send and len_sent accounting are very loose estimates. We aren't
* really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundry)
+ * within about 500 bytes (next page boundary)
*
* why snprintf? an int is up to 12 digits long. if we just assumed when
* logging that a[%d]= was going to be 16 characters long we would be wasting
@@ -1023,8 +1035,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
{
char arg_num_len_buf[12];
const char __user *tmp_p = p;
- /* how many digits are in arg_num? 3 is the length of a=\n */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
+ /* how many digits are in arg_num? 5 is the length of ' a=""' */
+ size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
size_t len, len_left, to_send;
size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
unsigned int i, has_cntl = 0, too_long = 0;
@@ -1109,7 +1121,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* so we can be sure nothing was lost.
*/
if ((i == 0) && (too_long))
- audit_log_format(*ab, "a%d_len=%zu ", arg_num,
+ audit_log_format(*ab, " a%d_len=%zu", arg_num,
has_cntl ? 2*len : len);
/*
@@ -1129,15 +1141,14 @@ static int audit_log_single_execve_arg(struct audit_context *context,
buf[to_send] = '\0';
/* actually log it */
- audit_log_format(*ab, "a%d", arg_num);
+ audit_log_format(*ab, " a%d", arg_num);
if (too_long)
audit_log_format(*ab, "[%d]", i);
audit_log_format(*ab, "=");
if (has_cntl)
audit_log_n_hex(*ab, buf, to_send);
else
- audit_log_format(*ab, "\"%s\"", buf);
- audit_log_format(*ab, "\n");
+ audit_log_string(*ab, buf);
p += to_send;
len_left -= to_send;
@@ -1152,20 +1163,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,
}
static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab,
- struct audit_aux_data_execve *axi)
+ struct audit_buffer **ab)
{
- int i;
- size_t len, len_sent = 0;
+ int i, len;
+ size_t len_sent = 0;
const char __user *p;
char *buf;
- if (axi->mm != current->mm)
- return; /* execve failed, no additional info */
-
- p = (const char __user *)axi->mm->arg_start;
+ p = (const char __user *)current->mm->arg_start;
- audit_log_format(*ab, "argc=%d ", axi->argc);
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1175,11 +1182,11 @@ static void audit_log_execve_info(struct audit_context *context,
*/
buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf) {
- audit_panic("out of memory for argv string\n");
+ audit_panic("out of memory for argv string");
return;
}
- for (i = 0; i < axi->argc; i++) {
+ for (i = 0; i < context->execve.argc; i++) {
len = audit_log_single_execve_arg(context, ab, i,
&len_sent, p, buf);
if (len <= 0)
@@ -1189,35 +1196,6 @@ static void audit_log_execve_info(struct audit_context *context,
kfree(buf);
}
-static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
-}
-
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1238,8 +1216,10 @@ static void show_special(struct audit_context *context, int *call_panic)
case AUDIT_IPC: {
u32 osid = context->ipc.osid;
- audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
- context->ipc.uid, context->ipc.gid, context->ipc.mode);
+ audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
+ from_kuid(&init_user_ns, context->ipc.uid),
+ from_kgid(&init_user_ns, context->ipc.gid),
+ context->ipc.mode);
if (osid) {
char *ctx = NULL;
u32 len;
@@ -1255,19 +1235,19 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab,
- "qbytes=%lx ouid=%u ogid=%u mode=%#o",
+ "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
- if (!ab)
- return;
}
break; }
case AUDIT_MQ_OPEN: {
audit_log_format(ab,
- "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+ "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
context->mq_open.oflag, context->mq_open.mode,
context->mq_open.attr.mq_flags,
@@ -1305,31 +1285,78 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
break; }
+ case AUDIT_MMAP: {
+ audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
+ context->mmap.flags);
+ break; }
+ case AUDIT_EXECVE: {
+ audit_log_execve_info(context, &ab);
+ break; }
}
audit_log_end(ab);
}
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
+{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
+ if (!ab)
+ return; /* audit_panic or being filtered */
+
+ audit_log_format(ab, "proctitle=");
+
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
+ }
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
- const struct cred *cred;
int i, call_panic = 0;
struct audit_buffer *ab;
struct audit_aux_data *aux;
- const char *tty;
+ struct audit_names *n;
/* tsk == current */
- context->pid = tsk->pid;
- if (!context->ppid)
- context->ppid = sys_getppid();
- cred = current_cred();
- context->uid = cred->uid;
- context->gid = cred->gid;
- context->euid = cred->euid;
- context->suid = cred->suid;
- context->fsuid = cred->fsuid;
- context->egid = cred->egid;
- context->sgid = cred->sgid;
- context->fsgid = cred->fsgid;
context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1344,39 +1371,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " ppid=%d pid=%d auid=%u uid=%u gid=%u"
- " euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count,
- context->ppid,
- context->pid,
- tsk->loginuid,
- context->uid,
- context->gid,
- context->euid, context->suid, context->fsuid,
- context->egid, context->sgid, context->fsgid, tty,
- tsk->sessionid);
-
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
audit_log_task_info(ab, tsk);
- if (context->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab, context->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
+ audit_log_key(ab, context->filterkey);
audit_log_end(ab);
for (aux = context->aux; aux; aux = aux->next) {
@@ -1387,11 +1391,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
switch (aux->type) {
- case AUDIT_EXECVE: {
- struct audit_aux_data_execve *axi = (void *)aux;
- audit_log_execve_info(context, &ab, axi);
- break; }
-
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1455,71 +1454,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, "cwd=", &context->pwd);
+ audit_log_d_path(ab, " cwd=", &context->pwd);
audit_log_end(ab);
}
}
- for (i = 0; i < context->name_count; i++) {
- struct audit_names *n = &context->names[i];
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- continue; /* audit_panic has been called */
-
- audit_log_format(ab, "item=%d", i);
-
- if (n->name) {
- switch(n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#o"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- n->uid,
- n->gid,
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- audit_log_fcaps(ab, n);
-
- audit_log_end(ab);
+ i = 0;
+ list_for_each_entry(n, &context->names_list, list) {
+ if (n->hidden)
+ continue;
+ audit_log_name(context, n, NULL, i++, &call_panic);
}
+ audit_log_proctitle(tsk, context);
+
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
@@ -1534,12 +1482,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
*
* Called from copy_process and do_exit
*/
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- context = audit_get_context(tsk, 0, 0);
- if (likely(!context))
+ context = audit_take_context(tsk, 0, 0);
+ if (!context)
return;
/* Check for system calls that do not go through the exit
@@ -1549,6 +1497,8 @@ void audit_free(struct task_struct *tsk)
/* that can happen only if we are called from do_exit() */
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit(context, tsk);
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
audit_free_context(context);
}
@@ -1570,7 +1520,7 @@ void audit_free(struct task_struct *tsk)
* will only be written if another part of the kernel requests that it
* be written).
*/
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
@@ -1578,45 +1528,9 @@ void audit_syscall_entry(int arch, int major,
struct audit_context *context = tsk->audit_context;
enum audit_state state;
- if (unlikely(!context))
+ if (!context)
return;
- /*
- * This happens only on certain architectures that make system
- * calls in kernel_thread via the entry.S interface, instead of
- * with direct calls. (If you are porting to a new
- * architecture, hitting this condition can indicate that you
- * got the _exit/_leave calls backward in entry.S.)
- *
- * i386 no
- * x86_64 no
- * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
- *
- * This also happens with vm86 emulation in a non-nested manner
- * (entries without exits), so this case must be caught.
- */
- if (context->in_syscall) {
- struct audit_context *newctx;
-
-#if AUDIT_DEBUG
- printk(KERN_ERR
- "audit(:%d) pid=%d in syscall=%d;"
- " entering syscall=%d\n",
- context->serial, tsk->pid, context->major, major);
-#endif
- newctx = audit_alloc_context(context->state);
- if (newctx) {
- newctx->previous = context;
- context = newctx;
- tsk->audit_context = newctx;
- } else {
- /* If we can't alloc a new context, the best we
- * can do is to leak memory (any pending putname
- * will be lost). The only other alternative is
- * to abandon auditing. */
- audit_zero_context(context, context->state);
- }
- }
BUG_ON(context->in_syscall || context->name_count);
if (!audit_enabled)
@@ -1635,7 +1549,7 @@ void audit_syscall_entry(int arch, int major,
context->prio = 0;
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
}
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED)
return;
context->serial = 0;
@@ -1645,45 +1559,29 @@ void audit_syscall_entry(int arch, int major,
context->ppid = 0;
}
-void audit_finish_fork(struct task_struct *child)
-{
- struct audit_context *ctx = current->audit_context;
- struct audit_context *p = child->audit_context;
- if (!p || !ctx)
- return;
- if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
- return;
- p->arch = ctx->arch;
- p->major = ctx->major;
- memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
- p->ctime = ctx->ctime;
- p->dummy = ctx->dummy;
- p->in_syscall = ctx->in_syscall;
- p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
- p->ppid = current->pid;
- p->prio = ctx->prio;
- p->current_state = ctx->current_state;
-}
-
/**
* audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @success: success value of the syscall
+ * @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
* auditable (either because of the AUDIT_RECORD_CONTEXT state from
- * filtering, or because some other part of the kernel write an audit
+ * filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
{
struct task_struct *tsk = current;
struct audit_context *context;
- context = audit_get_context(tsk, valid, return_code);
+ if (success)
+ success = AUDITSC_SUCCESS;
+ else
+ success = AUDITSC_FAILURE;
- if (likely(!context))
+ context = audit_take_context(tsk, success, return_code);
+ if (!context)
return;
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1692,28 +1590,24 @@ void audit_syscall_exit(int valid, long return_code)
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
- if (context->previous) {
- struct audit_context *new_context = context->previous;
- context->previous = NULL;
- audit_free_context(context);
- tsk->audit_context = new_context;
- } else {
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
- tsk->audit_context = context;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
+
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ audit_free_aux(context);
+ context->aux = NULL;
+ context->aux_pids = NULL;
+ context->target_pid = 0;
+ context->target_sid = 0;
+ context->sockaddr_len = 0;
+ context->type = 0;
+ context->fds[0] = -1;
+ if (context->state != AUDIT_RECORD_CONTEXT) {
+ kfree(context->filterkey);
+ context->filterkey = NULL;
}
+ tsk->audit_context = context;
}
static inline void handle_one(const struct inode *inode)
@@ -1723,7 +1617,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(list_empty(&inode->inotify_watches)))
+ if (likely(hlist_empty(&inode->i_fsnotify_marks)))
return;
context = current->audit_context;
p = context->trees;
@@ -1736,7 +1630,7 @@ static inline void handle_one(const struct inode *inode)
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
- printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
@@ -1766,7 +1660,7 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d->d_inode;
- if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+ if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
@@ -1795,8 +1689,7 @@ retry:
goto retry;
}
/* too bad */
- printk(KERN_WARNING
- "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
@@ -1805,6 +1698,55 @@ retry:
#endif
}
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
+{
+ struct audit_names *aname;
+
+ if (context->name_count < AUDIT_NAMES) {
+ aname = &context->preallocated_names[context->name_count];
+ memset(aname, 0, sizeof(*aname));
+ } else {
+ aname = kzalloc(sizeof(*aname), GFP_NOFS);
+ if (!aname)
+ return NULL;
+ aname->should_free = true;
+ }
+
+ aname->ino = (unsigned long)-1;
+ aname->type = type;
+ list_add_tail(&aname->list, &context->names_list);
+
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ return aname;
+}
+
+/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr)
+ return n->name;
+ }
+ return NULL;
+}
+
/**
* audit_getname - add a name to the list
* @name: name to add
@@ -1812,35 +1754,36 @@ retry:
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void __audit_getname(const char *name)
+void __audit_getname(struct filename *name)
{
struct audit_context *context = current->audit_context;
-
- if (IS_ERR(name) || !name)
- return;
+ struct audit_names *n;
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ pr_err("%s:%d(:%d): ignoring getname(%p)\n",
__FILE__, __LINE__, context->serial, name);
dump_stack();
#endif
return;
}
- BUG_ON(context->name_count >= AUDIT_NAMES);
- context->names[context->name_count].name = name;
- context->names[context->name_count].name_len = AUDIT_NAME_FULL;
- context->names[context->name_count].name_put = 1;
- context->names[context->name_count].ino = (unsigned long)-1;
- context->names[context->name_count].osid = 0;
- ++context->name_count;
- if (!context->pwd.dentry) {
- read_lock(&current->fs->lock);
- context->pwd = current->fs->pwd;
- path_get(&current->fs->pwd);
- read_unlock(&current->fs->lock);
- }
+#if AUDIT_DEBUG
+ /* The filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
+ if (!n)
+ return;
+
+ n->name = name;
+ n->name_len = AUDIT_NAME_FULL;
+ n->name_put = true;
+ name->aname = n;
+
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
}
/* audit_putname - intercept a putname request
@@ -1850,145 +1793,124 @@ void __audit_getname(const char *name)
* then we delay the putname until syscall exit.
* Called from include/linux/fs.h:putname().
*/
-void audit_putname(const char *name)
+void audit_putname(struct filename *name)
{
struct audit_context *context = current->audit_context;
BUG_ON(!context);
- if (!context->in_syscall) {
+ if (!name->aname || !context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ pr_err("%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
- int i;
- for (i = 0; i < context->name_count; i++)
- printk(KERN_ERR "name[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
- }
+ struct audit_names *n;
+ int i = 0;
+
+ list_for_each_entry(n, &context->names_list, list)
+ pr_err("name[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
+ }
#endif
- __putname(name);
+ final_putname(name);
}
#if AUDIT_DEBUG
else {
++context->put_count;
if (context->put_count > context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d"
- " in_syscall=%d putname(%p) name_count=%d"
- " put_count=%d\n",
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
+ " name_count=%d put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
- context->in_syscall, name, context->name_count,
- context->put_count);
+ context->in_syscall, name->name,
+ context->name_count, context->put_count);
dump_stack();
}
}
#endif
}
-static int audit_inc_name_count(struct audit_context *context,
- const struct inode *inode)
-{
- if (context->name_count >= AUDIT_NAMES) {
- if (inode)
- printk(KERN_DEBUG "name_count maxed, losing inode data: "
- "dev=%02x:%02x, inode=%lu\n",
- MAJOR(inode->i_sb->s_dev),
- MINOR(inode->i_sb->s_dev),
- inode->i_ino);
-
- else
- printk(KERN_DEBUG "name_count maxed, losing inode data\n");
- return 1;
- }
- context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
- return 0;
-}
-
-
-static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
- memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
- name->fcap.fE = 0;
- name->fcap_ver = 0;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-
-/* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
/**
- * audit_inode - store the inode and device from a lookup
+ * __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- *
- * Called from fs/namei.c:path_lookup().
+ * @flags: attributes for this particular entry
*/
-void __audit_inode(const char *name, const struct dentry *dentry)
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int flags)
{
- int idx;
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
+ struct audit_names *n;
+ bool parent = flags & AUDIT_INODE_PARENT;
if (!context->in_syscall)
return;
- if (context->name_count
- && context->names[context->name_count-1].name
- && context->names[context->name_count-1].name == name)
- idx = context->name_count - 1;
- else if (context->name_count > 1
- && context->names[context->name_count-2].name
- && context->names[context->name_count-2].name == name)
- idx = context->name_count - 2;
- else {
- /* FIXME: how much do we care about inodes that have no
- * associated name? */
- if (audit_inc_name_count(context, inode))
- return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
+
+ if (!name)
+ goto out_alloc;
+
+#if AUDIT_DEBUG
+ /* The struct filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
+ list_for_each_entry_reverse(n, &context->names_list, list) {
+ /* does the name pointer match? */
+ if (!n->name || n->name->name != name->name)
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
+out_alloc:
+ /* unable to find the name from a previous getname(). Allocate a new
+ * anonymous entry.
+ */
+ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
+ if (!n)
+ return;
+out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ if (flags & AUDIT_INODE_HIDDEN)
+ n->hidden = true;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
- audit_copy_inode(&context->names[idx], dentry, inode);
+ audit_copy_inode(n, dentry, inode);
}
/**
- * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
- * @dentry: dentry being audited
+ * __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -1998,89 +1920,80 @@ void __audit_inode(const char *name, const struct dentry *dentry)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
- const struct inode *parent)
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
{
- int idx;
struct audit_context *context = current->audit_context;
- const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
- int dirlen = 0;
+ const char *dname = dentry->d_name.name;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
if (!context->in_syscall)
return;
if (inode)
handle_one(inode);
- /* determine matching parent */
- if (!dname)
- goto add_names;
-
- /* parent is more likely, look for it first */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
- if (!n->name)
+ /* look for a parent entry first */
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name || n->type != AUDIT_TYPE_PARENT)
continue;
if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- n->name_len = dirlen; /* update parent data in place */
- found_parent = n->name;
- goto add_names;
+ !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ found_parent = n;
+ break;
}
}
- /* no matching parent, look for matching child */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
+ /* is there a matching child entry? */
+ list_for_each_entry(n, &context->names_list, list) {
+ /* can only match entries that have a name */
+ if (!n->name || n->type != type)
+ continue;
- if (!n->name)
+ /* if we found a parent, make sure this one is a child of it */
+ if (found_parent && (n->name != found_parent->name))
continue;
- /* strcmp() is the more likely scenario */
- if (!strcmp(dname, n->name) ||
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- if (inode)
- audit_copy_inode(n, NULL, inode);
- else
- n->ino = (unsigned long)-1;
- found_child = n->name;
- goto add_names;
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ found_child = n;
+ break;
}
}
-add_names:
if (!found_parent) {
- if (audit_inc_name_count(context, parent))
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
+ if (!n)
return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
- audit_copy_inode(&context->names[idx], NULL, parent);
+ audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- if (audit_inc_name_count(context, inode))
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
return;
- idx = context->name_count - 1;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- context->names[idx].name = found_parent;
- context->names[idx].name_len = AUDIT_NAME_FULL;
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- context->names[idx].name_put = 0;
- } else {
- context->names[idx].name = NULL;
+ found_child->name_put = false;
}
-
- if (inode)
- audit_copy_inode(&context->names[idx], NULL, inode);
- else
- context->names[idx].ino = (unsigned long)-1;
}
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2112,47 +2025,88 @@ int auditsc_get_stamp(struct audit_context *ctx,
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid, unsigned int sessionid,
+ int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+
+ if (!audit_enabled)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+ audit_log_end(ab);
+}
+
/**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
*
* Returns 0.
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(kuid_t loginuid)
{
- unsigned int sessionid = atomic_inc_return(&session_id);
- struct audit_context *context = task->audit_context;
+ struct task_struct *task = current;
+ unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ kuid_t oldloginuid;
+ int rc;
- if (context && context->in_syscall) {
- struct audit_buffer *ab;
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (ab) {
- audit_log_format(ab, "login pid=%d uid=%u "
- "old auid=%u new auid=%u"
- " old ses=%u new ses=%u",
- task->pid, task_uid(task),
- task->loginuid, loginuid,
- task->sessionid, sessionid);
- audit_log_end(ab);
- }
- }
task->sessionid = sessionid;
task->loginuid = loginuid;
- return 0;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
}
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
* @mode: mode bits
- * @u_attr: queue attributes
+ * @attr: queue attributes
*
*/
-void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
+void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
struct audit_context *context = current->audit_context;
@@ -2196,7 +2150,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
/**
* __audit_mq_notify - record audit data for a POSIX MQ notify
* @mqdes: MQ descriptor
- * @u_notification: Notification event
+ * @notification: Notification event
*
*/
@@ -2252,7 +2206,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
*
* Called only after audit_ipc_obj().
*/
-void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
struct audit_context *context = current->audit_context;
@@ -2263,44 +2217,31 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
context->ipc.has_perm = 1;
}
-int audit_bprm(struct linux_binprm *bprm)
+void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- if (likely(!audit_enabled || !context || context->dummy))
- return 0;
-
- ax = kmalloc(sizeof(*ax), GFP_KERNEL);
- if (!ax)
- return -ENOMEM;
-
- ax->argc = bprm->argc;
- ax->envc = bprm->envc;
- ax->mm = bprm->mm;
- ax->d.type = AUDIT_EXECVE;
- ax->d.next = context->aux;
- context->aux = (void *)ax;
- return 0;
+ context->type = AUDIT_EXECVE;
+ context->execve.argc = bprm->argc;
}
/**
* audit_socketcall - record audit data for sys_socketcall
- * @nargs: number of args
+ * @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
-void audit_socketcall(int nargs, unsigned long *args)
+int __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return;
-
+ if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
+ return -EINVAL;
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
+ return 0;
}
/**
@@ -2323,13 +2264,10 @@ void __audit_fd_pair(int fd1, int fd2)
*
* Returns 0 for success or NULL context or < 0 on error.
*/
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return 0;
-
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
if (!p)
@@ -2346,7 +2284,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = t->pid;
+ context->target_pid = task_pid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2367,12 +2305,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
- uid_t uid = current_uid(), t_uid = task_uid(t);
+ kuid_t uid = current_uid(), t_uid = task_uid(t);
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = tsk->pid;
- if (tsk->loginuid != -1)
+ audit_sig_pid = task_pid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
@@ -2385,7 +2323,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
- ctx->target_pid = t->tgid;
+ ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
@@ -2406,7 +2344,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
- axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2465,24 +2403,58 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
/**
* __audit_log_capset - store information about the arguments to the capset syscall
- * @pid: target pid of the capset call
* @new: the new credentials
* @old: the old (current) credentials
*
* Record the aguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
-void __audit_log_capset(pid_t pid,
- const struct cred *new, const struct cred *old)
+void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = pid;
+ context->capset.pid = task_pid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
context->type = AUDIT_CAPSET;
}
+void __audit_mmap_fd(int fd, int flags)
+{
+ struct audit_context *context = current->audit_context;
+ context->mmap.fd = fd;
+ context->mmap.flags = flags;
+ context->type = AUDIT_MMAP;
+}
+
+static void audit_log_task(struct audit_buffer *ab)
+{
+ kuid_t auid, uid;
+ kgid_t gid;
+ unsigned int sessionid;
+ struct mm_struct *mm = current->mm;
+
+ auid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ current_uid_gid(&uid, &gid);
+
+ audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_untrustedstring(ab, current->comm);
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
+}
+
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
@@ -2493,10 +2465,6 @@ void __audit_log_capset(pid_t pid,
void audit_core_dumps(long signr)
{
struct audit_buffer *ab;
- u32 sid;
- uid_t auid = audit_get_loginuid(current), uid;
- gid_t gid;
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -2505,23 +2473,33 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- current_uid_gid(&uid, &gid);
- audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
- security_task_getsecid(current, &sid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
+ audit_log_end(ab);
+}
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
- audit_log_format(ab, " pid=%d comm=", current->pid);
- audit_log_untrustedstring(ab, current->comm);
+void __audit_seccomp(unsigned long syscall, long signr, int code)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
audit_log_format(ab, " sig=%ld", signr);
+ audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
+
+struct list_head *audit_killed_trees(void)
+{
+ struct audit_context *ctx = current->audit_context;
+ if (likely(!ctx || !ctx->in_syscall))
+ return NULL;
+ return &ctx->killed_trees;
+}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c..1323360d90e 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
static void backtrace_test_normal(void)
{
- printk("Testing a backtrace from process context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from process context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
- printk("Testing a backtrace from irq context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from irq context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
struct stack_trace trace;
unsigned long entries[8];
- printk("Testing a saved backtrace.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a saved backtrace.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
#else
static void backtrace_test_saved(void)
{
- printk("Saved backtrace test skipped.\n");
+ pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
- printk("====[ backtrace testing ]===========\n");
+ pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
- printk("====[ end of backtrace testing ]====\n");
+ pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c530138183..9fd4246b04b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,19 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
void foo(void)
{
- /* The enum constants to put into include/linux/bounds.h */
+ /* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+ DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+ DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f..a5cf13c018c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,29 +7,25 @@
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
-#include "cred-internals.h"
/*
* Leveraged for setting/resetting capabilities
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
-
EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
@@ -38,7 +34,6 @@ static int __init file_caps_disable(char *str)
return 1;
}
__setup("no_file_caps", file_caps_disable);
-#endif
/*
* More recent versions of libcap are available from:
@@ -48,15 +43,10 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- static int warned;
- if (!warned) {
- char name[sizeof(current->comm)];
-
- printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
- " (legacy support in use)\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
+ get_task_comm(name, current));
}
/*
@@ -77,16 +67,10 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- static int warned;
-
- if (!warned) {
- char name[sizeof(current->comm)];
+ char name[sizeof(current->comm)];
- printk(KERN_INFO "warning: `%s' uses deprecated v2"
- " capabilities in a way that may be insecure.\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
+ get_task_comm(name, current));
}
/*
@@ -137,7 +121,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
if (pid && (pid != task_pid_vnr(current))) {
struct task_struct *target;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
target = find_task_by_vpid(pid);
if (!target)
@@ -145,7 +129,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
else
ret = security_capget(target, pEp, pIp, pPp);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
} else
ret = security_capget(current, pEp, pIp, pPp);
@@ -169,8 +153,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
kernel_cap_t pE, pI, pP;
ret = cap_validate_magic(header, &tocopy);
- if (ret != 0)
- return ret;
+ if ((dataptr == NULL) || (ret != 0))
+ return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
if (get_user(pid, &header->pid))
return -EFAULT;
@@ -204,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
- * unexpectidly fail; the capget/modify/capset aborts
+ * unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
@@ -238,7 +222,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy;
+ unsigned i, tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -255,8 +239,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (pid != 0 && pid != task_pid_vnr(current))
return -EPERM;
- if (copy_from_user(&kdata, data,
- tocopy * sizeof(struct __user_cap_data_struct)))
+ copybytes = tocopy * sizeof(struct __user_cap_data_struct);
+ if (copybytes > sizeof(kdata))
+ return -EFAULT;
+
+ if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
for (i = 0; i < tocopy; i++) {
@@ -280,7 +267,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (ret < 0)
goto error;
- audit_log_capset(pid, new, current_cred());
+ audit_log_capset(new, current_cred());
return commit_creds(new);
@@ -290,7 +277,88 @@ error:
}
/**
- * capable - Determine if the current task has a superior capability in effect
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable(__task_cred(t), ns, cap);
+ rcu_read_unlock();
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+ return has_ns_capability(t, &init_user_ns, cap);
+}
+
+/**
+ * has_ns_capability_noaudit - Does a task have a capability (unaudited)
+ * in a specific user ns.
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ * Do not write an audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability_noaudit(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ rcu_read_unlock();
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited) in the
+ * initial user ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ return has_ns_capability_noaudit(t, &init_user_ns, cap);
+}
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
@@ -299,17 +367,76 @@ error:
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
-int capable(int cap)
+bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
- printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+ pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
- if (security_capable(cap) == 0) {
+ if (security_capable(current_cred(), ns, cap) == 0) {
current->flags |= PF_SUPERPRIV;
- return 1;
+ return true;
}
- return 0;
+ return false;
+}
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns,
+ int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
}
EXPORT_SYMBOL(capable);
+
+/**
+ * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given capability targeted at
+ * its own user namespace and that the given inode's uid and gid are
+ * mapped into the current user namespace.
+ */
+bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+{
+ struct user_namespace *ns = current_user_ns();
+
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c500ca7239b..70776aec256 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
@@ -22,11 +26,16 @@
* distribution for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/fs.h>
+#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
@@ -34,81 +43,135 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
-#include <linux/namei.h>
+#include <linux/hashtable.h>
+#include <linux/pid_namespace.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/kthread.h>
+#include <linux/delay.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
-static DEFINE_MUTEX(cgroup_mutex);
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
-/* Generate an array of cgroup subsystem pointers */
-#define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
-#include <linux/cgroup_subsys.h>
-};
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
*/
-struct cgroupfs_root {
- struct super_block *sb;
-
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
- unsigned long subsys_bits;
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
+#else
+static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
+#endif
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_bits;
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
- /* The root cgroup for this hierarchy */
- struct cgroup top_cgroup;
+#define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
- /* A list running through the active hierarchies */
- struct list_head root_list;
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
- /* Hierarchy-specific flags */
- unsigned long flags;
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
- /* The path to use for release notifications. */
- char release_agent_path[PATH_MAX];
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
+/*
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
+ */
+struct cgroup_root cgrp_dfl_root;
/*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
*/
-static struct cgroupfs_root rootnode;
+static bool cgrp_dfl_root_visible;
+
+/* some controllers are not supported in the default hierarchy */
+static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
+#ifdef CONFIG_CGROUP_DEBUG
+ | (1 << debug_cgrp_id)
+#endif
+ ;
/* The list of hierarchy roots */
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
+
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
@@ -117,16 +180,152 @@ static int root_count;
*/
static int need_forkexit_callback __read_mostly;
+static struct cftype cgroup_base_files[];
+
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned int ss_mask);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgroup_parent(cgrp);
+
+ return cgroup_css(cgrp, ss);
+}
+
/* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
- return test_bit(CGRP_REMOVED, &cgrp->flags);
+ return !(cgrp->self.flags & CSS_ONLINE);
}
-/* bits in struct cgroupfs_root flags field */
-enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
-};
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgroup_parent(cgrp);
+ }
+ return false;
+}
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -141,351 +340,617 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
- /*
- * List running through cg_cgroup_links associated with a
- * cgroup, anchored on cgroup->css_sets
- */
- struct list_head cgrp_link_list;
- /*
- * List running through cg_cgroup_links pointing at a
- * single css_set object, anchored on css_set->cg_links
- */
- struct list_head cg_link_list;
- struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
};
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
-static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static int css_set_count = 1; /* 1 for init_css_set */
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
+
+ do {
+ bool trigger;
-/* hash table for cgroup groups. This improves the performance to
- * find an existing css_set */
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
+
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
int i;
- int index;
- unsigned long tmp = 0UL;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- return &css_set_table[index];
+ return key;
}
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
-static int use_task_css_set_links __read_mostly;
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
-/* When we create or destroy a css_set, the operation simply
- * takes/releases a reference count on all the cgroups referenced
- * by subsystems in this css_set. This can end up multiple-counting
- * some cgroups, but that's OK - the ref-count is just a
- * busy/not-busy indicator; ensuring that we only count each cgroup
- * once would require taking a global lock to ensure that no
- * subsystems moved between hierarchies while we were doing so.
- *
- * Possible TODO: decide at boot time based on the number of
- * registered subsystems and the number of CPUs or NUMA nodes whether
- * it's better for performance to ref-count every subsystem, or to
- * take a global lock and only add one ref count to each hierarchy.
- */
+ lockdep_assert_held(&css_set_rwsem);
-/*
- * unlink a css_set from the list and free it
- */
-static void unlink_css_set(struct css_set *cg)
-{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+ if (!atomic_dec_and_test(&cset->refcount))
+ return;
- hlist_del(&cg->hlist);
+ /* This css_set is dead. unlink it and release cgroup refcounts */
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
+ hash_del(&cset->hlist);
css_set_count--;
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *cgrp = link->cgrp;
+
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+
+ /* @cgrp can't go away while we're holding css_set_rwsem */
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ if (notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
+ }
+
kfree(link);
}
+
+ kfree_rcu(cset, rcu_head);
}
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void put_css_set(struct css_set *cset, bool taskexit)
{
- int i;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
- if (atomic_add_unless(&cg->refcount, -1, 1))
+ if (atomic_add_unless(&cset->refcount, -1, 1))
return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cg->refcount)) {
- write_unlock(&css_set_lock);
- return;
- }
- unlink_css_set(cg);
- write_unlock(&css_set_lock);
- rcu_read_lock();
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
- if (atomic_dec_and_test(&cgrp->count) &&
- notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- }
- rcu_read_unlock();
- kfree(cg);
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
}
/*
* refcounted get/put for css_set objects
*/
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
{
- atomic_inc(&cg->refcount);
+ atomic_inc(&cset->refcount);
}
-static inline void put_css_set(struct css_set *cg)
+/**
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
+ struct cgroup *new_cgrp,
+ struct cgroup_subsys_state *template[])
{
- __put_css_set(cg, 0);
-}
+ struct list_head *l1, *l2;
-static inline void put_css_set_taskexit(struct css_set *cg)
-{
- __put_css_set(cg, 1);
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
+ return false;
+
+ /*
+ * Compare cgroup pointers in order to distinguish between
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
+ */
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
+ while (1) {
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
+
+ l1 = l1->next;
+ l2 = l2->next;
+ /* See if we reached the end - both lists are equal length. */
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
+ break;
+ } else {
+ BUG_ON(l2 == &old_cset->cgrp_links);
+ }
+ /* Locate the cgroups associated with these links. */
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
+ /* Hierarchies should be linked in the same order. */
+ BUG_ON(cgrp1->root != cgrp2->root);
+
+ /*
+ * If this hierarchy is the hierarchy of the cgroup
+ * that's changing, then we need to check that this
+ * css_set points to the new cgroup; if it's any other
+ * hierarchy, then this css_set should point to the
+ * same cgroup as the old css_set.
+ */
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
+ return false;
+ } else {
+ if (cgrp1 != cgrp2)
+ return false;
+ }
+ }
+ return true;
}
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
*/
-static struct css_set *find_existing_css_set(
- struct css_set *oldcg,
- struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
int i;
- struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
- struct css_set *cg;
-
- /* Built the set of subsystem state objects that we want to
- * see in the new css_set */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- if (root->subsys_bits & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgrp->subsys[i];
+
+ /*
+ * Build the set of subsystem state objects that we want to see in the
+ * new css_set. while subsystems can change globally, the entries here
+ * won't change, so no need for locking.
+ */
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
- template[i] = oldcg->subsys[i];
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
- if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
- /* All subsystems matched */
- return cg;
- }
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
+ continue;
+
+ /* This css_set matches what we need */
+ return cset;
}
/* No existing cgroup group matched */
return NULL;
}
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+ struct cgrp_cset_link *link, *tmp_link;
- list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
- list_del(&link->cgrp_link_list);
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
kfree(link);
}
}
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
*/
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
int i;
- INIT_LIST_HEAD(tmp);
+
+ INIT_LIST_HEAD(tmp_links);
+
for (i = 0; i < count; i++) {
- link = kmalloc(sizeof(*link), GFP_KERNEL);
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
- free_cg_links(tmp);
+ free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
- list_add(&link->cgrp_link_list, tmp);
+ list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
-static void link_css_set(struct list_head *tmp_cg_links,
- struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
- BUG_ON(list_empty(tmp_cg_links));
- link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
- cgrp_link_list);
- link->cg = cg;
- list_move(&link->cgrp_link_list, &cgrp->css_sets);
- list_add(&link->cg_link_list, &cg->cg_links);
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
+ link->cgrp = cgrp;
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
+ list_move(&link->cset_link, &cgrp->cset_links);
+
+ /*
+ * Always add links to the tail of the list so that the list
+ * is sorted by order of hierarchy creation
+ */
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
}
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
*/
-static struct css_set *find_css_set(
- struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp)
{
- struct css_set *res;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
- int i;
-
- struct list_head tmp_cg_links;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct css_set *cset;
+ struct list_head tmp_links;
+ struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid;
- struct hlist_head *hhead;
+ lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
- res = find_existing_css_set(oldcg, cgrp, template);
- if (res)
- get_css_set(res);
- read_unlock(&css_set_lock);
+ down_read(&css_set_rwsem);
+ cset = find_existing_css_set(old_cset, cgrp, template);
+ if (cset)
+ get_css_set(cset);
+ up_read(&css_set_rwsem);
- if (res)
- return res;
+ if (cset)
+ return cset;
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
+ cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ if (!cset)
return NULL;
- /* Allocate all the cg_cgroup_link objects that we'll need */
- if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
- kfree(res);
+ /* Allocate all the cgrp_cset_link objects that we'll need */
+ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+ kfree(cset);
return NULL;
}
- atomic_set(&res->refcount, 1);
- INIT_LIST_HEAD(&res->cg_links);
- INIT_LIST_HEAD(&res->tasks);
- INIT_HLIST_NODE(&res->hlist);
+ atomic_set(&cset->refcount, 1);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
+ INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
- memcpy(res->subsys, template, sizeof(res->subsys));
+ memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = res->subsys[i]->cgroup;
- struct cgroup_subsys *ss = subsys[i];
- atomic_inc(&cgrp->count);
- /*
- * We want to add a link once per cgroup, so we
- * only do it for the first subsystem in each
- * hierarchy
- */
- if (ss->root->subsys_list.next == &ss->sibling)
- link_css_set(&tmp_cg_links, res, cgrp);
+ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == cgrp->root)
+ c = cgrp;
+ link_css_set(&tmp_links, cset, c);
}
- if (list_empty(&rootnode.subsys_list))
- link_css_set(&tmp_cg_links, res, dummytop);
- BUG_ON(!list_empty(&tmp_cg_links));
+ BUG_ON(!list_empty(&tmp_links));
css_set_count++;
- /* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ /* Add @cset to the hash table */
+ key = css_set_hash(cset->subsys);
+ hash_add(css_set_table, &cset->hlist, key);
+
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
+ up_write(&css_set_rwsem);
+
+ return cset;
+}
+
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+{
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+
+ return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+
+ /*
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
+ */
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
+ } else {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
- write_unlock(&css_set_lock);
+ if (c->root == root) {
+ res = c;
+ break;
+ }
+ }
+ }
+ BUG_ON(!res);
return res;
}
/*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
@@ -511,338 +976,449 @@ static struct css_set *find_css_set(
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
- * need a special hack to ensure that top_cgroup cannot be deleted.
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
- * another. It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
+ * need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+static const struct file_operations proc_cgroupstats_operations;
+
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- mutex_lock(&cgroup_mutex);
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
/**
- * cgroup_unlock - release lock on cgroup changes
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
*
- * Undo the lock taken in a previous cgroup_lock() call.
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
*/
-void cgroup_unlock(void)
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- mutex_unlock(&cgroup_mutex);
-}
+ umode_t mode = 0;
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
+ if (cft->mode)
+ return cft->mode;
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp);
-static struct inode_operations cgroup_dir_inode_operations;
-static struct file_operations proc_cgroupstats_operations;
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
+ if (cft->write_u64 || cft->write_s64 || cft->write)
+ mode |= S_IWUSR;
-static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
-{
- struct inode *inode = new_inode(sb);
-
- if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
+ return mode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static void cgroup_get(struct cgroup *cgrp)
{
- struct cgroup_subsys *ss;
- for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy)
- ss->pre_destroy(ss, cgrp);
- return;
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
}
-static void free_cgroup_rcu(struct rcu_head *obj)
+static void cgroup_put(struct cgroup *cgrp)
{
- struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
- kfree(cgrp);
+ css_put(&cgrp->self);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
+ */
+static void cgroup_kn_unlock(struct kernfs_node *kn)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
-
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->destroy(ss, cgrp);
+ struct cgroup *cgrp;
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ mutex_unlock(&cgroup_mutex);
- call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
- }
- iput(inode);
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
}
-static void remove_dir(struct dentry *d)
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
{
- struct dentry *parent = dget(d->d_parent);
+ struct cgroup *cgrp;
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
+
+ cgroup_kn_unlock(kn);
+ return NULL;
}
-static void cgroup_clear_directory(struct dentry *dentry)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
- struct list_head *node;
+ char name[CGROUP_FILE_NAME_MAX];
- BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dcache_lock);
- node = dentry->d_subdirs.next;
- while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
- list_del_init(node);
- if (d->d_inode) {
- /* This should never be called on a cgroup
- * directory with child cgroups */
- BUG_ON(d->d_inode->i_mode & S_IFDIR);
- d = dget_locked(d);
- spin_unlock(&dcache_lock);
- d_delete(d);
- simple_unlink(dentry->d_inode, d);
- dput(d);
- spin_lock(&dcache_lock);
- }
- node = dentry->d_subdirs.next;
- }
- spin_unlock(&dcache_lock);
+ lockdep_assert_held(&cgroup_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
-/*
- * NOTE : the dentry must have been dget()'ed
+/**
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
*/
-static void cgroup_d_remove_dir(struct dentry *dentry)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- cgroup_clear_directory(dentry);
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- spin_lock(&dcache_lock);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dcache_lock);
- remove_dir(dentry);
+ if (!(subsys_mask & (1 << i)))
+ continue;
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
+ }
}
-static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long final_bits)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
- unsigned long added_bits, removed_bits;
- struct cgroup *cgrp = &root->top_cgroup;
- int i;
+ struct cgroup_subsys *ss;
+ unsigned int tmp_ss_mask;
+ int ssid, i, ret;
- removed_bits = root->actual_subsys_bits & ~final_bits;
- added_bits = final_bits & ~root->actual_subsys_bits;
- /* Check that any added subsystems are currently free */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
- struct cgroup_subsys *ss = subsys[i];
- if (!(bit & added_bits))
+ lockdep_assert_held(&cgroup_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (!(ss_mask & (1 << ssid)))
continue;
- if (ss->root != &rootnode) {
- /* Subsystem isn't free */
+
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ return -EBUSY;
+
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
- }
}
- /* Currently we don't handle adding/removing subsystems when
- * any child cgroups exist. This is theoretically supportable
- * but involves complex error handling, so it's being left until
- * later */
- if (root->number_of_cgroups > 1)
- return -EBUSY;
+ /* skip creating root files on dfl_root for inhibited subsystems */
+ tmp_ss_mask = ss_mask;
+ if (dst_root == &cgrp_dfl_root)
+ tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- /* Process each subsystem */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- unsigned long bit = 1UL << i;
- if (bit & added_bits) {
- /* We're binding this subsystem to this hierarchy */
- BUG_ON(cgrp->subsys[i]);
- BUG_ON(!dummytop->subsys[i]);
- BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
- mutex_lock(&ss->hierarchy_mutex);
- cgrp->subsys[i] = dummytop->subsys[i];
- cgrp->subsys[i]->cgroup = cgrp;
- list_move(&ss->sibling, &root->subsys_list);
- ss->root = root;
- if (ss->bind)
- ss->bind(ss, cgrp);
- mutex_unlock(&ss->hierarchy_mutex);
- } else if (bit & removed_bits) {
- /* We're removing this subsystem */
- BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
- mutex_lock(&ss->hierarchy_mutex);
- if (ss->bind)
- ss->bind(ss, dummytop);
- dummytop->subsys[i]->cgroup = dummytop;
- cgrp->subsys[i] = NULL;
- subsys[i]->root = &rootnode;
- list_move(&ss->sibling, &rootnode.subsys_list);
- mutex_unlock(&ss->hierarchy_mutex);
- } else if (bit & final_bits) {
- /* Subsystem state should already exist */
- BUG_ON(!cgrp->subsys[i]);
- } else {
- /* Subsystem state shouldn't exist */
- BUG_ON(cgrp->subsys[i]);
+ ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+ if (ret) {
+ if (dst_root != &cgrp_dfl_root)
+ return ret;
+
+ /*
+ * Rebinding back to the default root is not allowed to
+ * fail. Using both default and non-default roots should
+ * be rare. Moving subsystems back and forth even more so.
+ * Just warn about it and continue.
+ */
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
}
- root->subsys_bits = root->actual_subsys_bits = final_bits;
- synchronize_rcu();
+ /*
+ * Nothing can fail from this point on. Remove files for the
+ * removed subsystems and rebind each subsystem.
+ */
+ for_each_subsys(ss, ssid)
+ if (ss_mask & (1 << ssid))
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+
+ for_each_subsys(ss, ssid) {
+ struct cgroup_root *src_root;
+ struct cgroup_subsys_state *css;
+ struct css_set *cset;
+
+ if (!(ss_mask & (1 << ssid)))
+ continue;
+
+ src_root = ss->root;
+ css = cgroup_css(&src_root->cgrp, ss);
+
+ WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+
+ RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+ rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ ss->root = dst_root;
+ css->cgroup = &dst_root->cgrp;
+
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root)
+ dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+
+ if (ss->bind)
+ ss->bind(css);
+ }
+
+ kernfs_activate(dst_root->cgrp.kn);
return 0;
}
-static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
-
- mutex_lock(&cgroup_mutex);
- for_each_subsys(root, ss)
- seq_printf(seq, ",%s", ss->name);
- if (test_bit(ROOT_NOPREFIX, &root->flags))
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->name);
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+ seq_puts(seq, ",sane_behavior");
+ if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- mutex_unlock(&cgroup_mutex);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
+ if (strlen(root->name))
+ seq_printf(seq, ",name=%s", root->name);
return 0;
}
struct cgroup_sb_opts {
- unsigned long subsys_bits;
- unsigned long flags;
+ unsigned int subsys_mask;
+ unsigned int flags;
char *release_agent;
+ bool cpuset_clone_children;
+ char *name;
+ /* User explicitly requested empty subsystem */
+ bool none;
};
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
- struct cgroup_sb_opts *opts)
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
- char *token, *o = data ?: "all";
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
+ unsigned int mask = -1U;
+ struct cgroup_subsys *ss;
+ int i;
- opts->subsys_bits = 0;
- opts->flags = 0;
- opts->release_agent = NULL;
+#ifdef CONFIG_CPUSETS
+ mask = ~(1U << cpuset_cgrp_id);
+#endif
+
+ memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
+ if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
+ continue;
+ }
if (!strcmp(token, "all")) {
- /* Add all non-disabled subsystems */
- int i;
- opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (!ss->disabled)
- opts->subsys_bits |= 1ul << i;
- }
- } else if (!strcmp(token, "noprefix")) {
- set_bit(ROOT_NOPREFIX, &opts->flags);
- } else if (!strncmp(token, "release_agent=", 14)) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
+ opts->flags |= CGRP_ROOT_NOPREFIX;
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
- opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
- opts->release_agent[PATH_MAX - 1] = 0;
- } else {
- struct cgroup_subsys *ss;
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- ss = subsys[i];
- if (!strcmp(token, ss->name)) {
- if (!ss->disabled)
- set_bit(i, &opts->subsys_bits);
- break;
- }
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
+ const char *name = token + 5;
+ /* Can't specify an empty name */
+ if (!strlen(name))
+ return -EINVAL;
+ /* Must match [\w.-]+ */
+ for (i = 0; i < strlen(name); i++) {
+ char c = name[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return -EINVAL;
}
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(name,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
+
+ continue;
}
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ opts->subsys_mask |= (1 << i);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
}
- /* We can't have an empty hierarchy */
- if (!opts->subsys_bits)
+ /* Consistency checks */
+
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ return -EINVAL;
+ }
+ } else {
+ /*
+ * If the 'all' option was specified select all the
+ * subsystems, otherwise if 'none', 'name=' and a subsystem
+ * name options were not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So
+ * all empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+ }
+
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+ return -EINVAL;
+
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_mask && opts->none)
return -EINVAL;
return 0;
}
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
+ unsigned int added_mask, removed_mask;
+
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: remount is not allowed\n");
+ return -EINVAL;
+ }
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */
@@ -850,1194 +1426,2418 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
- /* Don't allow flags to change at remount */
- if (opts.flags != root->flags) {
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ (opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+ root->flags & CGRP_ROOT_OPTION_MASK, root->name);
ret = -EINVAL;
goto out_unlock;
}
- ret = rebind_subsystems(root, opts.subsys_bits);
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
- /* (re)populate subsystem files */
- if (!ret)
- cgroup_populate_dir(cgrp);
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
+
+ rebind_subsystems(&cgrp_dfl_root, removed_mask);
- if (opts.release_agent)
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
- if (opts.release_agent)
- kfree(opts.release_agent);
+ kfree(opts.release_agent);
+ kfree(opts.name);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
}
-static struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
+
+static void cgroup_enable_task_cg_lists(void)
+{
+ struct task_struct *p, *g;
+
+ down_write(&css_set_rwsem);
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING)) {
+ struct css_set *cset = task_css_set(p);
+
+ list_add(&p->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
+}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
- INIT_LIST_HEAD(&cgrp->sibling);
- INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->css_sets);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&cgrp->pids_mutex);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
+
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+ init_waitqueue_head(&cgrp->offline_waitq);
}
-static void init_cgroup_root(struct cgroupfs_root *root)
+
+static void init_cgroup_root(struct cgroup_root *root,
+ struct cgroup_sb_opts *opts)
{
- struct cgroup *cgrp = &root->top_cgroup;
- INIT_LIST_HEAD(&root->subsys_list);
+ struct cgroup *cgrp = &root->cgrp;
+
INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
+ atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
- cgrp->top_cgroup = cgrp;
init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
+
+ root->flags = opts->flags;
+ if (opts->release_agent)
+ strcpy(root->release_agent_path, opts->release_agent);
+ if (opts->name)
+ strcpy(root->name, opts->name);
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_test_super(struct super_block *sb, void *data)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
- struct cgroupfs_root *new = data;
- struct cgroupfs_root *root = sb->s_fs_info;
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct css_set *cset;
+ int i, ret;
- /* First check subsystems */
- if (new->subsys_bits != root->subsys_bits)
- return 0;
+ lockdep_assert_held(&cgroup_mutex);
- /* Next check flags */
- if (new->flags != root->flags)
- return 0;
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
- return 1;
-}
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out;
-static int cgroup_set_super(struct super_block *sb, void *data)
-{
- int ret;
- struct cgroupfs_root *root = data;
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ if (ret)
+ goto cancel_ref;
- ret = set_anon_super(sb, NULL);
+ ret = cgroup_init_root_id(root);
if (ret)
- return ret;
+ goto cancel_ref;
+
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
- sb->s_fs_info = root;
- root->sb = sb;
+ ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (ret)
+ goto destroy_root;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CGROUP_SUPER_MAGIC;
- sb->s_op = &cgroup_ops;
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto destroy_root;
- return 0;
-}
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
-static int cgroup_get_rootdir(struct super_block *sb)
-{
- struct inode *inode =
- cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
- struct dentry *dentry;
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
- if (!inode)
- return -ENOMEM;
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &cgroup_dir_inode_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- dentry = d_alloc_root(inode);
- if (!dentry) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = dentry;
- return 0;
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
+
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
}
-static int cgroup_get_sb(struct file_system_type *fs_type,
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+ void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
+ struct cgroup_root *root;
struct cgroup_sb_opts opts;
- int ret = 0;
- struct super_block *sb;
- struct cgroupfs_root *root;
- struct list_head tmp_cg_links;
+ struct dentry *dentry;
+ int ret;
+ int i;
+ bool new_sb;
+
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- if (ret) {
- if (opts.release_agent)
- kfree(opts.release_agent);
- return ret;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- if (opts.release_agent)
- kfree(opts.release_agent);
- return -ENOMEM;
- }
+ if (ret)
+ goto out_unlock;
- init_cgroup_root(root);
- root->subsys_bits = opts.subsys_bits;
- root->flags = opts.flags;
- if (opts.release_agent) {
- strcpy(root->release_agent_path, opts.release_agent);
- kfree(opts.release_agent);
+ /* look for a matching existing root */
+ if (!opts.subsys_mask && !opts.none && !opts.name) {
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ ret = 0;
+ goto out_unlock;
}
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
- if (IS_ERR(sb)) {
- kfree(root);
- return PTR_ERR(sb);
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
}
- if (sb->s_fs_info != root) {
- /* Reusing an existing superblock */
- BUG_ON(sb->s_root == NULL);
- kfree(root);
- root = NULL;
- } else {
- /* New superblock */
- struct cgroup *root_cgrp = &root->top_cgroup;
- struct inode *inode;
- int i;
-
- BUG_ON(sb->s_root != NULL);
+ for_each_root(root) {
+ bool name_match = false;
- ret = cgroup_get_rootdir(sb);
- if (ret)
- goto drop_new_super;
- inode = sb->s_root->d_inode;
+ if (root == &cgrp_dfl_root)
+ continue;
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
/*
- * We're accessing css_set_count without locking
- * css_set_lock here, but that's OK - it can only be
- * increased by someone holding cgroup_lock, and
- * that's us. The worst that can happen is that we
- * have some link structures left over
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
*/
- ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
+ if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ } else {
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+ }
}
- ret = rebind_subsystems(root, root->subsys_bits);
- if (ret == -EBUSY) {
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto free_cg_links;
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
}
- /* EBUSY should be the only error here */
- BUG_ON(ret);
+ ret = 0;
+ goto out_unlock;
+ }
- list_add(&root->root_list, &roots);
- root_count++;
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- sb->s_root->d_fsdata = root_cgrp;
- root->top_cgroup.dentry = sb->s_root;
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- /* Link the top cgroup in this hierarchy into all
- * the css_set objects */
- write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
+ init_cgroup_root(root, &opts);
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
- write_unlock(&css_set_lock);
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
- free_cg_links(&tmp_cg_links);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(opts.release_agent);
+ kfree(opts.name);
- BUG_ON(!list_empty(&root_cgrp->sibling));
- BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
+ if (ret)
+ return ERR_PTR(ret);
- cgroup_populate_dir(root_cgrp);
- mutex_unlock(&inode->i_mutex);
- mutex_unlock(&cgroup_mutex);
- }
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
+ cgroup_put(&root->cgrp);
- simple_set_mnt(mnt, sb);
- return 0;
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
+ }
- free_cg_links:
- free_cg_links(&tmp_cg_links);
- drop_new_super:
- up_write(&sb->s_umount);
- deactivate_super(sb);
- return ret;
+ return dentry;
}
-static void cgroup_kill_sb(struct super_block *sb) {
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
- int ret;
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- BUG_ON(!root);
+ /*
+ * If @root doesn't have any mounts or children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
+ */
+ if (css_has_online_children(&root->cgrp.self) ||
+ root == &cgrp_dfl_root)
+ cgroup_put(&root->cgrp);
+ else
+ percpu_ref_kill(&root->cgrp.self.refcnt);
- BUG_ON(root->number_of_cgroups != 1);
- BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
+ kernfs_kill_sb(sb);
+}
- mutex_lock(&cgroup_mutex);
+static struct file_system_type cgroup_fs_type = {
+ .name = "cgroup",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
- /* Rebind all subsystems back to the default hierarchy */
- ret = rebind_subsystems(root, 0);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
+static struct kobject *cgroup_kobj;
- /*
- * Release all the links from css_sets to this hierarchy's
- * root cgroup
- */
- write_lock(&css_set_lock);
+/**
+ * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
+ * @task: target task
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Determine @task's cgroup on the first (the one with the lowest non-zero
+ * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
+ * function grabs cgroup_mutex and shouldn't be used inside locks used by
+ * cgroup controller callbacks.
+ *
+ * Return value is the same as kernfs_path().
+ */
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+{
+ struct cgroup_root *root;
+ struct cgroup *cgrp;
+ int hierarchy_id = 1;
+ char *path = NULL;
- list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
- cgrp_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- kfree(link);
- }
- write_unlock(&css_set_lock);
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- root_count--;
+ root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
+
+ if (root) {
+ cgrp = task_cgroup_from_root(task, root);
+ path = cgroup_path(cgrp, buf, buflen);
+ } else {
+ /* if no hierarchy exists, everyone is in "/" */
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
}
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
-
- kill_litter_super(sb);
- kfree(root);
+ return path;
}
+EXPORT_SYMBOL_GPL(task_cgroup_path);
-static struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .get_sb = cgroup_get_sb,
- .kill_sb = cgroup_kill_sb,
+/* used to track tasks and other necessary states during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- return dentry->d_fsdata;
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
-static inline struct cftype *__d_cft(struct dentry *dentry)
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- return dentry->d_fsdata;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
+
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
+
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
+
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
}
/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
*/
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+static void cgroup_task_migrate(struct cgroup *old_cgrp,
+ struct task_struct *tsk,
+ struct css_set *new_cset)
{
- char *start;
- struct dentry *dentry = rcu_dereference(cgrp->dentry);
+ struct css_set *old_cset;
- if (!dentry || cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
- return 0;
- }
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
- start = buf + buflen;
+ /*
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
+ old_cset = task_css_set(tsk);
- *--start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
- if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, cgrp->dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
- dentry = rcu_dereference(cgrp->dentry);
- if (!cgrp->parent)
- continue;
- if (--start < buf)
- return -ENAMETOOLONG;
- *start = '/';
+ get_css_set(new_cset);
+ rcu_assign_pointer(tsk->cgroups, new_cset);
+
+ /*
+ * Use move_tail so that cgroup_taskset_first() still returns the
+ * leader after migration. This works because cgroup_migrate()
+ * ensures that the dst_cset of the leader is the first on the
+ * tset's dst_csets list.
+ */
+ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+
+ /*
+ * We just gained a reference on old_cset by taking it from the
+ * task. As trading it for new_cset is protected by cgroup_mutex,
+ * we're safe to drop it here; it will be freed under RCU.
+ */
+ set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
+ put_css_set_locked(old_cset, false);
+}
+
+/**
+ * cgroup_migrate_finish - cleanup after attach
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
+ */
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+{
+ struct css_set *cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ down_write(&css_set_rwsem);
+ list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_preload_node);
+ put_css_set_locked(cset, false);
}
- memmove(buf, start, buf + buflen - start);
- return 0;
+ up_write(&css_set_rwsem);
}
-/*
- * Return the first subsystem attached to a cgroup's hierarchy, and
- * its subsystem id.
+/**
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
*/
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
+{
+ struct cgroup *src_cgrp;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
+ if (!list_empty(&src_cset->mg_preload_node))
+ return;
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
-static void get_first_subsys(const struct cgroup *cgrp,
- struct cgroup_subsys_state **css, int *subsys_id)
+ src_cset->mg_src_cgrp = src_cgrp;
+ get_css_set(src_cset);
+ list_add(&src_cset->mg_preload_node, preloaded_csets);
+}
+
+/**
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @dst_cgrp: the destination cgroup (may be %NULL)
+ * @preloaded_csets: list of preloaded source css_sets
+ *
+ * Tasks are about to be moved to @dst_cgrp and all the source css_sets
+ * have been preloaded to @preloaded_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @preloaded_csets.
+ */
+static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- const struct cgroupfs_root *root = cgrp->root;
- const struct cgroup_subsys *test_ss;
- BUG_ON(list_empty(&root->subsys_list));
- test_ss = list_entry(root->subsys_list.next,
- struct cgroup_subsys, sibling);
- if (css) {
- *css = cgrp->subsys[test_ss->subsys_id];
- BUG_ON(!*css);
+ LIST_HEAD(csets);
+ struct css_set *src_cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
+
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ struct css_set *dst_cset;
+
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
+ if (!dst_cset)
+ goto err;
+
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
+
+ src_cset->mg_dst_cset = dst_cset;
+
+ if (list_empty(&dst_cset->mg_preload_node))
+ list_add(&dst_cset->mg_preload_node, &csets);
+ else
+ put_css_set(dst_cset, false);
}
- if (subsys_id)
- *subsys_id = test_ss->subsys_id;
+
+ list_splice_tail(&csets, preloaded_csets);
+ return 0;
+err:
+ cgroup_migrate_finish(&csets);
+ return -ENOMEM;
}
/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @cgrp: the destination cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
*
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
+ *
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
*/
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
+ bool threadgroup)
{
- int retval = 0;
- struct cgroup_subsys *ss;
- struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
- struct cgroupfs_root *root = cgrp->root;
- int subsys_id;
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
+
+ /*
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
+ */
+ down_write(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ goto next;
- get_first_subsys(cgrp, NULL, &subsys_id);
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ goto next;
- /* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup(tsk, subsys_id);
- if (cgrp == oldcgrp)
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ goto next;
+
+ /*
+ * cgroup_taskset_first() must always return the leader.
+ * Take care to avoid disturbing the ordering.
+ */
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset.dst_csets);
+ next:
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_write(&css_set_rwsem);
+
+ /* methods shouldn't be called if no task is actually migrating */
+ if (list_empty(&tset.src_csets))
return 0;
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk);
- if (retval)
- return retval;
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
+ failed_css = css;
+ goto out_cancel_attach;
+ }
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
/*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg)
- return -ENOMEM;
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- return -ESRCH;
+ down_write(&css_set_rwsem);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ up_write(&css_set_rwsem);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
+ /*
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
+ */
+ tset.csets = &tset.dst_csets;
+
+ for_each_e_css(css, i, cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, &tset);
+
+ ret = 0;
+ goto out_release_tset;
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk);
+out_cancel_attach:
+ for_each_e_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);
- return 0;
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ up_write(&css_set_rwsem);
+ return ret;
+}
+
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+{
+ LIST_HEAD(preloaded_csets);
+ struct task_struct *task;
+ int ret;
+
+ /* look up all src csets */
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+ &preloaded_csets);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ if (!ret)
+ ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
+ struct cgroup *cgrp;
+ pid_t pid;
int ret;
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
+ return -ENODEV;
+
+retry_find_task:
+ rcu_read_lock();
if (pid) {
- rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
- return -ESRCH;
+ ret = -ESRCH;
+ goto out_unlock_cgroup;
}
-
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
- if (cred->euid &&
- cred->euid != tcred->uid &&
- cred->euid != tcred->suid) {
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
rcu_read_unlock();
- return -EACCES;
+ ret = -EACCES;
+ goto out_unlock_cgroup;
}
- get_task_struct(tsk);
- rcu_read_unlock();
- } else {
+ } else
tsk = current;
- get_task_struct(tsk);
+
+ if (threadgroup)
+ tsk = tsk->group_leader;
+
+ /*
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
+ * trapped in a cpuset, or RT worker may be born in a cgroup
+ * with no rt_runtime allocated. Just say no.
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock_cgroup;
}
- ret = cgroup_attach_task(cgrp, tsk);
- put_task_struct(tsk);
- return ret;
-}
+ get_task_struct(tsk);
+ rcu_read_unlock();
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
-{
- int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
- return ret;
-}
+ threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
- FILE_ROOT,
- FILE_DIR,
- FILE_TASKLIST,
- FILE_NOTIFY_ON_RELEASE,
- FILE_RELEASE_AGENT,
-};
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
+ threadgroup_unlock(tsk);
+
+ put_task_struct(tsk);
+out_unlock_cgroup:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
*/
-bool cgroup_lock_live_group(struct cgroup *cgrp)
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
+ struct cgroup_root *root;
+ int retval = 0;
+
mutex_lock(&cgroup_mutex);
- if (cgroup_is_removed(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
}
- return true;
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup *cgrp;
+
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- if (!cgroup_lock_live_group(cgrp))
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- strcpy(cgrp->root->release_agent_path, buffer);
- cgroup_unlock();
- return 0;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
}
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
seq_putc(seq, '\n');
- cgroup_unlock();
return 0;
}
-/* A buffer size big enough for numbers or short strings */
-#define CGROUP_LOCAL_BUFFER_SIZE 64
-
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- char buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- char *end;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (!nbytes)
- return -EINVAL;
- if (nbytes >= sizeof(buffer))
- return -E2BIG;
- if (copy_from_user(buffer, userbuf, nbytes))
- return -EFAULT;
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ return 0;
+}
- buffer[nbytes] = 0; /* nul-terminate */
- strstrip(buffer);
- if (cft->write_u64) {
- u64 val = simple_strtoull(buffer, &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_u64(cgrp, cft, val);
- } else {
- s64 val = simple_strtoll(buffer, &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_s64(cgrp, cft, val);
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
}
- if (!retval)
- retval = nbytes;
- return retval;
+ if (printed)
+ seq_putc(seq, '\n');
}
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
{
- char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- size_t max_bytes = cft->max_write_len;
- char *buffer = local_buffer;
-
- if (!max_bytes)
- max_bytes = sizeof(local_buffer) - 1;
- if (nbytes >= max_bytes)
- return -E2BIG;
- /* Allocate a dynamic buffer if we need one */
- if (nbytes >= sizeof(local_buffer)) {
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
- }
- if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out;
- }
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- buffer[nbytes] = 0; /* nul-terminate */
- strstrip(buffer);
- retval = cft->write_string(cgrp, cft, buffer);
- if (!retval)
- retval = nbytes;
-out:
- if (buffer != local_buffer)
- kfree(buffer);
- return retval;
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
+ ~cgrp_dfl_root_inhibit_ss_mask);
+ return 0;
}
-static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (cgroup_is_removed(cgrp))
- return -ENODEV;
- if (cft->write)
- return cft->write(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_u64 || cft->write_s64)
- return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_string)
- return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->trigger) {
- int ret = cft->trigger(cgrp, (unsigned int)cft->private);
- return ret ? ret : nbytes;
- }
- return -EINVAL;
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ return 0;
}
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- u64 val = cft->read_u64(cgrp, cft);
- int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ return 0;
}
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- s64 val = cft->read_s64(cgrp, cft);
- int len = sprintf(tmp, "%lld\n", (long long) val);
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
+ }
+ up_read(&css_set_rwsem);
+
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
+ }
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
-static ssize_t cgroup_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ unsigned int enable = 0, disable = 0;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
+
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name) ||
+ ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
- if (cgroup_is_removed(cgrp))
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- if (cft->read)
- return cft->read(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_u64)
- return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_s64)
- return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
- return -EINVAL;
-}
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->child_subsys_mask & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
-struct cgroup_seqfile_state {
- struct cftype *cft;
- struct cgroup *cgroup;
-};
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->child_subsys_mask & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Create csses for enables and update child_subsys_mask. This
+ * changes cgroup_e_css() results which in turn makes the
+ * subsequent cgroup_update_dfl_csses() associate all tasks in the
+ * subtree to the updated csses.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
-static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+ cgroup_for_each_live_child(child, cgrp) {
+ ret = create_css(child, ss);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ cgrp->child_subsys_mask |= enable;
+ cgrp->child_subsys_mask &= ~disable;
+
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /* all tasks are now migrated away from the old csses, kill them */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp)
+ kill_css(cgroup_css(child, ss));
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+
+err_undo_css:
+ cgrp->child_subsys_mask &= ~enable;
+ cgrp->child_subsys_mask |= disable;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+ if (css)
+ kill_css(css);
+ }
+ }
+ goto out_unlock;
+}
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
{
- struct seq_file *sf = cb->state;
- return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
}
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- struct cgroup_seqfile_state *state = m->private;
- struct cftype *cft = state->cft;
- if (cft->read_map) {
- struct cgroup_map_cb cb = {
- .fill = cgroup_map_add,
- .state = m,
- };
- return cft->read_map(state->cgroup, cft, &cb);
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
}
- return cft->read_seq_string(state->cgroup, cft, m);
+
+ return ret ?: nbytes;
}
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- struct seq_file *seq = file->private_data;
- kfree(seq->private);
- return single_release(inode, file);
+ return seq_cft(seq)->seq_start(seq, ppos);
}
-static struct file_operations cgroup_seqfile_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = seq_lseek,
- .release = cgroup_seqfile_release,
-};
-
-static int cgroup_file_open(struct inode *inode, struct file *file)
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- int err;
- struct cftype *cft;
-
- err = generic_file_open(inode, file);
- if (err)
- return err;
- cft = __d_cft(file->f_dentry);
-
- if (cft->read_map || cft->read_seq_string) {
- struct cgroup_seqfile_state *state =
- kzalloc(sizeof(*state), GFP_USER);
- if (!state)
- return -ENOMEM;
- state->cft = cft;
- state->cgroup = __d_cgrp(file->f_dentry->d_parent);
- file->f_op = &cgroup_seqfile_operations;
- err = single_open(file, cgroup_seqfile_show, state);
- if (err < 0)
- kfree(state);
- } else if (cft->open)
- err = cft->open(inode, file);
- else
- err = 0;
+ return seq_cft(seq)->seq_next(seq, v, ppos);
+}
- return err;
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
+{
+ seq_cft(seq)->seq_stop(seq, v);
}
-static int cgroup_file_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- if (cft->release)
- return cft->release(inode, file);
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
+
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
+
+ if (cft->read_u64)
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
return 0;
}
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
+
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
{
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
+ if (kn->parent != new_parent)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ /*
+ * This isn't a proper migration and its usefulness is very
+ * limited. Disallow if sane_behavior.
+ */
+ if (cgroup_sane_behavior(cgrp))
+ return -EPERM;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
}
-static struct file_operations cgroup_file_operations = {
- .read = cgroup_file_read,
- .write = cgroup_file_write,
- .llseek = generic_file_llseek,
- .open = cgroup_file_open,
- .release = cgroup_file_release,
-};
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
-static struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
-};
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
-static int cgroup_create_file(struct dentry *dentry, int mode,
- struct super_block *sb)
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- };
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+ int ret;
- struct inode *inode;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, false, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
- inode = cgroup_new_inode(mode, sb);
- if (!inode)
- return -ENOMEM;
+ if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
+}
- if (S_ISDIR(mode)) {
- inode->i_op = &cgroup_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails. If addition fails, this
+ * function doesn't remove files already added. The caller is responsible
+ * for cleaning up.
+ */
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add)
+{
+ struct cftype *cft;
+ int ret;
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+ lockdep_assert_held(&cgroup_mutex);
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cgroup_file_operations;
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
+ continue;
+
+ if (is_add) {
+ ret = cgroup_add_file(cgrp, cft);
+ if (ret) {
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
+ return ret;
+ }
+ } else {
+ cgroup_rm_file(cgrp, cft);
+ }
}
- dentry->d_op = &cgroup_dops;
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- int mode)
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- struct dentry *parent;
- int error = 0;
+ LIST_HEAD(pending);
+ struct cgroup_subsys *ss = cfts[0].ss;
+ struct cgroup *root = &ss->root->cgrp;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* add/rm files for all cgroups created before */
+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ struct cgroup *cgrp = css->cgroup;
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
+ if (cgroup_is_dead(cgrp))
+ continue;
+
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ if (ret)
+ break;
}
- dput(dentry);
- return error;
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
}
-int cgroup_add_file(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype *cft)
+static void cgroup_exit_cftypes(struct cftype *cfts)
{
- struct dentry *dir = cgrp->dentry;
- struct dentry *dentry;
- int error;
-
- char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
- if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
- strcpy(name, subsys->name);
- strcat(name, ".");
- }
- strcat(name, cft->name);
- BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
- dentry = lookup_one_len(name, dir, strlen(name));
- if (!IS_ERR(dentry)) {
- error = cgroup_create_file(dentry, 0644 | S_IFREG,
- cgrp->root->sb);
- if (!error)
- dentry->d_fsdata = (void *)cft;
- dput(dentry);
- } else
- error = PTR_ERR(dentry);
- return error;
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
+ }
}
-int cgroup_add_files(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype cft[],
- int count)
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
- int i, err;
- for (i = 0; i < count; i++) {
- err = cgroup_add_file(cgrp, subsys, &cft[i]);
- if (err)
- return err;
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
}
+
+ return 0;
+}
+
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
return 0;
}
/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
+ */
+int cgroup_rm_cftypes(struct cftype *cfts)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ int ret;
+
+ if (ss->disabled)
+ return 0;
+
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
+
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
* Return the number of tasks in the cgroup.
*/
-int cgroup_task_count(const struct cgroup *cgrp)
+static int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
- count += atomic_read(&link->cg->refcount);
- }
- read_unlock(&css_set_lock);
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ up_read(&css_set_rwsem);
return count;
}
-/*
- * Advance a list_head iterator. The iterator should be positioned at
- * the start of a css_set
+/**
+ * css_next_child - find the next child of a given css
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
+ *
+ * This function returns the next child of @parent and should be called
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /*
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() inbetween iterations.
+ *
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
+ */
+ if (!pos) {
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
+ } else {
+ list_for_each_entry_rcu(next, &parent->children, sibling)
+ if (next->serial_nr > pos->serial_nr)
+ break;
+ }
+
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
+}
+
+/**
+ * css_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_pre(). Find the next descendant
+ * to visit for pre-order traversal of @root's descendants. @root is
+ * included in the iteration and the first node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
+ }
+
+ return NULL;
+}
+
+/**
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
+ */
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last, *tmp;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
*/
-static void cgroup_advance_iter(struct cgroup *cgrp,
- struct cgroup_iter *it)
+bool css_has_online_children(struct cgroup_subsys_state *css)
{
- struct list_head *l = it->cg_link;
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct cgroup_subsys_state *child;
+ bool ret = false;
+
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * css_advance_task_iter - advance a task itererator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
+ */
+static void css_advance_task_iter(struct css_task_iter *it)
+{
+ struct list_head *l = it->cset_pos;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &cgrp->css_sets) {
- it->cg_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
- cg = link->cg;
- } while (list_empty(&cg->tasks));
- it->cg_link = l;
- it->task = cg->tasks.next;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
-/*
- * To reduce the fork() overhead for systems that are not actually
- * using their cgroups capability, we don't maintain the lists running
- * through each css_set to its tasks until we see the list actually
- * used - in other words after the first call to cgroup_iter_start().
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @it: the task iterator to use
*
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
+ *
+ * Note that this function acquires a lock which is released when the
+ * iteration finishes. The caller can't sleep while iteration is in
+ * progress.
*/
-static void cgroup_enable_task_cg_lists(void)
+void css_task_iter_start(struct cgroup_subsys_state *css,
+ struct css_task_iter *it)
+ __acquires(css_set_rwsem)
{
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- do_each_thread(g, p) {
- task_lock(p);
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- */
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
- list_add(&p->cg_list, &p->cgroups->tasks);
- task_unlock(p);
- } while_each_thread(g, p);
- write_unlock(&css_set_lock);
-}
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
-void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
-{
- /*
- * The first time anyone tries to iterate across a cgroup,
- * we need to enable the list linking each css_set to its
- * tasks, and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ down_read(&css_set_rwsem);
- read_lock(&css_set_lock);
- it->cg_link = &cgrp->css_sets;
- cgroup_advance_iter(cgrp, it);
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
+
+ css_advance_task_iter(it);
}
-struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
- struct cgroup_iter *it)
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cg_cgroup_link *link;
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cg_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
- /* Advance iterator to find next entry */
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
l = l->next;
- link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
- if (l == &link->cg->tasks) {
- /* We reached the end of this task list - move on to
- * the next cg_cgroup_link */
- cgroup_advance_iter(cgrp, it);
- } else {
- it->task = l;
- }
+
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
+
+ if (l == it->mg_tasks_head)
+ css_advance_task_iter(it);
+ else
+ it->task_pos = l;
+
return res;
}
-void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
+ */
+void css_task_iter_end(struct css_task_iter *it)
+ __releases(css_set_rwsem)
{
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
}
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively) simultaneously.
- */
- return t1 > t2;
- }
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* all tasks in @from are being moved, all csets are source */
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ up_read(&css_set_rwsem);
+
+ ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @form is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&preloaded_csets);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
*/
-static inline int started_after(void *p1, void *p2)
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
-/**
- * cgroup_scan_tasks - iterate though all the tasks in a cgroup
- * @scan: struct cgroup_scanner containing arguments for the scan
- *
- * Arguments include pointers to callback functions test_task() and
- * process_task().
- * Iterate through all the tasks in a cgroup, calling test_task() for each,
- * and if it returns true, call process_task() for it also.
- * The test_task pointer may be NULL, meaning always true (select all tasks).
- * Effectively duplicates cgroup_iter_{start,next,end}()
- * but does not lock css_set_lock for the call to process_task().
- * The struct cgroup_scanner may be embedded in any structure of the caller's
- * creation.
- * It is guaranteed that process_task() will act on every task that
- * is a member of the cgroup for the duration of this call. This
- * function may or may not call process_task() for tasks that exit
- * or move to a different cgroup during the call, or are forked or
- * move into the cgroup during the call.
- *
- * Note that test_task() may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should
- * be cheap.
- * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
- * pre-allocated and will be used for heap operations (and its "gt" member will
- * be overwritten), else a temporary heap will be used (allocation of which
- * may cause this function to fail).
- */
-int cgroup_scan_tasks(struct cgroup_scanner *scan)
-{
- int retval, i;
- struct cgroup_iter it;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
- struct ptr_heap tmp_heap;
- struct ptr_heap *heap;
- struct timespec latest_time = { 0, 0 };
-
- if (scan->heap) {
- /* The caller supplied our heap and pre-allocated its memory */
- heap = scan->heap;
- heap->gt = &started_after;
- } else {
- /* We need to allocate our own heap memory */
- heap = &tmp_heap;
- retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- /* cannot allocate the heap */
- return retval;
- }
-
- again:
- /*
- * Scan tasks in the cgroup, using the scanner's "test_task" callback
- * to determine which are of interest, and using the scanner's
- * "process_task" callback to process any of them that need an update.
- * Since we don't want to hold any locks during the task updates,
- * gather tasks to be processed in a heap structure.
- * The heap is sorted by descending task start time.
- * If the statically-sized heap fills up, we overflow tasks that
- * started later, and in future iterations only consider tasks that
- * started after the latest task in the previous pass. This
- * guarantees forward progress and that we don't miss any tasks.
- */
- heap->size = 0;
- cgroup_iter_start(scan->cg, &it);
- while ((p = cgroup_iter_next(scan->cg, &it))) {
- /*
- * Only affect tasks that qualify per the caller's callback,
- * if he provided one
- */
- if (scan->test_task && !scan->test_task(p, scan))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(heap, p);
- if (dropped == NULL) {
- /*
- * The new task was inserted; the heap wasn't
- * previously full
- */
- get_task_struct(p);
- } else if (dropped != p) {
- /*
- * The new task was inserted, and pushed out a
- * different task
- */
- get_task_struct(p);
- put_task_struct(dropped);
- }
- /*
- * Else the new task was newer than anything already in
- * the heap and wasn't inserted
- */
+static void pidlist_free(void *p)
+{
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ kfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+{
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
}
- cgroup_iter_end(scan->cg, &it);
- if (heap->size) {
- for (i = 0; i < heap->size; i++) {
- struct task_struct *q = heap->ptrs[i];
- if (i == 0) {
- latest_time = q->start_time;
- latest_task = q;
- }
- /* Process the task per the caller's callback */
- scan->process_task(q, scan);
- put_task_struct(q);
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+ int src, dest = 1;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
}
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't get processed.
- * Not the most efficient way to do it, but it avoids
- * having to take callback_mutex in the fork path
- */
- goto again;
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
}
- if (heap == &tmp_heap)
- heap_free(&tmp_heap);
- return 0;
+after:
+ return dest;
}
/*
- * Stuff for reading the 'tasks' file.
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
*
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
*
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
+static pid_t pid_fry(pid_t pid)
+{
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+ if (cgroup_sane_behavior(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+static int fried_cmppid(const void *a, const void *b)
+{
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
/*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'. Return actual number of pids loaded. No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
*/
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
{
- int n = 0, pid;
- struct cgroup_iter it;
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
+ /* entry not found; create a new one */
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
+ return l;
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+ l->key.type = type;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
+ struct css_task_iter it;
struct task_struct *tsk;
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
- if (unlikely(n == npids))
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = pidlist_allocate(length);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ if (unlikely(n == length))
break;
- pid = task_pid_vnr(tsk);
- if (pid > 0)
- pidarray[n++] = pid;
+ /* get tgid or pid for procs or tasks file respectively */
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
+ }
+ css_task_iter_end(&it);
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ if (cgroup_sane_behavior(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (type == CGROUP_FILE_PROCS)
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
+ if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
+ pidlist_free(array);
+ return -ENOMEM;
}
- cgroup_iter_end(cgrp, &it);
- return n;
+
+ /* store array, freeing old if necessary */
+ pidlist_free(l->list);
+ l->list = array;
+ l->length = length;
+ *lp = l;
+ return 0;
}
/**
@@ -2051,24 +3851,34 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
- int ret = -EINVAL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
/*
- * Validate dentry by checking the superblock operations,
- * and make sure it's a directory.
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
*/
- if (dentry->d_sb->s_op != &cgroup_ops ||
- !S_ISDIR(dentry->d_inode->i_mode))
- goto err;
-
- ret = 0;
- cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
@@ -2088,25 +3898,20 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
break;
}
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
-err:
- return ret;
-}
-
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
/*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
*/
-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
@@ -2114,46 +3919,76 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
- int *iter;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
- down_read(&cgrp->pids_mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
- *pos = *iter;
+ iter = l->list + index;
+ *pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
- up_read(&cgrp->pids_mutex);
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
- int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
-
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
@@ -2162,473 +3997,724 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
- *pos = *p;
+ *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
-static struct seq_operations cgroup_tasks_seq_operations = {
- .start = cgroup_tasks_start,
- .stop = cgroup_tasks_stop,
- .next = cgroup_tasks_next,
- .show = cgroup_tasks_show,
-};
-
-static void release_cgroup_pid_array(struct cgroup *cgrp)
-{
- down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
- }
- up_write(&cgrp->pids_mutex);
-}
-
-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- release_cgroup_pid_array(cgrp);
- return seq_release(inode, file);
+ return notify_on_release(css->cgroup);
}
-static struct file_operations cgroup_tasks_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .write = cgroup_file_write,
- .release = cgroup_tasks_release,
-};
-
-/*
- * Handle an open on 'tasks' file. Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
- */
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- pid_t *pidarray;
- int npids;
- int retval;
-
- /* Nothing to do for write-only files */
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- npids = cgroup_task_count(cgrp);
- pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
- if (!pidarray)
- return -ENOMEM;
- npids = pid_array_load(pidarray, npids, cgrp);
- sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
- /*
- * Store the array in the cgroup, freeing the old
- * array if necessary
- */
- down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
- up_write(&cgrp->pids_mutex);
-
- file->f_op = &cgroup_tasks_operations;
-
- retval = seq_open(file, &cgroup_tasks_seq_operations);
- if (retval) {
- release_cgroup_pid_array(cgrp);
- return retval;
- }
- ((struct seq_file *)file->private_data)->private = cgrp;
+ clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
-static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft)
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return notify_on_release(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
-static int cgroup_write_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft,
- u64 val)
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
-/*
- * for the common functions, 'private' gives the type of file
- */
-static struct cftype files[] = {
+static struct cftype cgroup_base_files[] = {
{
- .name = "tasks",
- .open = cgroup_tasks_open,
- .write_u64 = cgroup_tasks_write,
- .release = cgroup_tasks_release,
- .private = FILE_TASKLIST,
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .flags = CFTYPE_INSANE,
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_ONLY_ON_DFL,
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
},
+ /*
+ * Historical crazy stuff. These don't have "cgroup." prefix and
+ * don't exist if sane_behavior. If you're depending on these, be
+ * prepared to be burned.
+ */
+ {
+ .name = "tasks",
+ .flags = CFTYPE_INSANE, /* use "procs" instead */
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
{
.name = "notify_on_release",
+ .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
- .private = FILE_NOTIFY_ON_RELEASE,
},
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
};
-static struct cftype cft_release_agent = {
- .name = "release_agent",
- .read_seq_string = cgroup_release_agent_show,
- .write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
- .private = FILE_RELEASE_AGENT,
-};
-
-static int cgroup_populate_dir(struct cgroup *cgrp)
+/**
+ * cgroup_populate_dir - create subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be added
+ *
+ * On failure, no file is added.
+ */
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- int err;
struct cgroup_subsys *ss;
+ int i, ret = 0;
- /* First clear out any existing files */
- cgroup_clear_directory(cgrp->dentry);
+ /* process cftsets of each subsystem */
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
- if (err < 0)
- return err;
+ if (!(subsys_mask & (1 << i)))
+ continue;
- if (cgrp == cgrp->top_cgroup) {
- if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
- return err;
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
+ if (ret < 0)
+ goto err;
+ }
}
+ return 0;
+err:
+ cgroup_clear_dir(cgrp, subsys_mask);
+ return ret;
+}
+
+/*
+ * css destruction is four-stage process.
+ *
+ * 1. Destruction starts. Killing of the percpu_ref is initiated.
+ * Implemented in kill_css().
+ *
+ * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
+ *
+ * 3. When the percpu_ref reaches zero, the only possible remaining
+ * accessors are inside RCU read sections. css_release() schedules the
+ * RCU callback.
+ *
+ * 4. After the grace period, the css can be freed. Implemented in
+ * css_free_work_fn().
+ *
+ * It is actually hairier because both step 2 and 4 require process context
+ * and thus involve punting to css->destroy_work adding two additional
+ * steps to the already complex sequence.
+ */
+static void css_free_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup *cgrp = css->cgroup;
+
+ if (css->ss) {
+ /* css free path */
+ if (css->parent)
+ css_put(css->parent);
- for_each_subsys(cgrp->root, ss) {
- if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
- return err;
+ css->ss->css_free(css);
+ cgroup_put(cgrp);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ cgroup_pidlist_destroy_all(cgrp);
+
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
+}
- return 0;
+static void css_free_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct cgroup_subsys_state *css =
+ container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
+
+ INIT_WORK(&css->destroy_work, css_free_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-static void init_cgroup_css(struct cgroup_subsys_state *css,
- struct cgroup_subsys *ss,
- struct cgroup *cgrp)
+static void css_release_work_fn(struct work_struct *work)
{
- css->cgroup = cgrp;
- atomic_set(&css->refcnt, 1);
- css->flags = 0;
- if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
- BUG_ON(cgrp->subsys[ss->subsys_id]);
- cgrp->subsys[ss->subsys_id] = css;
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ mutex_lock(&cgroup_mutex);
+
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (ss) {
+ /* css release path */
+ cgroup_idr_remove(&ss->css_idr, css->id);
+ } else {
+ /* cgroup release path */
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
+ }
+
+ mutex_unlock(&cgroup_mutex);
+
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
}
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+static void css_release(struct percpu_ref *ref)
{
- /* We need to take each hierarchy_mutex in a consistent order */
- int i;
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
+
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_get(cgrp);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->root == root)
- mutex_lock(&ss->hierarchy_mutex);
+ memset(css, 0, sizeof(*css));
+ css->cgroup = cgrp;
+ css->ss = ss;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
+
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
}
+
+ BUG_ON(cgroup_css(cgrp, ss));
}
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys_state *css)
{
- int i;
+ struct cgroup_subsys *ss = css->ss;
+ int ret = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->root == root)
- mutex_unlock(&ss->hierarchy_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (ss->css_online)
+ ret = ss->css_online(css);
+ if (!ret) {
+ css->flags |= CSS_ONLINE;
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
}
+ return ret;
}
-/*
- * cgroup_create - create a cgroup
- * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ if (ss->css_offline)
+ ss->css_offline(css);
+
+ css->flags &= ~CSS_ONLINE;
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
+}
+
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
*
- * Must be called with the mutex on the parent inode held
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
*/
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- int mode)
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
- struct cgroup *cgrp;
- struct cgroupfs_root *root = parent->root;
- int err = 0;
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
+ struct cgroup_subsys_state *css;
+ int err;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ css = ss->css_alloc(parent_css);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+
+ init_and_link_css(css, ss, cgrp);
+
+ err = percpu_ref_init(&css->refcnt, css_release);
+ if (err)
+ goto err_free_css;
+
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
+
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
+
+ err = online_css(css);
+ if (err)
+ goto err_list_del;
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ cgroup_parent(parent)) {
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
+ ss->warned_broken_hierarchy = true;
+ }
+
+ return 0;
+
+err_list_del:
+ list_del_rcu(&css->sibling);
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
+err_free_percpu_ref:
+ percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+ return err;
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct super_block *sb = root->sb;
+ struct kernfs_node *kn;
+ int ssid, ret;
+ parent = cgroup_kn_lock_live(parent_kn);
+ if (!parent)
+ return -ENODEV;
+ root = parent->root;
+
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp)
- return -ENOMEM;
+ if (!cgrp) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- /* Grab a reference on the superblock so the hierarchy doesn't
- * get deleted on unmount if there are child cgroups. This
- * can be done outside cgroup_mutex, since the sb can't
- * disappear while someone has an open control file on the
- * fs */
- atomic_inc(&sb->s_active);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out_free_cgrp;
- mutex_lock(&cgroup_mutex);
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (cgrp->id < 0) {
+ ret = -ENOMEM;
+ goto out_cancel_ref;
+ }
init_cgroup_housekeeping(cgrp);
- cgrp->parent = parent;
- cgrp->root = parent->root;
- cgrp->top_cgroup = parent->top_cgroup;
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- for_each_subsys(root, ss) {
- struct cgroup_subsys_state *css = ss->create(ss, cgrp);
- if (IS_ERR(css)) {
- err = PTR_ERR(css);
- goto err_destroy;
- }
- init_cgroup_css(css, ss, cgrp);
- }
-
- cgroup_lock_hierarchy(root);
- list_add(&cgrp->sibling, &cgrp->parent->children);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups++;
-
- err = cgroup_create_dir(cgrp, dentry, mode);
- if (err < 0)
- goto err_remove;
-
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- err = cgroup_populate_dir(cgrp);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_id;
+ }
+ cgrp->kn = kn;
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ /*
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
- return 0;
+ cgrp->self.serial_nr = css_serial_nr_next++;
- err_remove:
+ /* allocation complete, commit to creation */
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
- cgroup_lock_hierarchy(root);
- list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups--;
+ /*
+ * @cgrp is now fully operational. If something fails after this
+ * point, it'll be released via the normal destruction path.
+ */
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- err_destroy:
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
- for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
+ ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (ret)
+ goto out_destroy;
+
+ /* let's create and online css's */
+ for_each_subsys(ss, ssid) {
+ if (parent->child_subsys_mask & (1 << ssid)) {
+ ret = create_css(cgrp, ss);
+ if (ret)
+ goto out_destroy;
+ }
}
- mutex_unlock(&cgroup_mutex);
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * child_subsys_mask from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->child_subsys_mask = parent->child_subsys_mask;
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
+ kernfs_activate(kn);
+
+ ret = 0;
+ goto out_unlock;
+out_free_id:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_cancel_ref:
+ percpu_ref_cancel_init(&cgrp->self.refcnt);
+out_free_cgrp:
kfree(cgrp);
- return err;
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
+
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ goto out_unlock;
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+/*
+ * This is called when the refcnt of a css is confirmed to be killed.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initate destruction and put the css ref from kill_css().
+ */
+static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
- /* the vfs holds inode->i_mutex already */
- return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+ mutex_lock(&cgroup_mutex);
+ offline_css(css);
+ mutex_unlock(&cgroup_mutex);
+
+ css_put(css);
}
-static int cgroup_has_css_refs(struct cgroup *cgrp)
+/* css kill confirmation processing requires process context, bounce */
+static void css_killed_ref_fn(struct percpu_ref *ref)
{
- /* Check the reference count on each subsystem. Since we
- * already established that there are no tasks in the
- * cgroup, if the css refcount is also 1, then there should
- * be no outstanding references, so the subsystem is safe to
- * destroy. We scan across all subsystems rather than using
- * the per-hierarchy linked list of mounted subsystems since
- * we can be called via check_for_release() with no
- * synchronization other than RCU, and the subsystem linked
- * list isn't RCU-safe */
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
- /* Skip subsystems not in this hierarchy */
- if (ss->root != cgrp->root)
- continue;
- css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway. */
- if (css && (atomic_read(&css->refcnt) > 1))
- return 1;
- }
- return 0;
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
*/
-
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static void kill_css(struct cgroup_subsys_state *css)
{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
- local_irq_save(flags);
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- int refcnt;
- while (1) {
- /* We can only remove a CSS with a refcnt==1 */
- refcnt = atomic_read(&css->refcnt);
- if (refcnt > 1) {
- failed = true;
- goto done;
- }
- BUG_ON(!refcnt);
- /*
- * Drop the refcnt to 0 while we check other
- * subsystems. This will cause any racing
- * css_tryget() to spin until we set the
- * CSS_REMOVED bits or abort
- */
- if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
- break;
- cpu_relax();
- }
- }
- done:
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- if (failed) {
- /*
- * Restore old refcnt if we previously managed
- * to clear it from 1 to 0
- */
- if (!atomic_read(&css->refcnt))
- atomic_set(&css->refcnt, 1);
- } else {
- /* Commit the fact that the CSS is removed */
- set_bit(CSS_REMOVED, &css->flags);
- }
- }
- local_irq_restore(flags);
- return !failed;
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
+ */
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+
+ /*
+ * Killing would put the base ref, but we need to keep it alive
+ * until after ->css_offline().
+ */
+ css_get(css);
+
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is
+ * invoked, no new css reference will be given out via
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
+ * proceed to offlining css's because percpu_ref_kill() doesn't
+ * guarantee that the ref is seen as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
+ struct cgroup_subsys_state *css;
+ bool empty;
+ int ssid;
- /* the vfs holds both inode->i_mutex already */
+ lockdep_assert_held(&cgroup_mutex);
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
+ /*
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
+ */
+ down_read(&css_set_rwsem);
+ empty = list_empty(&cgrp->cset_links);
+ up_read(&css_set_rwsem);
+ if (!empty)
return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
*/
- cgroup_call_pre_destroy(cgrp);
+ if (css_has_online_children(&cgrp->self))
+ return -EBUSY;
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
+ /*
+ * Mark @cgrp dead. This prevents further task migration and child
+ * creation by disabling cgroup_lock_live_group().
+ */
+ cgrp->self.flags &= ~CSS_ONLINE;
- if (atomic_read(&cgrp->count)
- || !list_empty(&cgrp->children)
- || !cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
+ /* initiate massacre of all css's */
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
- spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
+ /* CSS_ONLINE is clear, remove from ->release_list for the last time */
+ raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
- spin_unlock(&release_list_lock);
-
- cgroup_lock_hierarchy(cgrp->root);
- /* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(cgrp->root);
+ list_del_init(&cgrp->release_list);
+ raw_spin_unlock(&release_list_lock);
- spin_lock(&cgrp->dentry->d_lock);
- d = dget(cgrp->dentry);
- spin_unlock(&d->d_lock);
+ /*
+ * Remove @cgrp directory along with the base files. @cgrp has an
+ * extra ref on its kn.
+ */
+ kernfs_remove(cgrp->kn);
- cgroup_d_remove_dir(d);
- dput(d);
+ set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
+ check_for_release(cgroup_parent(cgrp));
- set_bit(CGRP_RELEASABLE, &parent->flags);
- check_for_release(parent);
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
- mutex_unlock(&cgroup_mutex);
return 0;
+};
+
+static int cgroup_rmdir(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ cgrp = cgroup_kn_lock_live(kn);
+ if (!cgrp)
+ return 0;
+ cgroup_get(cgrp); /* for @kn->priv clearing */
+
+ ret = cgroup_destroy_locked(cgrp);
+
+ cgroup_kn_unlock(kn);
+
+ /*
+ * There are two control paths which try to determine cgroup from
+ * dentry without going through kernfs - cgroupstats_build() and
+ * css_tryget_online_from_dir(). Those are supported by RCU
+ * protecting clearing of cgrp->kn->priv backpointer, which should
+ * happen after all files under it have been removed.
+ */
+ if (!ret)
+ RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
+
+ cgroup_put(cgrp);
+ return ret;
}
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
- /* Create the top cgroup state for this subsystem */
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
- css = ss->create(ss, dummytop);
+ mutex_lock(&cgroup_mutex);
+
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
+
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
- init_cgroup_css(css, ss, dummytop);
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
- * init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -2637,9 +4723,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
- ss->active = 1;
+ BUG_ON(online_css(css));
+
+ mutex_unlock(&cgroup_mutex);
}
/**
@@ -2650,40 +4736,29 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
*/
int __init cgroup_init_early(void)
{
+ static struct cgroup_sb_opts __initdata opts =
+ { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ struct cgroup_subsys *ss;
int i;
- atomic_set(&init_css_set.refcount, 1);
- INIT_LIST_HEAD(&init_css_set.cg_links);
- INIT_LIST_HEAD(&init_css_set.tasks);
- INIT_HLIST_NODE(&init_css_set.hlist);
- css_set_count = 1;
- init_cgroup_root(&rootnode);
- root_count = 1;
- init_task.cgroups = &init_css_set;
-
- init_css_set_link.cg = &init_css_set;
- list_add(&init_css_set_link.cgrp_link_list,
- &rootnode.top_cgroup.css_sets);
- list_add(&init_css_set_link.cg_link_list,
- &init_css_set.cg_links);
-
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- BUG_ON(!ss->name);
- BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
- if (ss->subsys_id != i) {
- printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
- ss->name, ss->subsys_id);
- BUG();
- }
+
+ init_cgroup_root(&cgrp_dfl_root, &opts);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
+
+ RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
+
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
if (ss->early_init)
- cgroup_init_subsys(ss);
+ cgroup_init_subsys(ss, true);
}
return 0;
}
@@ -2696,60 +4771,104 @@ int __init cgroup_init_early(void)
*/
int __init cgroup_init(void)
{
- int err;
- int i;
- struct hlist_head *hhead;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid, err;
- err = bdi_init(&cgroup_backing_dev_info);
- if (err)
- return err;
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (!ss->early_init)
- cgroup_init_subsys(ss);
- }
+ mutex_lock(&cgroup_mutex);
/* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
+
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+
+ mutex_unlock(&cgroup_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
+
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
+
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
+
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (!ss->disabled) {
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ }
+ }
+
+ cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
+ if (!cgroup_kobj)
+ return -ENOMEM;
err = register_filesystem(&cgroup_fs_type);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ kobject_put(cgroup_kobj);
+ return err;
+ }
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+ return 0;
+}
-out:
- if (err)
- bdi_destroy(&cgroup_backing_dev_info);
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
- return err;
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+
+ return 0;
}
+core_initcall(cgroup_wq_init);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- * doesn't really matter if tsk->cgroup changes after we read it,
- * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- * anyway. No need to check that tsk->cgroup != NULL, thanks to
- * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- * cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
-static int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2762,27 +4881,36 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- for_each_active_root(root) {
+ for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int subsys_id;
- int count = 0;
+ int ssid, count = 0;
+
+ if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ continue;
- seq_printf(m, "%lu:", root->subsys_bits);
- for_each_subsys(root, ss)
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ seq_printf(m, "%d:", root->hierarchy_id);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (strlen(root->name))
+ seq_printf(m, "%sname=%s", count ? "," : "",
+ root->name);
seq_putc(m, ':');
- get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
- cgrp = task_cgroup(tsk, subsys_id);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ cgrp = task_cgroup_from_root(tsk, root);
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
out_unlock:
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
@@ -2791,32 +4919,25 @@ out:
return retval;
}
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
+ struct cgroup_subsys *ss;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
mutex_lock(&cgroup_mutex);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- seq_printf(m, "%s\t%lu\t%d\t%d\n",
- ss->name, ss->root->subsys_bits,
- ss->root->number_of_cgroups, !ss->disabled);
- }
+
+ for_each_subsys(ss, i)
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -2826,7 +4947,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
return single_open(file, proc_cgroupstats_show, NULL);
}
-static struct file_operations proc_cgroupstats_operations = {
+static const struct file_operations proc_cgroupstats_operations = {
.open = cgroupstats_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -2834,74 +4955,83 @@ static struct file_operations proc_cgroupstats_operations = {
};
/**
- * cgroup_fork - attach newly forked task to its parents cgroup.
+ * cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
*
- * Description: A task inherits its parent's cgroup at fork().
- *
- * A pointer to the shared css_set was automatically copied in
- * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
- *
- * At the point that cgroup_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
- task_lock(current);
- child->cgroups = current->cgroups;
- get_css_set(child->cgroups);
- task_unlock(current);
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(ss, child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ struct cgroup_subsys *ss;
+ int i;
+
+ /*
+ * This may race against cgroup_enable_task_cg_links(). As that
+ * function sets use_task_css_set_links before grabbing
+ * tasklist_lock and we just went through tasklist_lock to add
+ * @child, it's guaranteed that either we see the set
+ * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+ * @child during its iteration.
+ *
+ * If we won the race, @child is associated with %current's
+ * css_set. Grabbing css_set_rwsem guarantees both that the
+ * association is stable, and, on completion of the parent's
+ * migration, @child is visible in the source of migration or
+ * already in the destination cgroup. This guarantee is necessary
+ * when implementing operations which need to migrate all tasks of
+ * a cgroup to another.
+ *
+ * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * will remain in init_css_set. This is safe because all tasks are
+ * in the init_css_set before cg_links is enabled and there's no
+ * operation which transfers all tasks out of init_css_set.
+ */
if (use_task_css_set_links) {
- write_lock(&css_set_lock);
- task_lock(child);
- if (list_empty(&child->cg_list))
- list_add(&child->cg_list, &child->cgroups->tasks);
- task_unlock(child);
- write_unlock(&css_set_lock);
+ struct css_set *cset;
+
+ down_write(&css_set_rwsem);
+ cset = task_css_set(current);
+ if (list_empty(&child->cg_list)) {
+ rcu_assign_pointer(child->cgroups, cset);
+ list_add(&child->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ up_write(&css_set_rwsem);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for_each_subsys(ss, i)
+ if (ss->fork)
+ ss->fork(child);
}
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -2911,241 +5041,74 @@ void cgroup_post_fork(struct task_struct *child)
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
- * the_top_cgroup_hack:
- *
- * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- * We call cgroup_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to the
- * root cgroup in each hierarchy for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cgroup, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cgroup function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cgroup reference count, to no avail.
- *
- * Normally, holding a reference to a cgroup without bumping its
- * count is unsafe. The cgroup could go away, or someone could
- * attach us to a different cgroup, decrementing the count on
- * the first cgroup that we never incremented. But in this case,
- * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any cgroup_attach_task() attempts, or task is a failed
- * fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ bool put_cset = false;
int i;
- struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
/*
- * Unlink from the css_set task list if necessary.
- * Optimistically check cg_list before taking
- * css_set_lock
+ * Unlink from @tsk from its css_set. As migration path can't race
+ * with us, we can check cg_list without grabbing css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
- write_unlock(&css_set_lock);
+ down_write(&css_set_rwsem);
+ list_del_init(&tsk->cg_list);
+ up_write(&css_set_rwsem);
+ put_cset = true;
}
/* Reassign the task to the init_css_set. */
- task_lock(tsk);
- cg = tsk->cgroups;
- tsk->cgroups = &init_css_set;
- task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
- char *nodename)
-{
- struct dentry *dentry;
- int ret = 0;
- struct cgroup *parent, *child;
- struct inode *inode;
- struct css_set *cg;
- struct cgroupfs_root *root;
- struct cgroup_subsys *ss;
-
- /* We shouldn't be called by an unregistered subsystem */
- BUG_ON(!subsys->active);
-
- /* First figure out what hierarchy and cgroup we're dealing
- * with, and pin them so we can drop cgroup_mutex */
- mutex_lock(&cgroup_mutex);
- again:
- root = subsys->root;
- if (root == &rootnode) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Pin the hierarchy */
- if (!atomic_inc_not_zero(&root->sb->s_active)) {
- /* We race with the final deactivate_super() */
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Keep the cgroup alive */
- task_lock(tsk);
- parent = task_cgroup(tsk, subsys->subsys_id);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
+ cset = task_css_set(tsk);
+ RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- mutex_unlock(&cgroup_mutex);
+ if (need_forkexit_callback) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
+ if (ss->exit) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
- /* Now do the VFS work to create a cgroup */
- inode = parent->dentry->d_inode;
-
- /* Hold the parent directory mutex across this operation to
- * stop anyone else deleting the new cgroup */
- mutex_lock(&inode->i_mutex);
- dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
- if (IS_ERR(dentry)) {
- printk(KERN_INFO
- "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
- PTR_ERR(dentry));
- ret = PTR_ERR(dentry);
- goto out_release;
- }
-
- /* Create the cgroup directory, which also creates the cgroup */
- ret = vfs_mkdir(inode, dentry, 0755);
- child = __d_cgrp(dentry);
- dput(dentry);
- if (ret) {
- printk(KERN_INFO
- "Failed to create cgroup %s: %d\n", nodename,
- ret);
- goto out_release;
+ ss->exit(css, old_css, tsk);
+ }
+ }
}
- /* The cgroup now exists. Retake cgroup_mutex and check
- * that we're still in the same state that we thought we
- * were. */
- mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
- (parent != task_cgroup(tsk, subsys->subsys_id))) {
- /* Aargh, we raced ... */
- mutex_unlock(&inode->i_mutex);
- put_css_set(cg);
-
- deactivate_super(root->sb);
- /* The cgroup is still accessible in the VFS, but
- * we're not going to try to rmdir() it at this
- * point. */
- printk(KERN_INFO
- "Race in cgroup_clone() - leaking cgroup %s\n",
- nodename);
- goto again;
- }
-
- /* do any required auto-setup */
- for_each_subsys(root, ss) {
- if (ss->post_clone)
- ss->post_clone(ss, child);
- }
-
- /* All seems fine. Finish by moving the task into the new cgroup */
- ret = cgroup_attach_task(child, tsk);
- mutex_unlock(&cgroup_mutex);
-
- out_release:
- mutex_unlock(&inode->i_mutex);
-
- mutex_lock(&cgroup_mutex);
- put_css_set(cg);
- mutex_unlock(&cgroup_mutex);
- deactivate_super(root->sb);
- return ret;
-}
-
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
- * @cgrp: the cgroup in question
- *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp)
-{
- int ret;
- struct cgroup *target;
- int subsys_id;
-
- if (cgrp == dummytop)
- return 1;
-
- get_first_subsys(cgrp, NULL, &subsys_id);
- target = task_cgroup(current, subsys_id);
- while (cgrp != target && cgrp!= cgrp->top_cgroup)
- cgrp = cgrp->parent;
- ret = (cgrp == target);
- return ret;
+ if (put_cset)
+ put_css_set(cset, true);
}
static void check_for_release(struct cgroup *cgrp)
{
- /* All of these checks rely on RCU to keep the cgroup
- * structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
- /* Control Group is currently removeable. If it's not
+ if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
+ !css_has_online_children(&cgrp->self)) {
+ /*
+ * Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
- * it now */
+ * it now
+ */
int need_schedule_work = 0;
- spin_lock(&release_list_lock);
- if (!cgroup_is_removed(cgrp) &&
+
+ raw_spin_lock(&release_list_lock);
+ if (!cgroup_is_dead(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
if (need_schedule_work)
schedule_work(&release_agent_work);
}
}
-void __css_put(struct cgroup_subsys_state *css)
-{
- struct cgroup *cgrp = css->cgroup;
- rcu_read_lock();
- if ((atomic_dec_return(&css->refcnt) == 1) &&
- notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- rcu_read_unlock();
-}
-
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
@@ -3173,20 +5136,21 @@ static void cgroup_release_agent(struct work_struct *work)
{
BUG_ON(work != &release_agent_work);
mutex_lock(&cgroup_mutex);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
- spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ raw_spin_unlock(&release_list_lock);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
@@ -3194,7 +5158,7 @@ static void cgroup_release_agent(struct work_struct *work)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
@@ -3212,24 +5176,23 @@ static void cgroup_release_agent(struct work_struct *work)
continue_free:
kfree(pathbuf);
kfree(agentbuf);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
static int __init cgroup_disable(char *str)
{
- int i;
+ struct cgroup_subsys *ss;
char *token;
+ int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
+ for_each_subsys(ss, i) {
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
@@ -3241,3 +5204,201 @@ static int __init cgroup_disable(char *str)
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
+
+/**
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
+ * @dentry: directory dentry of interest
+ * @ss: subsystem of interest
+ *
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
+ */
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
+ struct cgroup *cgrp;
+
+ /* is @dentry a cgroup dir? */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return ERR_PTR(-EBADF);
+
+ rcu_read_lock();
+
+ /*
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See cgroup_rmdir() for details.
+ */
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
+
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+ return css;
+}
+
+/**
+ * css_from_id - lookup css by id
+ * @id: the cgroup id
+ * @ss: cgroup subsys to be looked into
+ *
+ * Returns the css if there's valid one with @id, otherwise returns NULL.
+ * Should be called under rcu_read_lock().
+ */
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
+}
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
+ }
+ up_read(&css_set_rwsem);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .base_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index daca6209202..00000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * kernel/cgroup_debug.c - Example cgroup subsystem that
- * exposes debug info
- *
- * Copyright (C) Google Inc, 2007
- *
- * Developed by Paul Menage (menage@google.com)
- *
- */
-
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-
-#include <asm/atomic.h>
-
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
-{
- return atomic_read(&cont->count);
-}
-
-static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
-{
- u64 count;
-
- cgroup_lock();
- count = cgroup_task_count(cont);
- cgroup_unlock();
- return count;
-}
-
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
-{
- return (u64)(long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup *cont,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(&current->cgroups->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
-{
- return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
-static struct cftype files[] = {
- {
- .name = "cgroup_refcount",
- .read_u64 = cgroup_refcount_read,
- },
- {
- .name = "taskcount",
- .read_u64 = taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-};
-
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
- .populate = debug_populate,
- .subsys_id = debug_subsys_id,
-};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcad..a79e40f9d70 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,119 +14,76 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
-enum freezer_state {
- CGROUP_THAWED = 0,
- CGROUP_FREEZING,
- CGROUP_FROZEN,
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};
struct freezer {
- struct cgroup_subsys_state css;
- enum freezer_state state;
- spinlock_t lock; /* protects _writes_ to state */
+ struct cgroup_subsys_state css;
+ unsigned int state;
};
-static inline struct freezer *cgroup_freezer(
- struct cgroup *cgroup)
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
- return container_of(
- cgroup_subsys_state(cgroup, freezer_subsys_id),
- struct freezer, css);
+ return css ? container_of(css, struct freezer, css) : NULL;
}
static inline struct freezer *task_freezer(struct task_struct *task)
{
- return container_of(task_subsys_state(task, freezer_subsys_id),
- struct freezer, css);
+ return css_freezer(task_css(task, freezer_cgrp_id));
}
-int cgroup_frozen(struct task_struct *task)
+static struct freezer *parent_freezer(struct freezer *freezer)
{
- struct freezer *freezer;
- enum freezer_state state;
+ return css_freezer(freezer->css.parent);
+}
- task_lock(task);
- freezer = task_freezer(task);
- state = freezer->state;
- task_unlock(task);
+bool cgroup_freezing(struct task_struct *task)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
+ rcu_read_unlock();
- return state == CGROUP_FROZEN;
+ return ret;
}
-/*
- * cgroups_write_string() limits the size of freezer state strings to
- * CGROUP_LOCAL_BUFFER_SIZE
- */
-static const char *freezer_state_strs[] = {
- "THAWED",
- "FREEZING",
- "FROZEN",
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
};
-/*
- * State diagram
- * Transitions are caused by userspace writes to the freezer.state file.
- * The values in parenthesis are state labels. The rest are edge labels.
- *
- * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
- * ^ ^ | |
- * | \_______THAWED_______/ |
- * \__________________________THAWED____________/
- */
-
-struct cgroup_subsys freezer_subsys;
-
-/* Locks taken and their ordering
- * ------------------------------
- * css_set_lock
- * cgroup_mutex (AKA cgroup_lock)
- * task->alloc_lock (AKA task_lock)
- * freezer->lock
- * task->sighand->siglock
- *
- * cgroup code forces css_set_lock to be taken before task->alloc_lock
- *
- * freezer_create(), freezer_destroy():
- * cgroup_mutex [ by cgroup core ]
- *
- * can_attach():
- * cgroup_mutex
- *
- * cgroup_frozen():
- * task->alloc_lock (to get task's cgroup)
- *
- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * task->alloc_lock (to get task's cgroup)
- * freezer->lock
- * sighand->siglock (if the cgroup is freezing)
- *
- * freezer_read():
- * cgroup_mutex
- * freezer->lock
- * read_lock css_set_lock (cgroup iterator start)
- *
- * freezer_write() (freeze):
- * cgroup_mutex
- * freezer->lock
- * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock
- *
- * freezer_write() (unfreeze):
- * cgroup_mutex
- * freezer->lock
- * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (to prevent races with freeze_task())
- * sighand->siglock
- */
-static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct freezer *freezer;
@@ -134,246 +91,394 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
- freezer->state = CGROUP_THAWED;
return &freezer->css;
}
-static void freezer_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
{
- kfree(cgroup_freezer(cgroup));
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
+ atomic_dec(&system_freezing_cnt);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
}
-/* Task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
+static void freezer_css_free(struct cgroup_subsys_state *css)
{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
+ kfree(css_freezer(css));
}
/*
- * The call to cgroup_lock() in the freezer.state write method prevents
- * a write to that file racing against an attach, and hence the
- * can_attach() result will remain valid until the attach completes.
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
*/
-static int freezer_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup,
- struct task_struct *task)
+static void freezer_attach(struct cgroup_subsys_state *new_css,
+ struct cgroup_taskset *tset)
{
- struct freezer *freezer;
+ struct freezer *freezer = css_freezer(new_css);
+ struct task_struct *task;
+ bool clear_frozen = false;
+
+ mutex_lock(&freezer_mutex);
/*
- * Anything frozen can't move or be moved to/from.
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
*
- * Since orig_freezer->state == FROZEN means that @task has been
- * frozen, so it's sufficient to check the latter condition.
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
+ cgroup_taskset_for_each(task, tset) {
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = true;
+ }
+ }
- if (is_task_frozen_enough(task))
- return -EBUSY;
-
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state == CGROUP_FROZEN)
- return -EBUSY;
+ /* propagate FROZEN clearing upwards */
+ while (clear_frozen && (freezer = parent_freezer(freezer))) {
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = freezer->state & CGROUP_FREEZING;
+ }
- return 0;
+ mutex_unlock(&freezer_mutex);
}
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
/*
- * No lock is needed, since the task isn't on tasklist yet,
- * so it can't be moved to another cgroup, which means the
- * freezer won't be removed and will be valid during this
- * function call.
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
*/
- freezer = task_freezer(task);
-
- /*
- * The root cgroup is non-freezable, so we can skip the
- * following check.
- */
- if (!freezer->css.cgroup->parent)
+ if (task_css_is_root(task, freezer_cgrp_id))
return;
- spin_lock_irq(&freezer->lock);
- BUG_ON(freezer->state == CGROUP_FROZEN);
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
- /* Locking avoids race with FREEZING -> THAWED transitions. */
- if (freezer->state == CGROUP_FREEZING)
- freeze_task(task, true);
- spin_unlock_irq(&freezer->lock);
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
+ freeze_task(task);
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-/*
- * caller must hold freezer->lock
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
*/
-static void update_freezer_state(struct cgroup *cgroup,
- struct freezer *freezer)
+static void update_if_frozen(struct cgroup_subsys_state *css)
{
- struct cgroup_iter it;
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int nfrozen = 0, ntotal = 0;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- ntotal++;
- if (is_task_frozen_enough(task))
- nfrozen++;
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ /* are all tasks frozen? */
+ css_task_iter_start(css, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
}
- /*
- * Transition to FROZEN when no new tasks can be added ensures
- * that we never exist in the FROZEN state while there are unfrozen
- * tasks.
- */
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- else if (nfrozen > 0)
- freezer->state = CGROUP_FREEZING;
- else
- freezer->state = CGROUP_THAWED;
- cgroup_iter_end(cgroup, &it);
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
}
-static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
+static int freezer_read(struct seq_file *m, void *v)
{
- struct freezer *freezer;
- enum freezer_state state;
-
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
-
- freezer = cgroup_freezer(cgroup);
- spin_lock_irq(&freezer->lock);
- state = freezer->state;
- if (state == CGROUP_FREEZING) {
- /* We change from FREEZING to FROZEN lazily if the cgroup was
- * only partially frozen when we exitted write. */
- update_freezer_state(cgroup, freezer);
- state = freezer->state;
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
}
- spin_unlock_irq(&freezer->lock);
- cgroup_unlock();
- seq_puts(m, freezer_state_strs[state]);
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
return 0;
}
-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int num_cant_freeze_now = 0;
-
- freezer->state = CGROUP_FREEZING;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task, true))
- continue;
- if (is_task_frozen_enough(task))
- continue;
- if (!freezing(task) && !freezer_should_skip(task))
- num_cant_freeze_now++;
- }
- cgroup_iter_end(cgroup, &it);
- return num_cant_freeze_now ? -EBUSY : 0;
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
}
-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- thaw_process(task);
- }
- cgroup_iter_end(cgroup, &it);
-
- freezer->state = CGROUP_THAWED;
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ __thaw_task(task);
+ css_task_iter_end(&it);
}
-static int freezer_change_state(struct cgroup *cgroup,
- enum freezer_state goal_state)
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
{
- struct freezer *freezer;
- int retval = 0;
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
- freezer = cgroup_freezer(cgroup);
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
- spin_lock_irq(&freezer->lock);
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
+ atomic_inc(&system_freezing_cnt);
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
+ }
+}
+
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup_subsys_state *pos;
- update_freezer_state(cgroup, freezer);
- if (goal_state == freezer->state)
- goto out;
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
- switch (goal_state) {
- case CGROUP_THAWED:
- unfreeze_cgroup(cgroup, freezer);
- break;
- case CGROUP_FROZEN:
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- default:
- BUG();
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+
+ rcu_read_lock();
+ css_put(pos);
}
-out:
- spin_unlock_irq(&freezer->lock);
-
- return retval;
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-static int freezer_write(struct cgroup *cgroup,
- struct cftype *cft,
- const char *buffer)
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval;
- enum freezer_state goal_state;
+ bool freeze;
+
+ buf = strstrip(buf);
- if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
- goal_state = CGROUP_THAWED;
- else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
- goal_state = CGROUP_FROZEN;
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
else
return -EINVAL;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
- retval = freezer_change_state(cgroup, goal_state);
- cgroup_unlock();
- return retval;
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
static struct cftype files[] = {
{
.name = "state",
- .read_seq_string = freezer_read,
- .write_string = freezer_write,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
},
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
};
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
- if (!cgroup->parent)
- return 0;
- return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys freezer_subsys = {
- .name = "freezer",
- .create = freezer_create,
- .destroy = freezer_destroy,
- .populate = freezer_populate,
- .subsys_id = freezer_subsys_id,
- .can_attach = freezer_can_attach,
- .attach = NULL,
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
.fork = freezer_fork,
- .exit = NULL,
+ .base_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460..633394f442f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,43 +21,80 @@
#include <linux/unistd.h>
#include <linux/security.h>
#include <linux/timex.h>
+#include <linux/export.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
#include <linux/times.h>
#include <linux/ptrace.h>
+#include <linux/gfp.h>
#include <asm/uaccess.h>
-/*
- * Note that the native side is already converted to a timespec, because
- * that's what we want anyway.
- */
-static int compat_get_timeval(struct timespec *o,
- struct compat_timeval __user *i)
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
- long usec;
+ memset(txc, 0, sizeof(struct timex));
- if (get_user(o->tv_sec, &i->tv_sec) ||
- get_user(usec, &i->tv_usec))
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc->modes, &utp->modes) ||
+ __get_user(txc->offset, &utp->offset) ||
+ __get_user(txc->freq, &utp->freq) ||
+ __get_user(txc->maxerror, &utp->maxerror) ||
+ __get_user(txc->esterror, &utp->esterror) ||
+ __get_user(txc->status, &utp->status) ||
+ __get_user(txc->constant, &utp->constant) ||
+ __get_user(txc->precision, &utp->precision) ||
+ __get_user(txc->tolerance, &utp->tolerance) ||
+ __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc->tick, &utp->tick) ||
+ __get_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc->jitter, &utp->jitter) ||
+ __get_user(txc->shift, &utp->shift) ||
+ __get_user(txc->stabil, &utp->stabil) ||
+ __get_user(txc->jitcnt, &utp->jitcnt) ||
+ __get_user(txc->calcnt, &utp->calcnt) ||
+ __get_user(txc->errcnt, &utp->errcnt) ||
+ __get_user(txc->stbcnt, &utp->stbcnt))
return -EFAULT;
- o->tv_nsec = usec * 1000;
+
return 0;
}
-static int compat_put_timeval(struct compat_timeval __user *o,
- struct timeval *i)
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
{
- return (put_user(i->tv_sec, &o->tv_sec) ||
- put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc->modes, &utp->modes) ||
+ __put_user(txc->offset, &utp->offset) ||
+ __put_user(txc->freq, &utp->freq) ||
+ __put_user(txc->maxerror, &utp->maxerror) ||
+ __put_user(txc->esterror, &utp->esterror) ||
+ __put_user(txc->status, &utp->status) ||
+ __put_user(txc->constant, &utp->constant) ||
+ __put_user(txc->precision, &utp->precision) ||
+ __put_user(txc->tolerance, &utp->tolerance) ||
+ __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc->tick, &utp->tick) ||
+ __put_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc->jitter, &utp->jitter) ||
+ __put_user(txc->shift, &utp->shift) ||
+ __put_user(txc->stabil, &utp->stabil) ||
+ __put_user(txc->jitcnt, &utp->jitcnt) ||
+ __put_user(txc->calcnt, &utp->calcnt) ||
+ __put_user(txc->errcnt, &utp->errcnt) ||
+ __put_user(txc->stbcnt, &utp->stbcnt) ||
+ __put_user(txc->tai, &utp->tai))
+ return -EFAULT;
+ return 0;
}
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
- if (compat_put_timeval(tv, &ktv))
+ if (compat_put_timeval(&ktv, tv))
return -EFAULT;
}
if (tz) {
@@ -68,38 +105,114 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
return 0;
}
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
- struct timespec kts;
- struct timezone ktz;
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
if (tv) {
- if (compat_get_timeval(&kts, tv))
+ if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ __get_user(tv->tv_sec, &ctv->tv_sec) ||
+ __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ __put_user(tv->tv_sec, &ctv->tv_sec) ||
+ __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_get_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_put_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
+int compat_convert_timespec(struct timespec __user **kts,
+ const void __user *cts)
+{
+ struct timespec ts;
+ struct timespec __user *uts;
+
+ if (!cts || COMPAT_USE_64BIT_TIME) {
+ *kts = (struct timespec __user *)cts;
+ return 0;
+ }
+
+ uts = compat_alloc_user_space(sizeof(ts));
+ if (!uts)
+ return -EFAULT;
+ if (compat_get_timespec(&ts, cts))
+ return -EFAULT;
+ if (copy_to_user(uts, &ts, sizeof(ts)))
+ return -EFAULT;
+
+ *kts = uts;
+ return 0;
+}
+
static long compat_nanosleep_restart(struct restart_block *restart)
{
struct compat_timespec __user *rmtp;
@@ -116,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (ret) {
rmtp = restart->nanosleep.compat_rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
return ret;
}
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
mm_segment_t oldfs;
long ret;
- if (get_compat_timespec(&tu, rqtp))
+ if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
if (!timespec_valid(&tu))
@@ -150,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
@@ -177,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -189,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
@@ -215,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
}
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
if (tbuf) {
struct tms tms;
@@ -234,12 +347,14 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
return compat_jiffies_to_clock_t(jiffies);
}
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
*/
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
{
old_sigset_t s;
long ret;
@@ -253,36 +368,66 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
return ret;
}
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ memcpy(blocked->sig, &set, sizeof(set));
+}
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+#endif
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs ();
-
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
__get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -293,23 +438,20 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
r.rlim_cur = RLIM_INFINITY;
if (r.rlim_max == COMPAT_RLIM_INFINITY)
r.rlim_max = RLIM_INFINITY;
- set_fs(KERNEL_DS);
- ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
- set_fs(old_fs);
- return ret;
+ return do_prlimit(current, resource, &r, NULL);
}
#ifdef COMPAT_RLIM_OLD_INFINITY
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, &r);
+ ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
set_fs(old_fs);
if (!ret) {
@@ -328,16 +470,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
#endif
-asmlinkage long compat_sys_getrlimit (unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
- mm_segment_t old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
- set_fs(old_fs);
+ ret = do_prlimit(current, resource, NULL, &r);
if (!ret) {
if (r.rlim_cur > COMPAT_RLIM_INFINITY)
r.rlim_cur = COMPAT_RLIM_INFINITY;
@@ -377,28 +516,11 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
-{
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
-}
-
-asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
- struct compat_rusage __user *ru)
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
@@ -425,9 +547,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
}
}
-asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
- struct compat_siginfo __user *uinfo, int options,
- struct compat_rusage __user *uru)
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
{
siginfo_t info;
struct rusage ru;
@@ -445,9 +568,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
return ret;
if (uru) {
- ret = put_compat_rusage(&ru, uru);
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -469,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
return compat_get_bitmap(k, user_mask_ptr, len * 8);
}
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
- unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+ unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
@@ -489,42 +616,39 @@ out:
return retval;
}
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
- unsigned long *k;
- unsigned int min_length = cpumask_size();
- if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
- min_length = sizeof(compat_ulong_t);
-
- if (len < min_length)
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
- if (ret < 0)
- goto out;
-
- k = cpumask_bits(mask);
- ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
- if (ret == 0)
- ret = min_length;
+ if (ret == 0) {
+ size_t retlen = min_t(size_t, len, cpumask_size());
-out:
+ if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
free_cpumask_var(mask);
+
return ret;
}
int get_compat_itimerspec(struct itimerspec *dst,
const struct compat_itimerspec __user *src)
{
- if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
- get_compat_timespec(&dst->it_value, &src->it_value))
+ if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
+ __compat_get_timespec(&dst->it_value, &src->it_value))
return -EFAULT;
return 0;
}
@@ -532,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst,
int put_compat_itimerspec(struct compat_itimerspec __user *dst,
const struct itimerspec *src)
{
- if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
- put_compat_timespec(&src->it_value, &dst->it_value))
+ if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
+ __compat_put_timespec(&src->it_value, &dst->it_value))
return -EFAULT;
return 0;
}
-long compat_sys_timer_create(clockid_t which_clock,
- struct compat_sigevent __user *timer_event_spec,
- timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+ struct compat_sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
{
struct sigevent __user *event = NULL;
@@ -556,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
return sys_timer_create(which_clock, event, created_timer_id);
}
-long compat_sys_timer_settime(timer_t timer_id, int flags,
- struct compat_itimerspec __user *new,
- struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ struct compat_itimerspec __user *, new,
+ struct compat_itimerspec __user *, old)
{
long err;
mm_segment_t oldfs;
@@ -579,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
return err;
}
-long compat_sys_timer_gettime(timer_t timer_id,
- struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct compat_itimerspec __user *, setting)
{
long err;
mm_segment_t oldfs;
@@ -596,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id,
return err;
}
-long compat_sys_clock_settime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
- if (get_compat_timespec(&ts, tp))
+ if (compat_get_timespec(&ts, tp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
@@ -613,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
return err;
}
-long compat_sys_clock_gettime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -625,13 +749,36 @@ long compat_sys_clock_gettime(clockid_t which_clock,
err = sys_clock_gettime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && put_compat_timespec(&ts, tp))
+ if (!err && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
-long compat_sys_clock_getres(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+ struct compat_timex __user *, utp)
+{
+ struct timex txc;
+ mm_segment_t oldfs;
+ int err, ret;
+
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+ set_fs(oldfs);
+
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -642,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock,
err = sys_clock_getres(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && tp && put_compat_timespec(&ts, tp))
+ if (!err && tp && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
@@ -652,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
long err;
mm_segment_t oldfs;
struct timespec tu;
- struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+ struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
restart->nanosleep.rmtp = (struct timespec __user *) &tu;
oldfs = get_fs();
@@ -661,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&tu, rmtp))
+ compat_put_timespec(&tu, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -671,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
return err;
}
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
- struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+ struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
long err;
mm_segment_t oldfs;
struct timespec in, out;
struct restart_block *restart;
- if (get_compat_timespec(&in, rqtp))
+ if (compat_get_timespec(&in, rqtp))
return -EFAULT;
oldfs = get_fs();
@@ -691,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&out, rmtp))
+ compat_put_timespec(&out, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -805,7 +952,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -814,18 +961,28 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
}
}
+EXPORT_SYMBOL_GPL(sigset_from_compat);
+
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
- int sig;
struct timespec t;
siginfo_t info;
- long ret, timeout = 0;
+ long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
@@ -833,60 +990,27 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&s, &s32);
- sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&s);
if (uts) {
- if (get_compat_timespec (&t, uts))
+ if (compat_get_timespec(&t, uts))
return -EFAULT;
- if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
- || t.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = timespec_to_jiffies(&t)
- +(t.tv_sec || t.tv_nsec);
- if (timeout) {
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &s);
-
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
- }else {
- ret = timeout?-EINTR:-EAGAIN;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
}
- return ret;
+ return ret;
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
{
compat_time_t i;
struct timeval tv;
@@ -902,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
return i;
}
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
struct timespec tv;
int err;
@@ -922,99 +1046,30 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- current->blocked = newset;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
{
struct timex txc;
- int ret;
-
- memset(&txc, 0, sizeof(struct timex));
+ int err, ret;
- if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
- __get_user(txc.modes, &utp->modes) ||
- __get_user(txc.offset, &utp->offset) ||
- __get_user(txc.freq, &utp->freq) ||
- __get_user(txc.maxerror, &utp->maxerror) ||
- __get_user(txc.esterror, &utp->esterror) ||
- __get_user(txc.status, &utp->status) ||
- __get_user(txc.constant, &utp->constant) ||
- __get_user(txc.precision, &utp->precision) ||
- __get_user(txc.tolerance, &utp->tolerance) ||
- __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __get_user(txc.tick, &utp->tick) ||
- __get_user(txc.ppsfreq, &utp->ppsfreq) ||
- __get_user(txc.jitter, &utp->jitter) ||
- __get_user(txc.shift, &utp->shift) ||
- __get_user(txc.stabil, &utp->stabil) ||
- __get_user(txc.jitcnt, &utp->jitcnt) ||
- __get_user(txc.calcnt, &utp->calcnt) ||
- __get_user(txc.errcnt, &utp->errcnt) ||
- __get_user(txc.stbcnt, &utp->stbcnt))
- return -EFAULT;
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
ret = do_adjtimex(&txc);
- if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
- __put_user(txc.modes, &utp->modes) ||
- __put_user(txc.offset, &utp->offset) ||
- __put_user(txc.freq, &utp->freq) ||
- __put_user(txc.maxerror, &utp->maxerror) ||
- __put_user(txc.esterror, &utp->esterror) ||
- __put_user(txc.status, &utp->status) ||
- __put_user(txc.constant, &utp->constant) ||
- __put_user(txc.precision, &utp->precision) ||
- __put_user(txc.tolerance, &utp->tolerance) ||
- __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __put_user(txc.tick, &utp->tick) ||
- __put_user(txc.ppsfreq, &utp->ppsfreq) ||
- __put_user(txc.jitter, &utp->jitter) ||
- __put_user(txc.shift, &utp->shift) ||
- __put_user(txc.stabil, &utp->stabil) ||
- __put_user(txc.jitcnt, &utp->jitcnt) ||
- __put_user(txc.calcnt, &utp->calcnt) ||
- __put_user(txc.errcnt, &utp->errcnt) ||
- __put_user(txc.stbcnt, &utp->stbcnt) ||
- __put_user(txc.tai, &utp->tai))
- ret = -EFAULT;
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
return ret;
}
#ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
- compat_uptr_t __user *pages32,
- const int __user *nodes,
- int __user *status,
- int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
{
const void __user * __user *pages;
int i;
@@ -1030,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
- compat_ulong_t maxnode,
- const compat_ulong_t __user *old_nodes,
- const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
@@ -1064,67 +1119,39 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
{
- struct sysinfo s;
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs();
- do_sysinfo(&s);
+ set_fs(KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
+ set_fs(old_fs);
+ if (compat_put_timespec(&t, interval))
+ return -EFAULT;
+ return ret;
+}
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
+/*
+ * Allocate user-space memory for the duration of a single system call,
+ * in order to marshall parameters inside a compat thunk.
+ */
+void __user *compat_alloc_user_space(unsigned long len)
+{
+ void __user *ptr;
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
+ /* If len would occupy more than half of the entire compat space... */
+ if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
+ return NULL;
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
+ ptr = arch_compat_alloc_user_space(len);
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
- return -EFAULT;
+ if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
+ return NULL;
- return 0;
+ return ptr;
}
+EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecb..c18b1f1ae51 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
static const struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
+ .llseek = default_llseek,
};
static int __init ikconfig_init(void)
@@ -78,7 +79,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- entry->size = kernel_config_data_size;
+ proc_set_size(entry, kernel_config_data_size);
return 0;
}
@@ -91,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
module_init(ikconfig_init);
module_exit(ikconfig_cleanup);
+#endif /* CONFIG_IKCONFIG_PROC */
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Randy Dunlap");
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
-
-#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 00000000000..5664985c46a
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,216 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+}
+
+/**
+ * context_tracking_user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
+void context_tracking_user_enter(void)
+{
+ unsigned long flags;
+
+ /*
+ * Repeat the user_enter() check here because some archs may be calling
+ * this from asm and if no CPU needs context tracking, they shouldn't
+ * go further. Repeat the check here until they support the inline static
+ * key check.
+ */
+ if (!context_tracking_is_enabled())
+ return;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ /* Kernel threads aren't supposed to go to userspace */
+ WARN_ON_ONCE(!current->mm);
+
+ local_irq_save(flags);
+ if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ trace_user_enter(0);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
+ rcu_user_enter();
+ }
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ __this_cpu_write(context_tracking.state, IN_USER);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
+
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ /*
+ * Need to disable preemption in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ preempt_disable_notrace();
+ prev_ctx = exception_enter();
+ preempt_enable_no_resched_notrace();
+
+ preempt_schedule();
+
+ preempt_disable_notrace();
+ exception_exit(prev_ctx);
+ preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
+
+/**
+ * context_tracking_user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially include
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
+void context_tracking_user_exit(void)
+{
+ unsigned long flags;
+
+ if (!context_tracking_is_enabled())
+ return;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
+ rcu_user_exit();
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
+
+/**
+ * __context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
+void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ context_tracking_cpu_set(cpu);
+}
+#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb..a343bde710b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,22 +10,52 @@
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <trace/events/power.h>
+
+#include "smpboot.h"
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+/*
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+EXPORT_SYMBOL(cpu_notifier_register_done);
+
+static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
+#ifdef CONFIG_HOTPLUG_CPU
+
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -34,22 +64,30 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
-} cpu_hotplug;
-void __init cpu_hotplug_init(void)
-{
- cpu_hotplug.active_writer = NULL;
- mutex_init(&cpu_hotplug.lock);
- cpu_hotplug.refcount = 0;
-}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} cpu_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+ .refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
+};
-#ifdef CONFIG_HOTPLUG_CPU
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
+ cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
@@ -62,29 +100,18 @@ void put_online_cpus(void)
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
+
+ if (WARN_ON(!cpu_hotplug.refcount))
+ cpu_hotplug.refcount++; /* try to fix things up */
+
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
-#endif /* CONFIG_HOTPLUG_CPU */
-
-/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
- */
-void cpu_maps_update_begin(void)
-{
- mutex_lock(&cpu_add_remove_lock);
-}
-
-void cpu_maps_update_done(void)
-{
- mutex_unlock(&cpu_add_remove_lock);
-}
-
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
@@ -107,10 +134,11 @@ void cpu_maps_update_done(void)
* get_online_cpus() not an api which is called all that often.
*
*/
-static void cpu_hotplug_begin(void)
+void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
@@ -121,11 +149,36 @@ static void cpu_hotplug_begin(void)
}
}
-static void cpu_hotplug_done(void)
+void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
+}
+
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
@@ -136,9 +189,35 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+ int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+ nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_notify(unsigned long val, void *v)
+{
+ return __cpu_notify(val, v, -1, NULL);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
@@ -148,19 +227,66 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+ struct task_struct *p;
+
+ /*
+ * This function is called after the cpu is taken down and marked
+ * offline, so its not like new tasks will ever get this cpu set in
+ * their mm mask. -- Peter Zijlstra
+ * Thus, we may use rcu_read_lock() here, instead of grabbing
+ * full-fledged tasklist_lock.
+ */
+ WARN_ON(cpu_online(cpu));
+ rcu_read_lock();
+ for_each_process(p) {
+ struct task_struct *t;
+
+ /*
+ * Main thread might exit, but other threads may still have
+ * a valid mm. Find one.
+ */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+}
+
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
- if (task_cpu(p) == cpu &&
- (!cputime_eq(p->utime, cputime_zero) ||
- !cputime_eq(p->stime, cputime_zero)))
- printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
- (state = %ld, flags = %x) \n",
- p->comm, task_pid_nr(p), cpu,
- p->state, p->flags);
+ task_cputime(p, &utime, &stime);
+ if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
+ (utime || stime))
+ pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
+ p->comm, task_pid_nr(p), cpu,
+ p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
}
@@ -181,12 +307,9 @@ static int __ref take_cpu_down(void *_param)
if (err < 0)
return err;
- raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
- param->hcpu);
-
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
+ cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
@@ -194,7 +317,6 @@ static int __ref take_cpu_down(void *_param)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
- cpumask_var_t old_allowed;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
@@ -208,62 +330,69 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
- return -ENOMEM;
-
cpu_hotplug_begin();
- err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
- hcpu, -1, &nr_calls);
- if (err == NOTIFY_BAD) {
+
+ err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (err) {
nr_calls--;
- __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu, nr_calls, NULL);
- printk("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
- err = -EINVAL;
+ __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
goto out_release;
}
- /* Ensure that we are not runnable on dying cpu */
- cpumask_copy(old_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so explicitly call both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+#ifdef CONFIG_PREEMPT
+ synchronize_sched();
+#endif
+ synchronize_rcu();
+
+ smpboot_park_threads(cpu);
+
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
-
- goto out_allowed;
+ smpboot_unpark_threads(cpu);
+ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+ goto out_release;
}
BUG_ON(cpu_online(cpu));
- /* Wait for it to sleep (leaving idle task). */
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
while (!idle_cpu(cpu))
- yield();
+ cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
+ cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
-out_allowed:
- set_cpus_allowed_ptr(current, old_allowed);
out_release:
cpu_hotplug_done();
- if (!err) {
- if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
- }
- free_cpumask_var(old_allowed);
+ if (!err)
+ cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
}
@@ -271,9 +400,6 @@ int __ref cpu_down(unsigned int cpu)
{
int err;
- err = stop_machine_create();
- if (err)
- return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -281,85 +407,86 @@ int __ref cpu_down(unsigned int cpu)
goto out;
}
- cpu_clear(cpu, cpu_active_map);
-
- /*
- * Make sure the all cpus did the reschedule and are not
- * using stale version of the cpu_active_mask.
- * This is not strictly necessary becuase stop_machine()
- * that we run down the line already provides the required
- * synchronization. But it's really a side effect and we do not
- * want to depend on the innards of the stop_machine here.
- */
- synchronize_sched();
-
err = _cpu_down(cpu, 0);
- if (cpu_online(cpu))
- cpu_set(cpu, cpu_active_map);
-
out:
cpu_maps_update_done();
- stop_machine_destroy();
return err;
}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-
- if (cpu_online(cpu) || !cpu_present(cpu))
- return -EINVAL;
+ struct task_struct *idle;
cpu_hotplug_begin();
- ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
- -1, &nr_calls);
- if (ret == NOTIFY_BAD) {
- nr_calls--;
- printk("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+
+ if (cpu_online(cpu) || !cpu_present(cpu)) {
ret = -EINVAL;
+ goto out;
+ }
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
+ ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (ret) {
+ nr_calls--;
+ pr_warn("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
/* Arch-specific enabling code. */
- ret = __cpu_up(cpu);
+ ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- cpu_set(cpu, cpu_active_map);
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
+ cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- __raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
cpu_hotplug_done();
return ret;
}
-int __cpuinit cpu_up(unsigned int cpu)
+int cpu_up(unsigned int cpu)
{
int err = 0;
+
if (!cpu_possible(cpu)) {
- printk(KERN_ERR "can't online cpu %d because it is not "
- "configured as may-hotadd at boot time\n", cpu);
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
- printk(KERN_ERR "please check additional_cpus= boot "
- "parameter\n");
+ pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+ cpu);
+#if defined(CONFIG_IA64)
+ pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
+ err = try_online_node(cpu_to_node(cpu));
+ if (err)
+ return err;
+
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -373,49 +500,57 @@ out:
cpu_maps_update_done();
return err;
}
+EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
int disable_nonboot_cpus(void)
{
- int cpu, first_cpu, error;
+ int cpu, first_cpu, error = 0;
- error = stop_machine_create();
- if (error)
- return error;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
- /* We take down all of the non-boot CPUs in one shot to avoid races
+ /*
+ * We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
- printk("Disabling non-boot CPUs ...\n");
+
+ pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
- if (!error) {
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
+ if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
- printk("CPU%d is down\n", cpu);
- } else {
- printk(KERN_ERR "Error taking CPU%d down: %d\n",
- cpu, error);
+ else {
+ pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
+
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
- printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+ pr_err("Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
- stop_machine_destroy();
return error;
}
+void __weak arch_enable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_enable_nonboot_cpus_end(void)
+{
+}
+
void __ref enable_nonboot_cpus(void)
{
int cpu, error;
@@ -426,27 +561,83 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk("Enabling non-boot CPUs ...\n");
+ pr_info("Enabling non-boot CPUs ...\n");
+
+ arch_enable_nonboot_cpus_begin();
+
for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
- printk("CPU%d is up\n", cpu);
+ pr_info("CPU%d is up\n", cpu);
continue;
}
- printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+ pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
+
+ arch_enable_nonboot_cpus_end();
+
cpumask_clear(frozen_cpus);
out:
cpu_maps_update_done();
}
-static int alloc_frozen_cpus(void)
+static int __init alloc_frozen_cpus(void)
{
if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
return -ENOMEM;
return 0;
}
core_initcall(alloc_frozen_cpus);
+
+/*
+ * When callbacks for CPU hotplug notifications are being executed, we must
+ * ensure that the state of the system with respect to the tasks being frozen
+ * or not, as reported by the notification, remains unchanged *throughout the
+ * duration* of the execution of the callbacks.
+ * Hence we need to prevent the freezer from racing with regular CPU hotplug.
+ *
+ * This synchronization is implemented by mutually excluding regular CPU
+ * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
+ * Hibernate notifications.
+ */
+static int
+cpu_hotplug_pm_callback(struct notifier_block *nb,
+ unsigned long action, void *ptr)
+{
+ switch (action) {
+
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ cpu_hotplug_disable();
+ break;
+
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ cpu_hotplug_enable();
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+
+static int __init cpu_hotplug_pm_sync_init(void)
+{
+ /*
+ * cpu_hotplug_pm_callback has higher priority than x86
+ * bsp_pm_callback which depends on cpu_hotplug_pm_callback
+ * to disable cpu hotplug to avoid cpu hotplug race.
+ */
+ pm_notifier(cpu_hotplug_pm_callback, 0);
+ return 0;
+}
+core_initcall(cpu_hotplug_pm_sync_init);
+
#endif /* CONFIG_PM_SLEEP_SMP */
/**
@@ -457,7 +648,7 @@ core_initcall(alloc_frozen_cpus);
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-void __cpuinit notify_cpu_starting(unsigned int cpu)
+void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
@@ -465,7 +656,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
- raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+ cpu_notify(val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
@@ -479,7 +670,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
@@ -537,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 00000000000..9656a3c3650
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author:
+ * Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu_pm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/syscore_ops.h>
+
+static DEFINE_RWLOCK(cpu_pm_notifier_lock);
+static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+
+static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ nr_to_call, nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+/**
+ * cpu_pm_register_notifier - register a driver with cpu_pm
+ * @nb: notifier block to register
+ *
+ * Add a driver to a list of drivers that are notified about
+ * CPU and CPU cluster low power entry and exit.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_register.
+ */
+int cpu_pm_register_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
+
+/**
+ * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
+ * @nb: notifier block to be unregistered
+ *
+ * Remove a driver from the CPU PM notifier list.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_unregister.
+ */
+int cpu_pm_unregister_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
+
+/**
+ * cpu_pm_enter - CPU low power entry notifier
+ *
+ * Notifies listeners that a single CPU is entering a low power state that may
+ * cause some blocks in the same power domain as the cpu to reset.
+ *
+ * Must be called on the affected CPU with interrupts disabled. Platform is
+ * responsible for ensuring that cpu_pm_enter is not called twice on the same
+ * CPU before cpu_pm_exit is called. Notified drivers can include VFP
+ * co-processor, interrupt controller and its PM extensions, local CPU
+ * timers context save/restore which shouldn't be interrupted. Hence it
+ * must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_enter(void)
+{
+ int nr_calls;
+ int ret = 0;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
+ if (ret)
+ /*
+ * Inform listeners (nr_calls - 1) about failure of CPU PM
+ * PM entry who are notified earlier to prepare for it.
+ */
+ cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_enter);
+
+/**
+ * cpu_pm_exit - CPU low power exit notifier
+ *
+ * Notifies listeners that a single CPU is exiting a low power state that may
+ * have caused some blocks in the same power domain as the cpu to reset.
+ *
+ * Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_exit(void)
+{
+ int ret;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_exit);
+
+/**
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
+ *
+ * Notifies listeners that all cpus in a power domain are entering a low power
+ * state that may cause some blocks in the same power domain to reset.
+ *
+ * Must be called after cpu_pm_enter has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_enter(void)
+{
+ int nr_calls;
+ int ret = 0;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
+ if (ret)
+ /*
+ * Inform listeners (nr_calls - 1) about failure of CPU cluster
+ * PM entry who are notified earlier to prepare for it.
+ */
+ cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
+
+/**
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
+ *
+ * Notifies listeners that all cpus in a power domain are exiting form a
+ * low power state that may have caused some blocks in the same power domain
+ * to reset.
+ *
+ * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_exit(void)
+{
+ int ret;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
+
+#ifdef CONFIG_PM
+static int cpu_pm_suspend(void)
+{
+ int ret;
+
+ ret = cpu_pm_enter();
+ if (ret)
+ return ret;
+
+ ret = cpu_cluster_pm_enter();
+ return ret;
+}
+
+static void cpu_pm_resume(void)
+{
+ cpu_cluster_pm_exit();
+ cpu_pm_exit();
+}
+
+static struct syscore_ops cpu_pm_syscore_ops = {
+ .suspend = cpu_pm_suspend,
+ .resume = cpu_pm_resume,
+};
+
+static int cpu_pm_init(void)
+{
+ register_syscore_ops(&cpu_pm_syscore_ops);
+ return 0;
+}
+core_initcall(cpu_pm_init);
+#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa0..116a4164720 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
@@ -55,29 +55,13 @@
#include <linux/sort.h>
#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/wait.h>
-/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
-
-/* Forward declare cgroup structures */
-struct cgroup_subsys cpuset_subsys;
-struct cpuset;
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -95,46 +79,65 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
/*
- * Copy of global cpuset_mems_generation as of the most
- * recent time this cpuset changed its mems_allowed.
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
*/
- int mems_generation;
+ nodemask_t old_mems_allowed;
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
-
- /* used for walking a cpuset heirarchy */
- struct list_head stack_list;
};
-/* Retrieve the cpuset for a cgroup */
-static inline struct cpuset *cgroup_cs(struct cgroup *cont)
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
- struct cpuset, css);
+ return css ? container_of(css, struct cpuset, css) : NULL;
}
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
- return container_of(task_subsys_state(task, cpuset_subsys_id),
- struct cpuset, css);
+ return css_cs(task_css(task, cpuset_cgrp_id));
}
-struct cpuset_hotplug_scanner {
- struct cgroup_scanner scan;
- struct cgroup *to;
-};
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -145,6 +148,11 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -180,49 +188,54 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,8 +245,9 @@ static struct cpuset top_cpuset = {
* If a task is only holding callback_mutex, then it has read-only
* access to cpusets.
*
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -243,36 +257,33 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers. They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
+ * CPU / memory hotplug is handled asynchronously.
*/
-#define CPUSET_NAME_LEN (128)
-#define CPUSET_NODELIST_LEN (256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static int cpuset_get_sb(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name, void *data)
{
struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- int ret = -ENODEV;
+ struct dentry *ret = ERR_PTR(-ENODEV);
if (cgroup_fs) {
char mountopts[] =
"cpuset,noprefix,"
"release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->get_sb(cgroup_fs, flags,
- unused_dev_name, mountopts, mnt);
+ ret = cgroup_fs->mount(cgroup_fs, flags,
+ unused_dev_name, mountopts);
put_filesystem(cgroup_fs);
}
return ret;
@@ -280,130 +291,61 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
- .get_sb = cpuset_get_sb,
+ .mount = cpuset_mount,
};
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. If we get
- * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_cpus(const struct cpuset *cs,
- struct cpumask *pmask)
+static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
- if (cs)
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
- else
- cpumask_copy(pmask, cpu_online_mask);
- BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
+ while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ cs = parent_cs(cs);
+ cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
}
/*
* Return in *pmask the portion of a cpusets's mems_allowed that
* are online, with memory. If none are online with memory, walk
* up the cpuset hierarchy until we find one that does have some
- * online mems. If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_HIGH_MEMORY].
+ * online mems. The top cpuset always has some mems online.
*
* One way or another, we guarantee to return some non-empty subset
- * of node_states[N_HIGH_MEMORY].
+ * of node_states[N_MEMORY].
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (cs && !nodes_intersects(cs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- cs = cs->parent;
- if (cs)
- nodes_and(*pmask, cs->mems_allowed,
- node_states[N_HIGH_MEMORY]);
- else
- *pmask = node_states[N_HIGH_MEMORY];
- BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+ while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
}
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Called with callback_mutex/cpuset_mutex held
*/
-
-void cpuset_update_task_memory_state(void)
+static void cpuset_update_task_spread_flag(struct cpuset *cs,
+ struct task_struct *tsk)
{
- int my_cpusets_mem_gen;
- struct task_struct *tsk = current;
- struct cpuset *cs;
-
- rcu_read_lock();
- my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
- rcu_read_unlock();
-
- if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
- mutex_lock(&callback_mutex);
- task_lock(tsk);
- cs = task_cs(tsk); /* Maybe changed when task not locked */
- guarantee_online_mems(cs, &tsk->mems_allowed);
- tsk->cpuset_mems_generation = cs->mems_generation;
- if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
- else
- tsk->flags &= ~PF_SPREAD_PAGE;
- if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
- else
- tsk->flags &= ~PF_SPREAD_SLAB;
- task_unlock(tsk);
- mutex_unlock(&callback_mutex);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- }
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
}
/*
@@ -411,7 +353,7 @@ void cpuset_update_task_memory_state(void)
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -426,7 +368,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
-static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
+static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
@@ -460,7 +402,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -473,54 +415,69 @@ static void free_trial_cpuset(struct cpuset *trial)
* Return 0 if valid, -errno if not.
*/
-static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
- struct cgroup *cont;
+ struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
}
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
+#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
@@ -538,31 +495,27 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs)
+ continue;
- if (cpumask_empty(cp->cpus_allowed))
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -571,7 +524,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* This function builds a partial partition of the systems CPUs
* A 'partial partition' is a set of non-overlapping subsets whose
* union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched.c
+ * The output of this function needs to be passed to kernel/sched/core.c
* partition_sched_domains() routine, which will rebuild the scheduler's
* load balancing domains (sched domains) as specified by that partial
* partition.
@@ -584,7 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -600,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* is a subset of one of these domains, while there are as
* many such domains as possible, each as small as possible.
* doms - Conversion of 'csa' to an array of cpumasks, for passing to
- * the kernel/sched.c routine partition_sched_domains() in a
+ * the kernel/sched/core.c routine partition_sched_domains() in a
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
@@ -619,19 +572,18 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* element of the partition (one sched domain) to be passed to
* partition_sched_domains().
*/
-/* FIXME: see the FIXME in partition_sched_domains() */
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
- struct cpumask *doms; /* resulting partition; i.e. sched domains */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
doms = NULL;
dattr = NULL;
@@ -639,7 +591,8 @@ static int generate_sched_domains(struct cpumask **domains,
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
- doms = kmalloc(cpumask_size(), GFP_KERNEL);
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -648,44 +601,39 @@ static int generate_sched_domains(struct cpumask **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms, top_cpuset.cpus_allowed);
+ cpumask_copy(doms[0], top_cpuset.cpus_allowed);
- ndoms = 1;
goto done;
}
- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
continue;
-
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -718,7 +666,7 @@ restart:
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
- doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+ doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -738,16 +686,13 @@ restart:
continue;
}
- dp = doms + nslot;
+ dp = doms[nslot];
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
+ pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
+ nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
@@ -790,143 +735,163 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
- struct cpumask *doms;
+ cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
+ /*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+ if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ goto out;
+
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
+out:
put_online_cpus();
}
-
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
+#else /* !CONFIG_SMP */
+static void rebuild_sched_domains_locked(void)
{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
+#endif /* CONFIG_SMP */
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
void rebuild_sched_domains(void)
{
- do_rebuild_sched_domains(NULL);
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
-/**
- * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Called for each task in a cgroup by cgroup_scan_tasks().
- * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
- * words, if its mask is not equal to its cpuset's mask).
+/*
+ * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
+ * @cs: the cpuset in interest
+ *
+ * A cpuset's effective cpumask is the cpumask of the nearest ancestor
+ * with non-empty cpus. We use effective cpumask whenever:
+ * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
+ * if the cpuset they reside in has no cpus)
+ * - we want to retrieve task_cs(tsk)'s cpus_allowed.
+ *
+ * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
+ * exception. See comments there.
*/
-static int cpuset_test_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
{
- return !cpumask_equal(&tsk->cpus_allowed,
- (cgroup_cs(scan->cg))->cpus_allowed);
+ while (cpumask_empty(cs->cpus_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
-/**
- * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner containing the cgroup of the task
+/*
+ * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
+ * @cs: the cpuset in interest
*
- * Called by cgroup_scan_tasks() for each task in a cgroup whose
- * cpus_allowed mask needs to be changed.
+ * A cpuset's effective nodemask is the nodemask of the nearest ancestor
+ * with non-empty memss. We use effective nodemask whenever:
+ * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
+ * if the cpuset they reside in has no mems)
+ * - we want to retrieve task_cs(tsk)'s mems_allowed.
*
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * Called with cpuset_mutex held.
*/
-static void cpuset_change_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
{
- set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+ while (nodes_empty(cs->mems_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
+ */
+static void update_tasks_cpumask(struct cpuset *cs)
+{
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ css_task_iter_end(&it);
+}
+
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
*
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Called with cpuset_mutex held
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
{
- struct cgroup_scanner scan;
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
- scan.cg = cs->css.cgroup;
- scan.test_task = cpuset_test_cpumask;
- scan.process_task = cpuset_change_cpumask;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ update_tasks_cpumask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
}
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
+ * @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- struct ptr_heap heap;
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -943,19 +908,16 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
return -EINVAL;
}
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
return retval;
is_load_balanced = is_sched_load_balance(trialcs);
@@ -964,16 +926,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- /*
- * Scan tasks in the cpuset, and update the cpumasks of any
- * that need an update.
- */
- update_tasks_cpumask(cs, &heap);
-
- heap_free(&heap);
+ update_tasks_cpumask_hier(cs, true);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -985,45 +941,78 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
- * Hold callback_mutex around the two modifications of our tasks
- * mems_allowed to synchronize with cpuset_mems_allowed().
- *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
* migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
*/
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
+ struct cpuset *mems_cs;
- cpuset_update_task_memory_state();
-
- mutex_lock(&callback_mutex);
tsk->mems_allowed = *to;
- mutex_unlock(&callback_mutex);
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- mutex_lock(&callback_mutex);
- guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
- mutex_unlock(&callback_mutex);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ rcu_read_unlock();
+}
+
+/*
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+{
+ bool need_loop;
+
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return;
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return;
+
+ task_lock(tsk);
+ /*
+ * Determine if a loop is necessary if another thread is doing
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
+ */
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);
+
+ if (need_loop) {
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
+ }
+
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ tsk->mems_allowed = *newmems;
+
+ if (need_loop) {
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
+
+ task_unlock(tsk);
}
static void *cpuset_being_rebound;
@@ -1031,107 +1020,110 @@ static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @oldmem: old mems_allowed of cpuset cs
*
- * Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
-{
- struct task_struct *p;
- struct mm_struct **mmarray;
- int i, n, ntasks;
- int migrate;
- int fudge;
- struct cgroup_iter it;
- int retval;
+static void update_tasks_nodemask(struct cpuset *cs)
+{
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- fudge = 10; /* spare mmarray[] slots */
- fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
- retval = -ENOMEM;
+ guarantee_online_mems(mems_cs, &newmems);
/*
- * Allocate mmarray[] to hold mm reference for each task
- * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
- * tasklist_lock. We could use GFP_ATOMIC, but with a
- * few more lines of code, we can retry until we get a big
- * enough mmarray[] w/o using GFP_ATOMIC.
+ * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * take while holding tasklist_lock. Forks can happen - the
+ * mpol_dup() cpuset_being_rebound check will catch such forks,
+ * and rebind their vma mempolicies too. Because we still hold
+ * the global cpuset_mutex, we know that no other rebind effort
+ * will be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
*/
- while (1) {
- ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
- ntasks += fudge;
- mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
- if (!mmarray)
- goto done;
- read_lock(&tasklist_lock); /* block fork */
- if (cgroup_task_count(cs->css.cgroup) <= ntasks)
- break; /* got enough */
- read_unlock(&tasklist_lock); /* try again */
- kfree(mmarray);
- }
-
- n = 0;
-
- /* Load up mmarray[] with mm reference for each task in cpuset. */
- cgroup_iter_start(cs->css.cgroup, &it);
- while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
+ bool migrate;
- if (n >= ntasks) {
- printk(KERN_WARNING
- "Cpuset mempolicy rebind incomplete.\n");
- break;
- }
- mm = get_task_mm(p);
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
if (!mm)
continue;
- mmarray[n++] = mm;
- }
- cgroup_iter_end(cs->css.cgroup, &it);
- read_unlock(&tasklist_lock);
- /*
- * Now that we've dropped the tasklist spinlock, we can
- * rebind the vma mempolicies of each mm in mmarray[] to their
- * new cpuset, and release that mm. The mpol_rebind_mm()
- * call takes mmap_sem, which we couldn't take while holding
- * tasklist_lock. Forks can happen again now - the mpol_dup()
- * cpuset_being_rebound check will catch such forks, and rebind
- * their vma mempolicies too. Because we still hold the global
- * cgroup_mutex, we know that no other rebind effort will
- * be contending for the global variable cpuset_being_rebound.
- * It's ok if we rebind the same mm twice; mpol_rebind_mm()
- * is idempotent. Also migrate pages in each mm to new nodes.
- */
- migrate = is_memory_migrate(cs);
- for (i = 0; i < n; i++) {
- struct mm_struct *mm = mmarray[i];
+ migrate = is_memory_migrate(cs);
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
mmput(mm);
}
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
- kfree(mmarray);
cpuset_being_rebound = NULL;
- retval = 0;
-done:
- return retval;
+}
+
+/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!nodes_empty(cp->mems_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
}
/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1139,15 +1131,16 @@ done:
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- nodemask_t oldmem;
int retval;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+ * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
* it's read-only
*/
- if (cs == &top_cpuset)
- return -EACCES;
+ if (cs == &top_cpuset) {
+ retval = -EACCES;
+ goto done;
+ }
/*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1163,11 +1156,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- return -EINVAL;
+ node_states[N_MEMORY])) {
+ retval = -EINVAL;
+ goto done;
+ }
}
- oldmem = cs->mems_allowed;
- if (nodes_equal(oldmem, trialcs->mems_allowed)) {
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
@@ -1177,49 +1172,76 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
- cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
- retval = update_tasks_nodemask(cs, &oldmem);
+ update_tasks_nodemask_hier(cs, true);
done:
return retval;
}
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
- if (val < -1 || val >= SD_LV_MAX)
+#ifdef CONFIG_SMP
+ if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
+#endif
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
}
+/**
+ * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ *
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
+ */
+static void update_tasks_flags(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
+}
+
/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
- int err;
int balance_flag_changed;
+ int spread_flag_changed;
+ int err;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
@@ -1237,13 +1259,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
+ spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+ || (is_spread_page(cs) != is_spread_page(trialcs)));
+
mutex_lock(&callback_mutex);
cs->flags = trialcs->flags;
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
+ if (spread_flag_changed)
+ update_tasks_flags(cs);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1347,59 +1374,140 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
+static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss,
- struct cgroup *cont, struct task_struct *tsk)
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
- struct cpuset *cs = cgroup_cs(cont);
- int ret = 0;
+ struct cpuset *cs = css_cs(css);
+ struct task_struct *task;
+ int ret;
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
- if (tsk->flags & PF_THREAD_BOUND) {
- mutex_lock(&callback_mutex);
- if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
- ret = -EINVAL;
- mutex_unlock(&callback_mutex);
+ mutex_lock(&cpuset_mutex);
+
+ /*
+ * We allow to move tasks into an empty cpuset if sane_behavior
+ * flag is set.
+ */
+ ret = -ENOSPC;
+ if (!cgroup_sane_behavior(css->cgroup) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ goto out_unlock;
+
+ cgroup_taskset_for_each(task, tset) {
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ ret = -EINVAL;
+ if (task->flags & PF_NO_SETAFFINITY)
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
}
-static void cpuset_attach(struct cgroup_subsys *ss,
- struct cgroup *cont, struct cgroup *oldcont,
- struct task_struct *tsk)
+static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ css_cs(css)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
+static void cpuset_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
- nodemask_t from, to;
+ /* static buf protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
- struct cpuset *cs = cgroup_cs(cont);
- struct cpuset *oldcs = cgroup_cs(oldcont);
- int err;
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
- if (cs == &top_cpuset) {
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- mutex_lock(&callback_mutex);
- guarantee_online_cpus(cs, cpus_attach);
- mutex_unlock(&callback_mutex);
+ else
+ guarantee_online_cpus(cpus_cs, cpus_attach);
+
+ guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+
+ cgroup_taskset_for_each(task, tset) {
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flag(cs, task);
}
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- if (err)
- return;
- from = oldcs->mems_allowed;
- to = cs->mems_allowed;
- mm = get_task_mm(tsk);
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_to = cs->mems_allowed;
+ mm = get_task_mm(leader);
if (mm) {
- mpol_rebind_mm(mm, &to);
- if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &from, &to);
+ struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
+
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+ /*
+ * old_mems_allowed is the same with mems_allowed here, except
+ * if this task is being moved automatically due to hotplug.
+ * In that case @mems_allowed has been updated and is empty,
+ * so @old_mems_allowed is the right nodesets that we migrate
+ * mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+ }
mmput(mm);
}
+
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1419,14 +1527,18 @@ typedef enum {
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
-static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = 0;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1452,28 +1564,29 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
-static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1483,28 +1596,57 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ buf = strstrip(buf);
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
+ */
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ flush_work(&cpuset_hotplug_work);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
- switch (cft->private) {
+ switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
@@ -1517,8 +1659,11 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
- cgroup_unlock();
- return retval;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ return retval ?: nbytes;
}
/*
@@ -1528,72 +1673,46 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
- * A single large read to a buffer that crosses a page boundary is
- * ok, because the result being copied to user land is not recomputed
- * across a page fault.
*/
-
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
- int ret;
-
- mutex_lock(&callback_mutex);
- ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
-
- return ret;
-}
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ ssize_t count;
+ char *buf, *s;
+ int ret = 0;
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
-{
- nodemask_t mask;
+ count = seq_get_buf(sf, &buf);
+ s = buf;
mutex_lock(&callback_mutex);
- mask = cs->mems_allowed;
- mutex_unlock(&callback_mutex);
-
- return nodelist_scnprintf(page, PAGE_SIZE, mask);
-}
-
-static ssize_t cpuset_common_file_read(struct cgroup *cont,
- struct cftype *cft,
- struct file *file,
- char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct cpuset *cs = cgroup_cs(cont);
- cpuset_filetype_t type = cft->private;
- char *page;
- ssize_t retval = 0;
- char *s;
-
- if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
- return -ENOMEM;
-
- s = page;
switch (type) {
case FILE_CPULIST:
- s += cpuset_sprintf_cpulist(s, cs);
+ s += cpulist_scnprintf(s, count, cs->cpus_allowed);
break;
case FILE_MEMLIST:
- s += cpuset_sprintf_memlist(s, cs);
+ s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
default:
- retval = -EINVAL;
- goto out;
+ ret = -EINVAL;
+ goto out_unlock;
}
- *s++ = '\n';
- retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
- free_page((unsigned long)page);
- return retval;
+ if (s < buf + count - 1) {
+ *s++ = '\n';
+ seq_commit(sf, s - buf);
+ } else {
+ seq_commit(sf, -1);
+ }
+out_unlock:
+ mutex_unlock(&callback_mutex);
+ return ret;
}
-static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1622,9 +1741,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
return 0;
}
-static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1645,16 +1764,16 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
static struct cftype files[] = {
{
.name = "cpus",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
@@ -1706,6 +1825,7 @@ static struct cftype files[] = {
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE,
+ .mode = S_IRUGO,
},
{
@@ -1721,86 +1841,32 @@ static struct cftype files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
- .name = "memory_pressure_enabled",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
-};
-
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- int err;
-
- err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
- if (err)
- return err;
- /* memory_pressure_enabled is in root cpuset only */
- if (!cont->parent)
- err = cgroup_add_file(cont, ss,
- &cft_memory_pressure_enabled);
- return err;
-}
-/*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
- *
- * Currently we refuse to set up the cgroup - thereby
- * refusing the task to be entered, and as a result refusing
- * the sys_unshare() or clone() which initiated it - if any
- * sibling cpusets have exclusive cpus or mem.
- *
- * If this becomes a problem for some users who wish to
- * allow that scenario, then cpuset_post_clone() could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
- * held.
- */
-static void cpuset_post_clone(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct cgroup *parent, *child;
- struct cpuset *cs, *parent_cs;
-
- parent = cgroup->parent;
- list_for_each_entry(child, &parent->children, sibling) {
- cs = cgroup_cs(child);
- if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
- return;
- }
- cs = cgroup_cs(cgroup);
- parent_cs = cgroup_cs(parent);
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
- cs->mems_allowed = parent_cs->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
- return;
-}
+ { } /* terminate */
+};
/*
- * cpuset_create - create a cpuset
- * ss: cpuset cgroup subsystem
- * cont: control group that the new cpuset will be part of
+ * cpuset_css_alloc - allocate a cpuset css
+ * cgrp: control group that the new cpuset will be part of
*/
-static struct cgroup_subsys_state *cpuset_create(
- struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
- struct cpuset *parent;
- if (!cont->parent) {
- /* This is early initialization for the top cgroup */
- top_cpuset.mems_generation = cpuset_mems_generation++;
+ if (!parent_css)
return &top_cpuset.css;
- }
- parent = cgroup_cs(cont->parent);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1808,71 +1874,110 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cpuset_update_task_memory_state();
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
- cs->parent = parent;
- number_of_cpusets++;
- return &cs->css ;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ cpuset_inc();
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * histrical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ mutex_unlock(&callback_mutex);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
-static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
- cpuset_update_task_memory_state();
+ mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- number_of_cpusets--;
+ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
-struct cgroup_subsys cpuset_subsys = {
- .name = "cpuset",
- .create = cpuset_create,
- .destroy = cpuset_destroy,
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .populate = cpuset_populate,
- .post_clone = cpuset_post_clone,
- .subsys_id = cpuset_subsys_id,
+ .base_cftypes = files,
.early_init = 1,
};
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
-
- top_cpuset.mems_generation = cpuset_mems_generation++;
- return 0;
-}
-
-
/**
* cpuset_init - initialize cpusets at system boot
*
@@ -1883,11 +1988,13 @@ int __init cpuset_init(void)
{
int err = 0;
+ if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+ BUG();
+
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
fmeter_init(&top_cpuset.fmeter);
- top_cpuset.mems_generation = cpuset_mems_generation++;
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
@@ -1898,230 +2005,230 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
- number_of_cpusets = 1;
return 0;
}
-/**
- * cpuset_do_move_task - move a given task to another cpuset
- * @tsk: pointer to task_struct the task to move
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- * Return nonzero to stop the walk through the tasks.
- */
-static void cpuset_do_move_task(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- struct cpuset_hotplug_scanner *chsp;
-
- chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
- cgroup_attach_task(chsp->to, tsk);
-}
-
-/**
- * move_member_tasks_to_cpuset - move tasks from one cpuset to another
- * @from: cpuset in which the tasks currently reside
- * @to: cpuset to which the tasks will be moved
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
- *
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- */
-static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
-{
- struct cpuset_hotplug_scanner scan;
-
- scan.scan.cg = from->css.cgroup;
- scan.scan.test_task = NULL; /* select all tasks in cgroup */
- scan.scan.process_task = cpuset_do_move_task;
- scan.scan.heap = NULL;
- scan.to = to->css.cgroup;
-
- if (cgroup_scan_tasks(&scan.scan))
- printk(KERN_ERR "move_member_tasks_to_cpuset: "
- "cgroup_scan_tasks failed\n");
-}
-
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
- */
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
-
- /*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
- move_member_tasks_to_cpuset(cs, parent);
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
}
-/*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
- *
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
- *
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+/**
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset in interest
*
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static void scan_for_empty_cpusets(struct cpuset *root)
-{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
- nodemask_t oldmems;
-
- list_add_tail((struct list_head *)&root->stack_list, &queue);
-
- while (!list_empty(&queue)) {
- cp = list_first_entry(&queue, struct cpuset, stack_list);
- list_del(queue.next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &queue);
- }
-
- /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
- nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
- continue;
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+{
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems;
+ bool is_empty;
+ bool sane = cgroup_sane_behavior(cs->css.cgroup);
- oldmems = cp->mems_allowed;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- /* Remove offline cpus and mems from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_online_mask);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_HIGH_MEMORY]);
- mutex_unlock(&callback_mutex);
+ mutex_lock(&cpuset_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed) ||
- nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else {
- update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems);
- }
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
}
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * If sane_behavior flag is set, we need to update tasks' cpumask
+ * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
+ * call update_tasks_cpumask() if the cpuset becomes empty, as
+ * the tasks in it will be migrated to an ancestor.
+ */
+ if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+ (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+ update_tasks_cpumask(cs);
+
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * If sane_behavior flag is set, we need to update tasks' nodemask
+ * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
+ * call update_tasks_nodemask() if the cpuset becomes empty, as
+ * the tasks in it will be migratd to an ancestor.
+ */
+ if ((sane && nodes_empty(cs->mems_allowed)) ||
+ (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+ *
+ * Otherwise move tasks to the nearest ancestor with execution
+ * resources. This is full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (!sane && is_empty)
+ remove_tasks_in_empty_cpuset(cs);
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
*
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
- unsigned long phase, void *unused_cpu)
+static void cpuset_hotplug_workfn(struct work_struct *work)
{
- struct sched_domain_attr *attr;
- struct cpumask *doms;
- int ndoms;
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
- switch (phase) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- break;
+ mutex_lock(&cpuset_mutex);
- default:
- return NOTIFY_DONE;
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
+
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
}
- cgroup_lock();
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
- scan_for_empty_cpusets(&top_cpuset);
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ mutex_lock(&callback_mutex);
+ top_cpuset.mems_allowed = new_mems;
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset);
+ }
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ mutex_unlock(&cpuset_mutex);
- return NOTIFY_OK;
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+ }
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
+}
+
+void cpuset_update_active_cpus(bool cpu_online)
+{
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- break;
- case MEM_OFFLINE:
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- scan_for_empty_cpusets(&top_cpuset);
- break;
- default:
- break;
- }
- cgroup_unlock();
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
+ top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
- hotcpu_notifier(cpuset_track_online_cpus, 0);
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
-
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
/**
@@ -2131,26 +2238,48 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
+ struct cpuset *cpus_cs;
+
mutex_lock(&callback_mutex);
- cpuset_cpus_allowed_locked(tsk, pmask);
+ rcu_read_lock();
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ guarantee_online_cpus(cpus_cs, pmask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
- task_unlock(tsk);
+ struct cpuset *cpus_cs;
+
+ rcu_read_lock();
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
+ */
}
void cpuset_init_current_mems_allowed(void)
@@ -2164,18 +2293,20 @@ void cpuset_init_current_mems_allowed(void)
*
* Description: Returns the nodemask_t mems_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * subset of node_states[N_MEMORY], even if this means going outside the
* tasks cpuset.
**/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
+ struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
- task_lock(tsk);
- guarantee_online_mems(task_cs(tsk), &mask);
- task_unlock(tsk);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &mask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
return mask;
@@ -2198,34 +2329,32 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
* callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
-static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
/**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If
- * __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. If it's not a
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
- * If the task has been OOM killed and has access to memory reserves
- * as specified by the TIF_MEMDIE flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
- * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
- * from an enclosing cpuset.
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
*
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
- * hardwall cpusets, and never sleeps.
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2393,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
* GFP_USER - only nodes in current tasks mems allowed ok.
*
* Rule:
- * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
* the code that might scan up ancestor cpusets and sleep.
*/
-
-int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- int node; /* node that zone z is on */
- const struct cpuset *cs; /* current cpuset ancestors */
+ struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2296,25 +2422,25 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
mutex_lock(&callback_mutex);
- task_lock(current);
+ rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
- task_unlock(current);
-
allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+
mutex_unlock(&callback_mutex);
return allowed;
}
/*
- * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate.
- * If __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the
- * TIF_MEMDIE flag, yes. Otherwise, no.
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If the task has been OOM killed and has access to memory reserves as
+ * specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2448,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
* any node on the zonelist except the first. By the time any such
* calls get to this routine, we should just shut up and say 'yes'.
*
- * Unlike the cpuset_zone_allowed_softwall() variant, above,
- * this variant requires that the zone be in the current tasks
+ * Unlike the cpuset_node_allowed_softwall() variant, above,
+ * this variant requires that the node be in the current task's
* mems_allowed or that we're in interrupt. It does not scan up the
* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
* It never sleeps.
*/
-
-int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
- int node; /* node that zone z is on */
-
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
/*
@@ -2348,34 +2470,8 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
}
/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
- mutex_lock(&callback_mutex);
-}
-
-/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2400,16 +2496,35 @@ void cpuset_unlock(void)
* See kmem_cache_alloc_node().
*/
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
{
int node;
- node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ node = next_node(*rotor, current->mems_allowed);
if (node == MAX_NUMNODES)
node = first_node(current->mems_allowed);
- current->cpuset_mem_spread_rotor = node;
+ *rotor = node;
return node;
}
+
+int cpuset_mem_spread_node(void)
+{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+ if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_slab_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
@@ -2429,26 +2544,33 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
+#define CPUSET_NODELIST_LEN (256)
+
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @task: pointer to task_struct of some task.
+ * @tsk: pointer to task_struct of some task.
*
* Description: Prints @task's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log. Must hold task_lock(task) to allow
- * dereferencing task_cs(task).
+ * mems_allowed to the kernel log.
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- struct dentry *dentry;
+ /* Statically allocated to prevent using excess stack. */
+ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+ static DEFINE_SPINLOCK(cpuset_buffer_lock);
+ struct cgroup *cgrp;
- dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+ rcu_read_lock();
+
+ cgrp = task_cs(tsk)->css.cgroup;
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
- printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cpuset_name, cpuset_nodelist);
+ pr_info("%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+
+ rcu_read_unlock();
spin_unlock(&cpuset_buffer_lock);
}
@@ -2480,9 +2602,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
void __cpuset_memory_pressure_bump(void)
{
- task_lock(current);
+ rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
- task_unlock(current);
+ rcu_read_unlock();
}
#ifdef CONFIG_PROC_PID_CPUSET
@@ -2492,19 +2614,19 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2514,50 +2636,32 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
- css = task_subsys_state(tsk, cpuset_subsys_id);
- retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
- if (retval < 0)
- goto out_unlock;
- seq_puts(m, buf);
+ retval = -ENAMETOOLONG;
+ rcu_read_lock();
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
+ rcu_read_unlock();
+ if (!p)
+ goto out_put_task;
+ seq_puts(m, p);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+ retval = 0;
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_PID_CPUSET */
-/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Cpus_allowed:\t");
- seq_cpumask(m, &task->cpus_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Cpus_allowed_list:\t");
- seq_cpumask_list(m, &task->cpus_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Mems_allowed:\t");
+ seq_puts(m, "Mems_allowed:\t");
seq_nodemask(m, &task->mems_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Mems_allowed_list:\t");
+ seq_puts(m, "\n");
+ seq_puts(m, "Mems_allowed_list:\t");
seq_nodemask_list(m, &task->mems_allowed);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 00000000000..c766ee54c0b
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,45 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * stores the size of elf header of crash image
+ */
+unsigned long long elfcorehdr_size;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ *
+ * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ if (*end == '@') {
+ elfcorehdr_size = elfcorehdr_addr;
+ elfcorehdr_addr = memparse(end + 1, &end);
+ }
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d70..e0573a43c7d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -8,73 +8,76 @@
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/cred.h>
+#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/cn_proc.h>
-#include "cred-internals.h"
-static struct kmem_cache *cred_jar;
-
-/*
- * The common credentials for the initial task's thread group
- */
-#ifdef CONFIG_KEYS
-static struct thread_group_cred init_tgcred = {
- .usage = ATOMIC_INIT(2),
- .tgid = 0,
- .lock = SPIN_LOCK_UNLOCKED,
-};
+#if 0
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+#define kdebug(FMT, ...) \
+ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#endif
+static struct kmem_cache *cred_jar;
+
/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ .subscribers = ATOMIC_INIT(2),
+ .magic = CRED_MAGIC,
+#endif
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_INIT_INH_SET,
+ .cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_INIT_EFF_SET,
- .cap_bset = CAP_INIT_BSET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
.user = INIT_USER,
+ .user_ns = &init_user_ns,
.group_info = &init_groups,
-#ifdef CONFIG_KEYS
- .tgcred = &init_tgcred,
-#endif
};
-/*
- * Dispose of the shared task group credentials
- */
-#ifdef CONFIG_KEYS
-static void release_tgcred_rcu(struct rcu_head *rcu)
+static inline void set_cred_subscribers(struct cred *cred, int n)
{
- struct thread_group_cred *tgcred =
- container_of(rcu, struct thread_group_cred, rcu);
-
- BUG_ON(atomic_read(&tgcred->usage) != 0);
-
- key_put(tgcred->session_keyring);
- key_put(tgcred->process_keyring);
- kfree(tgcred);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ atomic_set(&cred->subscribers, n);
+#endif
}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ return atomic_read(&cred->subscribers);
+#else
+ return 0;
#endif
+}
-/*
- * Release a set of thread group credentials.
- */
-static void release_tgcred(struct cred *cred)
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = cred->tgcred;
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ struct cred *cred = (struct cred *) _cred;
- if (atomic_dec_and_test(&tgcred->usage))
- call_rcu(&tgcred->rcu, release_tgcred_rcu);
+ atomic_add(n, &cred->subscribers);
#endif
}
@@ -85,16 +88,32 @@ static void put_cred_rcu(struct rcu_head *rcu)
{
struct cred *cred = container_of(rcu, struct cred, rcu);
+ kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ if (cred->magic != CRED_MAGIC_DEAD ||
+ atomic_read(&cred->usage) != 0 ||
+ read_cred_subscribers(cred) != 0)
+ panic("CRED: put_cred_rcu() sees %p with"
+ " mag %x, put %p, usage %d, subscr %d\n",
+ cred, cred->magic, cred->put_addr,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+#else
if (atomic_read(&cred->usage) != 0)
panic("CRED: put_cred_rcu() sees %p with usage %d\n",
cred, atomic_read(&cred->usage));
+#endif
security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
- release_tgcred(cred);
- put_group_info(cred->group_info);
+ if (cred->group_info)
+ put_group_info(cred->group_info);
free_uid(cred->user);
+ put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -106,12 +125,99 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
+ kdebug("__put_cred(%p{%d,%d})", cred,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+
BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(cred) != 0);
+ cred->magic = CRED_MAGIC_DEAD;
+ cred->put_addr = __builtin_return_address(0);
+#endif
+ BUG_ON(cred == current->cred);
+ BUG_ON(cred == current->real_cred);
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+ struct cred *cred;
+
+ kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ cred = (struct cred *) tsk->real_cred;
+ tsk->real_cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+
+ cred = (struct cred *) tsk->cred;
+ tsk->cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+}
+
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away. Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+ const struct cred *cred;
+
+ rcu_read_lock();
+
+ do {
+ cred = __task_cred((task));
+ BUG_ON(!cred);
+ } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+ rcu_read_unlock();
+ return cred;
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+ struct cred *new;
+
+ new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
+
+ if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ goto error;
+
+ return new;
+
+error:
+ abort_creds(new);
+ return NULL;
+}
+
/**
* prepare_creds - Prepare a new set of credentials for modification
*
@@ -132,23 +238,28 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- BUG_ON(atomic_read(&task->real_cred->usage) < 1);
+ validate_process_creds();
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
+ kdebug("prepare_creds() alloc %p", new);
+
old = task->cred;
memcpy(new, old, sizeof(struct cred));
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
+ get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
- atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
@@ -157,6 +268,7 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
+ validate_creds(new);
return new;
error:
@@ -167,96 +279,27 @@ EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_exec_mutex
+ * - The caller must hold ->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
- struct thread_group_cred *tgcred = NULL;
struct cred *new;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred)
- return NULL;
-#endif
-
new = prepare_creds();
- if (!new) {
- kfree(tgcred);
+ if (!new)
return new;
- }
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
- /* create a new per-thread-group creds for all this set of threads to
- * share */
- memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
-
/* inherit the session keyring; new process keyring */
- key_get(tgcred->session_keyring);
- tgcred->process_keyring = NULL;
-
- release_tgcred(new);
- new->tgcred = tgcred;
-#endif
-
- return new;
-}
-
-/*
- * prepare new credentials for the usermode helper dispatcher
- */
-struct cred *prepare_usermodehelper_creds(void)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = NULL;
-#endif
- struct cred *new;
-
-#ifdef CONFIG_KEYS
- tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
- if (!tgcred)
- return NULL;
-#endif
-
- new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
- if (!new)
- return NULL;
-
- memcpy(new, &init_cred, sizeof(struct cred));
-
- atomic_set(&new->usage, 1);
- get_group_info(new->group_info);
- get_uid(new->user);
-
-#ifdef CONFIG_KEYS
- new->thread_keyring = NULL;
- new->request_key_auth = NULL;
- new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
#endif
-#ifdef CONFIG_SECURITY
- new->security = NULL;
-#endif
- if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
- goto error;
-
- BUG_ON(atomic_read(&new->usage) != 1);
return new;
-
-error:
- put_cred(new);
- return NULL;
}
/*
@@ -270,14 +313,9 @@ error:
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
struct cred *new;
int ret;
- mutex_init(&p->cred_exec_mutex);
-
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
@@ -286,6 +324,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
) {
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
+ alter_cred_subscribers(p->cred, 2);
+ kdebug("share_creds(%p{%d,%d})",
+ p->cred, atomic_read(&p->cred->usage),
+ read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes);
return 0;
}
@@ -310,27 +352,19 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
install_thread_keyring_to_cred(new);
}
- /* we share the process and session keyrings between all the threads in
- * a process - this is slightly icky as we violate COW credentials a
- * bit */
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
if (!(clone_flags & CLONE_THREAD)) {
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- ret = -ENOMEM;
- goto error_put;
- }
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = key_get(new->tgcred->session_keyring);
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
}
#endif
atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
+ alter_cred_subscribers(new, 2);
+ validate_creds(new);
return 0;
error_put:
@@ -338,6 +372,31 @@ error_put:
return ret;
}
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -355,23 +414,28 @@ error_put:
int commit_creds(struct cred *new)
{
struct task_struct *task = current;
- const struct cred *old;
+ const struct cred *old = task->real_cred;
- BUG_ON(task->cred != task->real_cred);
- BUG_ON(atomic_read(&task->real_cred->usage) < 2);
- BUG_ON(atomic_read(&new->usage) < 1);
+ kdebug("commit_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
- old = task->real_cred;
- security_commit_creds(new, old);
+ BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(old) < 2);
+ validate_creds(old);
+ validate_creds(new);
+#endif
+ BUG_ON(atomic_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (old->euid != new->euid ||
- old->egid != new->egid ||
- old->fsuid != new->fsuid ||
- old->fsgid != new->fsgid ||
- !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
@@ -379,37 +443,35 @@ int commit_creds(struct cred *new)
}
/* alter the thread keyring */
- if (new->fsuid != old->fsuid)
+ if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(task);
- if (new->fsgid != old->fsgid)
+ if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(task);
/* do it
- * - What if a process setreuid()'s and this brings the
- * new uid over his NPROC rlimit? We can check this now
- * cheaply with the new uid cache, so if it matters
- * we should be checking for it. -DaveM
+ * RLIMIT_NPROC limits on user->processes have already been checked
+ * in set_user().
*/
+ alter_cred_subscribers(new, 2);
if (new->user != old->user)
atomic_inc(&new->user->processes);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
atomic_dec(&old->user->processes);
-
- sched_switch_user(task);
+ alter_cred_subscribers(old, -2);
/* send notifications */
- if (new->uid != old->uid ||
- new->euid != old->euid ||
- new->suid != old->suid ||
- new->fsuid != old->fsuid)
+ if (!uid_eq(new->uid, old->uid) ||
+ !uid_eq(new->euid, old->euid) ||
+ !uid_eq(new->suid, old->suid) ||
+ !uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
- if (new->gid != old->gid ||
- new->egid != old->egid ||
- new->sgid != old->sgid ||
- new->fsgid != old->fsgid)
+ if (!gid_eq(new->gid, old->gid) ||
+ !gid_eq(new->egid, old->egid) ||
+ !gid_eq(new->sgid, old->sgid) ||
+ !gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
@@ -428,6 +490,13 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
+ kdebug("abort_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(new) != 0);
+#endif
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
@@ -444,7 +513,20 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- rcu_assign_pointer(current->cred, get_cred(new));
+ kdebug("override_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ validate_creds(old);
+ validate_creds(new);
+ get_cred(new);
+ alter_cred_subscribers(new, 1);
+ rcu_assign_pointer(current->cred, new);
+ alter_cred_subscribers(old, -1);
+
+ kdebug("override_creds() = %p{%d,%d}", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
return old;
}
EXPORT_SYMBOL(override_creds);
@@ -460,7 +542,15 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
+ kdebug("revert_creds(%p{%d,%d})", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
+
+ validate_creds(old);
+ validate_creds(override);
+ alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
+ alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
@@ -502,20 +592,27 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
+ kdebug("prepare_kernel_cred() alloc %p", new);
+
if (daemon)
old = get_task_cred(daemon);
else
old = get_cred(&init_cred);
+ validate_creds(old);
+
*new = *old;
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_uid(new->user);
+ get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_inc(&init_tgcred.usage);
- new->tgcred = &init_tgcred;
- new->request_key_auth = NULL;
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
@@ -525,8 +622,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
- atomic_set(&new->usage, 1);
put_cred(old);
+ validate_creds(new);
return new;
error:
@@ -589,3 +686,122 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+bool creds_are_invalid(const struct cred *cred)
+{
+ if (cred->magic != CRED_MAGIC)
+ return true;
+#ifdef CONFIG_SECURITY_SELINUX
+ /*
+ * cred->security == NULL if security_cred_alloc_blank() or
+ * security_prepare_creds() returned an error.
+ */
+ if (selinux_is_enabled() && cred->security) {
+ if ((unsigned long) cred->security < PAGE_SIZE)
+ return true;
+ if ((*(u32 *)cred->security & 0xffffff00) ==
+ (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
+ return true;
+ }
+#endif
+ return false;
+}
+EXPORT_SYMBOL(creds_are_invalid);
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+ const struct task_struct *tsk)
+{
+ printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+ label, cred,
+ cred == &init_cred ? "[init]" : "",
+ cred == tsk->real_cred ? "[real]" : "",
+ cred == tsk->cred ? "[eff]" : "");
+ printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+ cred->magic, cred->put_addr);
+ printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+ printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+ from_kuid_munged(&init_user_ns, cred->uid),
+ from_kuid_munged(&init_user_ns, cred->euid),
+ from_kuid_munged(&init_user_ns, cred->suid),
+ from_kuid_munged(&init_user_ns, cred->fsuid));
+ printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+ from_kgid_munged(&init_user_ns, cred->gid),
+ from_kgid_munged(&init_user_ns, cred->egid),
+ from_kgid_munged(&init_user_ns, cred->sgid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
+#ifdef CONFIG_SECURITY
+ printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+ if ((unsigned long) cred->security >= PAGE_SIZE &&
+ (((unsigned long) cred->security & 0xffffff00) !=
+ (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+ printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+ ((u32*)cred->security)[0],
+ ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+ printk(KERN_ERR "CRED: Invalid credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ dump_invalid_creds(cred, "Specified", current);
+ BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+ const char *file, unsigned line)
+{
+ if (tsk->cred == tsk->real_cred) {
+ if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ } else {
+ if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+ read_cred_subscribers(tsk->cred) < 1 ||
+ creds_are_invalid(tsk->real_cred) ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ }
+ return;
+
+invalid_creds:
+ printk(KERN_ERR "CRED: Invalid process credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+ dump_invalid_creds(tsk->real_cred, "Real", tsk);
+ if (tsk->cred != tsk->real_cred)
+ dump_invalid_creds(tsk->cred, "Effective", tsk);
+ else
+ printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+ BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+ kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+ tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ __validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 00000000000..a85edc33998
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the linux kernel debugger
+#
+
+obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
+obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 00000000000..1adf62b39b9
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,1067 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+#include <linux/pid_namespace.h>
+#include <linux/clocksource.h>
+#include <linux/serial_core.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/threads.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/pid.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/rcupdate.h>
+
+#include <asm/cacheflush.h>
+#include <asm/byteorder.h>
+#include <linux/atomic.h>
+
+#include "debug_core.h"
+
+static int kgdb_break_asap;
+
+struct debuggerinfo_struct kgdb_info[NR_CPUS];
+
+/**
+ * kgdb_connected - Is a host GDB connected to us?
+ */
+int kgdb_connected;
+EXPORT_SYMBOL_GPL(kgdb_connected);
+
+/* All the KGDB handlers are installed */
+int kgdb_io_module_registered;
+
+/* Guard for recursive entry */
+static int exception_level;
+
+struct kgdb_io *dbg_io_ops;
+static DEFINE_SPINLOCK(kgdb_registration_lock);
+
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
+/* kgdb console driver is loaded */
+static int kgdb_con_registered;
+/* determine if kgdb console output should be used */
+static int kgdb_use_con;
+/* Flag for alternate operations for early debugging */
+bool dbg_is_early = true;
+/* Next cpu to become the master debug core */
+int dbg_switch_cpu;
+
+/* Use kdb or gdbserver mode */
+int dbg_kdb_mode = 1;
+
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
+module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
+
+/*
+ * Holds information about breakpoints in a kernel. These breakpoints are
+ * added and removed by gdb.
+ */
+static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
+ [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
+};
+
+/*
+ * The CPU# of the active CPU, or -1 if none:
+ */
+atomic_t kgdb_active = ATOMIC_INIT(-1);
+EXPORT_SYMBOL_GPL(kgdb_active);
+static DEFINE_RAW_SPINLOCK(dbg_master_lock);
+static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
+
+/*
+ * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
+ * bootup code (which might not have percpu set up yet):
+ */
+static atomic_t masters_in_kgdb;
+static atomic_t slaves_in_kgdb;
+static atomic_t kgdb_break_tasklet_var;
+atomic_t kgdb_setting_breakpoint;
+
+struct task_struct *kgdb_usethread;
+struct task_struct *kgdb_contthread;
+
+int kgdb_single_step;
+static pid_t kgdb_sstep_pid;
+
+/* to keep track of the CPU which is doing the single stepping*/
+atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+
+/*
+ * If you are debugging a problem where roundup (the collection of
+ * all other CPUs) is a problem [this should be extremely rare],
+ * then use the nokgdbroundup option to avoid roundup. In that case
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+static int kgdb_do_roundup = 1;
+
+static int __init opt_nokgdbroundup(char *str)
+{
+ kgdb_do_roundup = 0;
+
+ return 0;
+}
+
+early_param("nokgdbroundup", opt_nokgdbroundup);
+
+/*
+ * Finally, some KGDB code :-)
+ */
+
+/*
+ * Weak aliases for breakpoint management,
+ * can be overriden by architectures when needed:
+ */
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ return err;
+}
+
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+ struct kgdb_bkpt tmp;
+ int err;
+ /* Validate setting the breakpoint and then removing it. If the
+ * remove fails, the kernel needs to emit a bad message because we
+ * are deep trouble not being able to put things back the way we
+ * found them.
+ */
+ tmp.bpt_addr = addr;
+ err = kgdb_arch_set_breakpoint(&tmp);
+ if (err)
+ return err;
+ err = kgdb_arch_remove_breakpoint(&tmp);
+ if (err)
+ printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+ "memory destroyed at: %lx", addr);
+ return err;
+}
+
+unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+
+int __weak kgdb_arch_init(void)
+{
+ return 0;
+}
+
+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ return 0;
+}
+
+/*
+ * Some architectures need cache flushes when we set/clear a
+ * breakpoint:
+ */
+static void kgdb_flush_swbreak_addr(unsigned long addr)
+{
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
+ }
+
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * SW breakpoint management:
+ */
+int dbg_activate_sw_breakpoints(void)
+{
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_SET)
+ continue;
+
+ error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
+ if (error) {
+ ret = error;
+ printk(KERN_INFO "KGDB: BP install failed: %lx",
+ kgdb_break[i].bpt_addr);
+ continue;
+ }
+
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
+ kgdb_break[i].state = BP_ACTIVE;
+ }
+ return ret;
+}
+
+int dbg_set_sw_break(unsigned long addr)
+{
+ int err = kgdb_validate_break_address(addr);
+ int breakno = -1;
+ int i;
+
+ if (err)
+ return err;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return -EEXIST;
+ }
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_REMOVED &&
+ kgdb_break[i].bpt_addr == addr) {
+ breakno = i;
+ break;
+ }
+ }
+
+ if (breakno == -1) {
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_UNDEFINED) {
+ breakno = i;
+ break;
+ }
+ }
+ }
+
+ if (breakno == -1)
+ return -E2BIG;
+
+ kgdb_break[breakno].state = BP_SET;
+ kgdb_break[breakno].type = BP_BREAKPOINT;
+ kgdb_break[breakno].bpt_addr = addr;
+
+ return 0;
+}
+
+int dbg_deactivate_sw_breakpoints(void)
+{
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ continue;
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
+ if (error) {
+ printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+ ret = error;
+ }
+
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
+ kgdb_break[i].state = BP_SET;
+ }
+ return ret;
+}
+
+int dbg_remove_sw_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr)) {
+ kgdb_break[i].state = BP_REMOVED;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int kgdb_isremovedbreak(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_REMOVED) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return 1;
+ }
+ return 0;
+}
+
+int dbg_remove_all_break(void)
+{
+ int error;
+ int i;
+
+ /* Clear memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ goto setundefined;
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
+ if (error)
+ printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+setundefined:
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+
+ /* Clear hardware breakpoints. */
+ if (arch_kgdb_ops.remove_all_hw_break)
+ arch_kgdb_ops.remove_all_hw_break();
+
+ return 0;
+}
+
+/*
+ * Return true if there is a valid kgdb I/O module. Also if no
+ * debugger is attached a message can be printed to the console about
+ * waiting for the debugger to attach.
+ *
+ * The print_wait argument is only to be true when called from inside
+ * the core kgdb_handle_exception, because it will wait for the
+ * debugger to attach.
+ */
+static int kgdb_io_ready(int print_wait)
+{
+ if (!dbg_io_ops)
+ return 0;
+ if (kgdb_connected)
+ return 1;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ return 1;
+ if (print_wait) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+#else
+ printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+#endif
+ }
+ return 1;
+}
+
+static int kgdb_reenter_check(struct kgdb_state *ks)
+{
+ unsigned long addr;
+
+ if (atomic_read(&kgdb_active) != raw_smp_processor_id())
+ return 0;
+
+ /* Panic on recursive debugger calls: */
+ exception_level++;
+ addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ dbg_deactivate_sw_breakpoints();
+
+ /*
+ * If the break point removed ok at the place exception
+ * occurred, try to recover and print a warning to the end
+ * user because the user planted a breakpoint in a place that
+ * KGDB needs in order to function.
+ */
+ if (dbg_remove_sw_break(addr) == 0) {
+ exception_level = 0;
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
+ addr);
+ WARN_ON_ONCE(1);
+
+ return 1;
+ }
+ dbg_remove_all_break();
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+
+ if (exception_level > 1) {
+ dump_stack();
+ panic("Recursive entry to debugger");
+ }
+
+ printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+#ifdef CONFIG_KGDB_KDB
+ /* Allow kdb to debug itself one level */
+ return 0;
+#endif
+ dump_stack();
+ panic("Recursive entry to debugger");
+
+ return 1;
+}
+
+static void dbg_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+}
+
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
+ int exception_state)
+{
+ unsigned long flags;
+ int sstep_tries = 100;
+ int error;
+ int cpu;
+ int trace_on = 0;
+ int online_cpus = num_online_cpus();
+
+ kgdb_info[ks->cpu].enter_kgdb++;
+ kgdb_info[ks->cpu].exception_state |= exception_state;
+
+ if (exception_state == DCPU_WANT_MASTER)
+ atomic_inc(&masters_in_kgdb);
+ else
+ atomic_inc(&slaves_in_kgdb);
+
+ if (arch_kgdb_ops.disable_hw_break)
+ arch_kgdb_ops.disable_hw_break(regs);
+
+acquirelock:
+ /*
+ * Interrupts will be restored by the 'trap return' code, except when
+ * single stepping.
+ */
+ local_irq_save(flags);
+
+ cpu = ks->cpu;
+ kgdb_info[cpu].debuggerinfo = regs;
+ kgdb_info[cpu].task = current;
+ kgdb_info[cpu].ret_state = 0;
+ kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
+
+ /* Make sure the above info reaches the primary CPU */
+ smp_mb();
+
+ if (exception_level == 1) {
+ if (raw_spin_trylock(&dbg_master_lock))
+ atomic_xchg(&kgdb_active, cpu);
+ goto cpu_master_loop;
+ }
+
+ /*
+ * CPU will loop if it is a slave or request to become a kgdb
+ * master cpu and acquire the kgdb_active lock:
+ */
+ while (1) {
+cpu_loop:
+ if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
+ kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
+ goto cpu_master_loop;
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+ if (raw_spin_trylock(&dbg_master_lock)) {
+ atomic_xchg(&kgdb_active, cpu);
+ break;
+ }
+ } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
+ if (!raw_spin_is_locked(&dbg_slave_lock))
+ goto return_normal;
+ } else {
+return_normal:
+ /* Return to normal operation by executing any
+ * hw breakpoint fixup.
+ */
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic();
+ atomic_dec(&slaves_in_kgdb);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+ return 0;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * For single stepping, try to only enter on the processor
+ * that was single stepping. To guard against a deadlock, the
+ * kernel will only try for the value of sstep_tries before
+ * giving up and continuing on.
+ */
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
+ (kgdb_info[cpu].task &&
+ kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
+ atomic_set(&kgdb_active, -1);
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+
+ goto acquirelock;
+ }
+
+ if (!kgdb_io_ready(1)) {
+ kgdb_info[cpu].ret_state = 1;
+ goto kgdb_restore; /* No I/O connection, resume the system */
+ }
+
+ /*
+ * Don't enter if we have hit a removed breakpoint.
+ */
+ if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
+ goto kgdb_restore;
+
+ /* Call the I/O driver's pre_exception routine */
+ if (dbg_io_ops->pre_exception)
+ dbg_io_ops->pre_exception();
+
+ /*
+ * Get the passive CPU lock which will hold all the non-primary
+ * CPU in a spin state while the debugger is active
+ */
+ if (!kgdb_single_step)
+ raw_spin_lock(&dbg_slave_lock);
+
+#ifdef CONFIG_SMP
+ /* If send_ready set, slaves are already waiting */
+ if (ks->send_ready)
+ atomic_set(ks->send_ready, 1);
+
+ /* Signal the other CPUs to enter kgdb_wait() */
+ else if ((!kgdb_single_step) && kgdb_do_roundup)
+ kgdb_roundup_cpus(flags);
+#endif
+
+ /*
+ * Wait for the other CPUs to be notified and be waiting for us:
+ */
+ while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
+ atomic_read(&slaves_in_kgdb)) != online_cpus)
+ cpu_relax();
+
+ /*
+ * At this point the primary processor is completely
+ * in the debugger and all secondary CPUs are quiescent
+ */
+ dbg_deactivate_sw_breakpoints();
+ kgdb_single_step = 0;
+ kgdb_contthread = current;
+ exception_level = 0;
+ trace_on = tracing_is_on();
+ if (trace_on)
+ tracing_off();
+
+ while (1) {
+cpu_master_loop:
+ if (dbg_kdb_mode) {
+ kgdb_connected = 1;
+ error = kdb_stub(ks);
+ if (error == -1)
+ continue;
+ kgdb_connected = 0;
+ } else {
+ error = gdb_serial_stub(ks);
+ }
+
+ if (error == DBG_PASS_EVENT) {
+ dbg_kdb_mode = !dbg_kdb_mode;
+ } else if (error == DBG_SWITCH_CPU_EVENT) {
+ kgdb_info[dbg_switch_cpu].exception_state |=
+ DCPU_NEXT_MASTER;
+ goto cpu_loop;
+ } else {
+ kgdb_info[cpu].ret_state = error;
+ break;
+ }
+ }
+
+ /* Call the I/O driver's post_exception routine */
+ if (dbg_io_ops->post_exception)
+ dbg_io_ops->post_exception();
+
+ if (!kgdb_single_step) {
+ raw_spin_unlock(&dbg_slave_lock);
+ /* Wait till all the CPUs have quit from the debugger. */
+ while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
+ cpu_relax();
+ }
+
+kgdb_restore:
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+ int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+ if (kgdb_info[sstep_cpu].task)
+ kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+ else
+ kgdb_sstep_pid = 0;
+ }
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic();
+ atomic_dec(&masters_in_kgdb);
+ /* Free kgdb_active */
+ atomic_set(&kgdb_active, -1);
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+
+ return kgdb_info[cpu].ret_state;
+}
+
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Locking hierarchy:
+ * interface locks, if any (begin_session)
+ * kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = raw_smp_processor_id();
+ ks->ex_vector = evector;
+ ks->signo = signo;
+ ks->err_code = ecode;
+ ks->linux_regs = regs;
+
+ if (kgdb_reenter_check(ks))
+ goto out; /* Ouch, double exception ! */
+ if (kgdb_info[ks->cpu].enter_kgdb != 0)
+ goto out;
+
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
+}
+
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
+}
+
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
+int kgdb_nmicallback(int cpu, void *regs)
+{
+#ifdef CONFIG_SMP
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->linux_regs = regs;
+
+ if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
+ raw_spin_is_locked(&dbg_master_lock)) {
+ kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
+ atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+ if (!kgdb_io_ready(0) || !send_ready)
+ return 1;
+
+ if (kgdb_info[cpu].enter_kgdb == 0) {
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->ex_vector = trapnr;
+ ks->signo = SIGTRAP;
+ ks->err_code = err_code;
+ ks->linux_regs = regs;
+ ks->send_ready = send_ready;
+ kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+static void kgdb_console_write(struct console *co, const char *s,
+ unsigned count)
+{
+ unsigned long flags;
+
+ /* If we're debugging, or KGDB has not connected, don't try
+ * and print. */
+ if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
+ return;
+
+ local_irq_save(flags);
+ gdbstub_msg_write(s, count);
+ local_irq_restore(flags);
+}
+
+static struct console kgdbcons = {
+ .name = "kgdb",
+ .write = kgdb_console_write,
+ .flags = CON_PRINTBUFFER | CON_ENABLED,
+ .index = -1,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void sysrq_handle_dbg(int key)
+{
+ if (!dbg_io_ops) {
+ printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ return;
+ }
+ if (!kgdb_connected) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+#else
+ printk(KERN_CRIT "Entering KGDB\n");
+#endif
+ }
+
+ kgdb_breakpoint();
+}
+
+static struct sysrq_key_op sysrq_dbg_op = {
+ .handler = sysrq_handle_dbg,
+ .help_msg = "debug(g)",
+ .action_msg = "DEBUG",
+};
+#endif
+
+static int kgdb_panic_event(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (dbg_kdb_mode)
+ kdb_printf("PANIC: %s\n", (char *)data);
+ kgdb_breakpoint();
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kgdb_panic_event_nb = {
+ .notifier_call = kgdb_panic_event,
+ .priority = INT_MAX,
+};
+
+void __weak kgdb_arch_late(void)
+{
+}
+
+void __init dbg_late_init(void)
+{
+ dbg_is_early = false;
+ if (kgdb_io_module_registered)
+ kgdb_arch_late();
+ kdb_init(KDB_INIT_FULL);
+}
+
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+ /*
+ * Take the following action on reboot notify depending on value:
+ * 1 == Enter debugger
+ * 0 == [the default] detatch debug client
+ * -1 == Do nothing... and use this until the board resets
+ */
+ switch (kgdbreboot) {
+ case 1:
+ kgdb_breakpoint();
+ case -1:
+ goto done;
+ }
+ if (!dbg_kdb_mode)
+ gdbstub_exit(code);
+done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+ .notifier_call = dbg_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX,
+};
+
+static void kgdb_register_callbacks(void)
+{
+ if (!kgdb_io_module_registered) {
+ kgdb_io_module_registered = 1;
+ kgdb_arch_init();
+ if (!dbg_is_early)
+ kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
+ register_reboot_notifier(&dbg_reboot_notifier);
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+#ifdef CONFIG_MAGIC_SYSRQ
+ register_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_use_con && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+ }
+}
+
+static void kgdb_unregister_callbacks(void)
+{
+ /*
+ * When this routine is called KGDB should unregister from the
+ * panic handler and clean up, making sure it is not handling any
+ * break exceptions at the time.
+ */
+ if (kgdb_io_module_registered) {
+ kgdb_io_module_registered = 0;
+ unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+ kgdb_arch_exit();
+#ifdef CONFIG_MAGIC_SYSRQ
+ unregister_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_con_registered) {
+ unregister_console(&kgdbcons);
+ kgdb_con_registered = 0;
+ }
+ }
+}
+
+/*
+ * There are times a tasklet needs to be used vs a compiled in
+ * break point so as to cause an exception outside a kgdb I/O module,
+ * such as is the case with kgdboe, where calling a breakpoint in the
+ * I/O driver itself would be fatal.
+ */
+static void kgdb_tasklet_bpt(unsigned long ing)
+{
+ kgdb_breakpoint();
+ atomic_set(&kgdb_break_tasklet_var, 0);
+}
+
+static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+
+void kgdb_schedule_breakpoint(void)
+{
+ if (atomic_read(&kgdb_break_tasklet_var) ||
+ atomic_read(&kgdb_active) != -1 ||
+ atomic_read(&kgdb_setting_breakpoint))
+ return;
+ atomic_inc(&kgdb_break_tasklet_var);
+ tasklet_schedule(&kgdb_tasklet_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
+
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
+/**
+ * kgdb_register_io_module - register KGDB IO module
+ * @new_dbg_io_ops: the io ops vector
+ *
+ * Register it with the KGDB core.
+ */
+int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
+{
+ int err;
+
+ spin_lock(&kgdb_registration_lock);
+
+ if (dbg_io_ops) {
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_ERR "kgdb: Another I/O driver is already "
+ "registered with KGDB.\n");
+ return -EBUSY;
+ }
+
+ if (new_dbg_io_ops->init) {
+ err = new_dbg_io_ops->init();
+ if (err) {
+ spin_unlock(&kgdb_registration_lock);
+ return err;
+ }
+ }
+
+ dbg_io_ops = new_dbg_io_ops;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
+ new_dbg_io_ops->name);
+
+ /* Arm KGDB now. */
+ kgdb_register_callbacks();
+
+ if (kgdb_break_asap)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kgdb_register_io_module);
+
+/**
+ * kkgdb_unregister_io_module - unregister KGDB IO module
+ * @old_dbg_io_ops: the io ops vector
+ *
+ * Unregister it with the KGDB core.
+ */
+void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
+{
+ BUG_ON(kgdb_connected);
+
+ /*
+ * KGDB is no longer able to communicate out, so
+ * unregister our callbacks and reset state.
+ */
+ kgdb_unregister_callbacks();
+
+ spin_lock(&kgdb_registration_lock);
+
+ WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
+ dbg_io_ops = NULL;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO
+ "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ old_dbg_io_ops->name);
+}
+EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
+
+int dbg_io_get_char(void)
+{
+ int ret = dbg_io_ops->read_char();
+ if (ret == NO_POLL_CHAR)
+ return -1;
+ if (!dbg_kdb_mode)
+ return ret;
+ if (ret == 127)
+ return 8;
+ return ret;
+}
+
+/**
+ * kgdb_breakpoint - generate breakpoint exception
+ *
+ * This function will generate a breakpoint exception. It is used at the
+ * beginning of a program to sync up with a debugger and can be used
+ * otherwise as a quick means to stop program execution and "break" into
+ * the debugger.
+ */
+noinline void kgdb_breakpoint(void)
+{
+ atomic_inc(&kgdb_setting_breakpoint);
+ wmb(); /* Sync point before breakpoint */
+ arch_kgdb_breakpoint();
+ wmb(); /* Sync point after breakpoint */
+ atomic_dec(&kgdb_setting_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_breakpoint);
+
+static int __init opt_kgdb_wait(char *str)
+{
+ kgdb_break_asap = 1;
+
+ kdb_init(KDB_INIT_EARLY);
+ if (kgdb_io_module_registered)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+
+early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 00000000000..127d9bc49fb
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,85 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _DEBUG_CORE_H_
+#define _DEBUG_CORE_H_
+/*
+ * These are the private implementation headers between the kernel
+ * debugger core and the debugger front end code.
+ */
+
+/* kernel debug core data structures */
+struct kgdb_state {
+ int ex_vector;
+ int signo;
+ int err_code;
+ int cpu;
+ int pass_exception;
+ unsigned long thr_query;
+ unsigned long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+ atomic_t *send_ready;
+};
+
+/* Exception state values */
+#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
+#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
+#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
+#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+
+struct debuggerinfo_struct {
+ void *debuggerinfo;
+ struct task_struct *task;
+ int exception_state;
+ int ret_state;
+ int irq_depth;
+ int enter_kgdb;
+};
+
+extern struct debuggerinfo_struct kgdb_info[];
+
+/* kernel debug core break point routines */
+extern int dbg_remove_all_break(void);
+extern int dbg_set_sw_break(unsigned long addr);
+extern int dbg_remove_sw_break(unsigned long addr);
+extern int dbg_activate_sw_breakpoints(void);
+extern int dbg_deactivate_sw_breakpoints(void);
+
+/* polled character access to i/o module */
+extern int dbg_io_get_char(void);
+
+/* stub return value for switching between the gdbstub and kdb */
+#define DBG_PASS_EVENT -12345
+/* Switch from one cpu to another */
+#define DBG_SWITCH_CPU_EVENT -123456
+extern int dbg_switch_cpu;
+
+/* gdbstub interface functions */
+extern int gdb_serial_stub(struct kgdb_state *ks);
+extern void gdbstub_msg_write(const char *s, int len);
+
+/* gdbstub functions used for kdb <-> gdbstub transition */
+extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
+extern int dbg_kdb_mode;
+
+#ifdef CONFIG_KGDB_KDB
+extern int kdb_stub(struct kgdb_state *ks);
+extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
+#else /* ! CONFIG_KGDB_KDB */
+static inline int kdb_stub(struct kgdb_state *ks)
+{
+ return DBG_PASS_EVENT;
+}
+#endif /* CONFIG_KGDB_KDB */
+
+#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 00000000000..19d9a578c75
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1145 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/serial_core.h>
+#include <linux/reboot.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/unaligned.h>
+#include "debug_core.h"
+
+#define KGDB_MAX_THREAD_QUERY 17
+
+/* Our I/O buffers. */
+static char remcom_in_buffer[BUFMAX];
+static char remcom_out_buffer[BUFMAX];
+static int gdbstub_use_prev_in_buf;
+static int gdbstub_prev_in_buf_pos;
+
+/* Storage for the registers, in GDB format. */
+static unsigned long gdb_regs[(NUMREGBYTES +
+ sizeof(unsigned long) - 1) /
+ sizeof(unsigned long)];
+
+/*
+ * GDB remote protocol parser:
+ */
+
+#ifdef CONFIG_KGDB_KDB
+static int gdbstub_read_wait(void)
+{
+ int ret = -1;
+ int i;
+
+ if (unlikely(gdbstub_use_prev_in_buf)) {
+ if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
+ return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
+ else
+ gdbstub_use_prev_in_buf = 0;
+ }
+
+ /* poll any additional I/O interfaces that are defined */
+ while (ret < 0)
+ for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
+ ret = kdb_poll_funcs[i]();
+ if (ret > 0)
+ break;
+ }
+ return ret;
+}
+#else
+static int gdbstub_read_wait(void)
+{
+ int ret = dbg_io_ops->read_char();
+ while (ret == NO_POLL_CHAR)
+ ret = dbg_io_ops->read_char();
+ return ret;
+}
+#endif
+/* scan for the sequence $<data>#<checksum> */
+static void get_packet(char *buffer)
+{
+ unsigned char checksum;
+ unsigned char xmitcsum;
+ int count;
+ char ch;
+
+ do {
+ /*
+ * Spin and wait around for the start character, ignore all
+ * other characters:
+ */
+ while ((ch = (gdbstub_read_wait())) != '$')
+ /* nothing */;
+
+ kgdb_connected = 1;
+ checksum = 0;
+ xmitcsum = -1;
+
+ count = 0;
+
+ /*
+ * now, read until a # or end of buffer is found:
+ */
+ while (count < (BUFMAX - 1)) {
+ ch = gdbstub_read_wait();
+ if (ch == '#')
+ break;
+ checksum = checksum + ch;
+ buffer[count] = ch;
+ count = count + 1;
+ }
+
+ if (ch == '#') {
+ xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
+ xmitcsum += hex_to_bin(gdbstub_read_wait());
+
+ if (checksum != xmitcsum)
+ /* failed checksum */
+ dbg_io_ops->write_char('-');
+ else
+ /* successful transfer */
+ dbg_io_ops->write_char('+');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ }
+ buffer[count] = 0;
+ } while (checksum != xmitcsum);
+}
+
+/*
+ * Send the packet in buffer.
+ * Check for gdb connection if asked for.
+ */
+static void put_packet(char *buffer)
+{
+ unsigned char checksum;
+ int count;
+ char ch;
+
+ /*
+ * $<packet info>#<checksum>.
+ */
+ while (1) {
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+ count = 0;
+
+ while ((ch = buffer[count])) {
+ dbg_io_ops->write_char(ch);
+ checksum += ch;
+ count++;
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+
+ /* Now see what we get in reply. */
+ ch = gdbstub_read_wait();
+
+ if (ch == 3)
+ ch = gdbstub_read_wait();
+
+ /* If we get an ACK, we are done. */
+ if (ch == '+')
+ return;
+
+ /*
+ * If we get the start of another packet, this means
+ * that GDB is attempting to reconnect. We will NAK
+ * the packet being sent, and stop trying to send this
+ * packet.
+ */
+ if (ch == '$') {
+ dbg_io_ops->write_char('-');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ return;
+ }
+ }
+}
+
+static char gdbmsgbuf[BUFMAX + 1];
+
+void gdbstub_msg_write(const char *s, int len)
+{
+ char *bufptr;
+ int wcount;
+ int i;
+
+ if (len == 0)
+ len = strlen(s);
+
+ /* 'O'utput */
+ gdbmsgbuf[0] = 'O';
+
+ /* Fill and send buffers... */
+ while (len > 0) {
+ bufptr = gdbmsgbuf + 1;
+
+ /* Calculate how many this time */
+ if ((len << 1) > (BUFMAX - 2))
+ wcount = (BUFMAX - 2) >> 1;
+ else
+ wcount = len;
+
+ /* Pack in hex chars */
+ for (i = 0; i < wcount; i++)
+ bufptr = hex_byte_pack(bufptr, s[i]);
+ *bufptr = '\0';
+
+ /* Move up */
+ s += wcount;
+ len -= wcount;
+
+ /* Write packet */
+ put_packet(gdbmsgbuf);
+ }
+}
+
+/*
+ * Convert the memory pointed to by mem into hex, placing result in
+ * buf. Return a pointer to the last char put in buf (null). May
+ * return an error.
+ */
+char *kgdb_mem2hex(char *mem, char *buf, int count)
+{
+ char *tmp;
+ int err;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory copy. Hex conversion will work against this one.
+ */
+ tmp = buf + count;
+
+ err = probe_kernel_read(tmp, mem, count);
+ if (err)
+ return NULL;
+ while (count > 0) {
+ buf = hex_byte_pack(buf, *tmp);
+ tmp++;
+ count--;
+ }
+ *buf = 0;
+
+ return buf;
+}
+
+/*
+ * Convert the hex array pointed to by buf into binary to be placed in
+ * mem. Return a pointer to the character AFTER the last byte
+ * written. May return an error.
+ */
+int kgdb_hex2mem(char *buf, char *mem, int count)
+{
+ char *tmp_raw;
+ char *tmp_hex;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory that is converted from hex.
+ */
+ tmp_raw = buf + count * 2;
+
+ tmp_hex = tmp_raw - 1;
+ while (tmp_hex >= buf) {
+ tmp_raw--;
+ *tmp_raw = hex_to_bin(*tmp_hex--);
+ *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
+ }
+
+ return probe_kernel_write(mem, tmp_raw, count);
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
+{
+ int hex_val;
+ int num = 0;
+ int negate = 0;
+
+ *long_val = 0;
+
+ if (**ptr == '-') {
+ negate = 1;
+ (*ptr)++;
+ }
+ while (**ptr) {
+ hex_val = hex_to_bin(**ptr);
+ if (hex_val < 0)
+ break;
+
+ *long_val = (*long_val << 4) | hex_val;
+ num++;
+ (*ptr)++;
+ }
+
+ if (negate)
+ *long_val = -*long_val;
+
+ return num;
+}
+
+/*
+ * Copy the binary array pointed to by buf into mem. Fix $, #, and
+ * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
+ * The input buf is overwitten with the result to write to mem.
+ */
+static int kgdb_ebin2mem(char *buf, char *mem, int count)
+{
+ int size = 0;
+ char *c = buf;
+
+ while (count-- > 0) {
+ c[size] = *buf++;
+ if (c[size] == 0x7d)
+ c[size] = *buf++ ^ 0x20;
+ size++;
+ }
+
+ return probe_kernel_write(mem, c, size);
+}
+
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_get_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_set_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Write memory due to an 'M' or 'X' packet. */
+static int write_mem_msg(int binary)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long addr;
+ unsigned long length;
+ int err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
+ if (binary)
+ err = kgdb_ebin2mem(ptr, (char *)addr, length);
+ else
+ err = kgdb_hex2mem(ptr, (char *)addr, length);
+ if (err)
+ return err;
+ if (CACHE_FLUSH_IS_SAFE)
+ flush_icache_range(addr, addr + length);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void error_packet(char *pkt, int error)
+{
+ error = -error;
+ pkt[0] = 'E';
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
+ pkt[3] = '\0';
+}
+
+/*
+ * Thread ID accessors. We represent a flat TID space to GDB, where
+ * the per CPU idle threads (which under Linux all have PID 0) are
+ * remapped to negative TIDs.
+ */
+
+#define BUF_THREAD_ID_SIZE 8
+
+static char *pack_threadid(char *pkt, unsigned char *id)
+{
+ unsigned char *limit;
+ int lzero = 1;
+
+ limit = id + (BUF_THREAD_ID_SIZE / 2);
+ while (id < limit) {
+ if (!lzero || *id != 0) {
+ pkt = hex_byte_pack(pkt, *id);
+ lzero = 0;
+ }
+ id++;
+ }
+
+ if (lzero)
+ pkt = hex_byte_pack(pkt, 0);
+
+ return pkt;
+}
+
+static void int_to_threadref(unsigned char *id, int value)
+{
+ put_unaligned_be32(value, id);
+}
+
+static struct task_struct *getthread(struct pt_regs *regs, int tid)
+{
+ /*
+ * Non-positive TIDs are remapped to the cpu shadow information
+ */
+ if (tid == 0 || tid == -1)
+ tid = -atomic_read(&kgdb_active) - 2;
+ if (tid < -1 && tid > -NR_CPUS - 2) {
+ if (kgdb_info[-tid - 2].task)
+ return kgdb_info[-tid - 2].task;
+ else
+ return idle_task(-tid - 2);
+ }
+ if (tid <= 0) {
+ printk(KERN_ERR "KGDB: Internal thread select error\n");
+ dump_stack();
+ return NULL;
+ }
+
+ /*
+ * find_task_by_pid_ns() does not take the tasklist lock anymore
+ * but is nicely RCU locked - hence is a pretty resilient
+ * thing to use:
+ */
+ return find_task_by_pid_ns(tid, &init_pid_ns);
+}
+
+
+/*
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
+ */
+static inline int shadow_pid(int realpid)
+{
+ if (realpid)
+ return realpid;
+
+ return -raw_smp_processor_id() - 2;
+}
+
+/*
+ * All the functions that start with gdb_cmd are the various
+ * operations to implement the handlers for the gdbserial protocol
+ * where KGDB is communicating with an external debugger
+ */
+
+/* Handle the '?' status packets */
+static void gdb_cmd_status(struct kgdb_state *ks)
+{
+ /*
+ * We know that this packet is only sent
+ * during initial connect. So to be safe,
+ * we clear out our breakpoints now in case
+ * GDB is reconnecting.
+ */
+ dbg_remove_all_break();
+
+ remcom_out_buffer[0] = 'S';
+ hex_byte_pack(&remcom_out_buffer[1], ks->signo);
+}
+
+static void gdb_get_regs_helper(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ void *local_debuggerinfo;
+ int i;
+
+ thread = kgdb_usethread;
+ if (!thread) {
+ thread = kgdb_info[ks->cpu].task;
+ local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
+ } else {
+ local_debuggerinfo = NULL;
+ for_each_online_cpu(i) {
+ /*
+ * Try to find the task on some other
+ * or possibly this node if we do not
+ * find the matching task then we try
+ * to approximate the results.
+ */
+ if (thread == kgdb_info[i].task)
+ local_debuggerinfo = kgdb_info[i].debuggerinfo;
+ }
+ }
+
+ /*
+ * All threads that don't have debuggerinfo should be
+ * in schedule() sleeping, since all other CPUs
+ * are in kgdb_wait, and thus have debuggerinfo.
+ */
+ if (local_debuggerinfo) {
+ pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
+ } else {
+ /*
+ * Pull stuff saved during switch_to; nothing
+ * else is accessible (or even particularly
+ * relevant).
+ *
+ * This should be enough for a stack trace.
+ */
+ sleeping_thread_to_gdb_regs(gdb_regs, thread);
+ }
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+ gdb_get_regs_helper(ks);
+ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
+}
+
+/* Handle the 'G' set registers request */
+static void gdb_cmd_setregs(struct kgdb_state *ks)
+{
+ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
+
+ if (kgdb_usethread && kgdb_usethread != current) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+ }
+}
+
+/* Handle the 'm' memory read bytes */
+static void gdb_cmd_memread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long length;
+ unsigned long addr;
+ char *err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0) {
+ err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
+ if (!err)
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ error_packet(remcom_out_buffer, -EINVAL);
+ }
+}
+
+/* Handle the 'M' memory write bytes */
+static void gdb_cmd_memwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(0);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+ int i;
+ int offset = 0;
+
+ for (i = 0; i < regnum; i++)
+ offset += dbg_reg_def[i].size;
+ return kgdb_mem2hex((char *)gdb_regs + offset, out,
+ dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (regnum >= DBG_MAX_REG_NUM) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ gdb_get_regs_helper(ks);
+ gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+ int i = 0;
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (*ptr++ != '=' ||
+ !(!kgdb_usethread || kgdb_usethread == current) ||
+ !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ memset(gdb_regs, 0, sizeof(gdb_regs));
+ while (i < sizeof(gdb_regs) * 2)
+ if (hex_to_bin(ptr[i]) >= 0)
+ i++;
+ else
+ break;
+ i = i / 2;
+ kgdb_hex2mem(ptr, (char *)gdb_regs, i);
+ dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Handle the 'X' memory binary write bytes */
+static void gdb_cmd_binwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(1);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+/* Handle the 'D' or 'k', detach or kill packets */
+static void gdb_cmd_detachkill(struct kgdb_state *ks)
+{
+ int error;
+
+ /* The detach case */
+ if (remcom_in_buffer[0] == 'D') {
+ error = dbg_remove_all_break();
+ if (error < 0) {
+ error_packet(remcom_out_buffer, error);
+ } else {
+ strcpy(remcom_out_buffer, "OK");
+ kgdb_connected = 0;
+ }
+ put_packet(remcom_out_buffer);
+ } else {
+ /*
+ * Assume the kill case, with no exit code checking,
+ * trying to force detach the debugger:
+ */
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ }
+}
+
+/* Handle the 'R' reboot packets */
+static int gdb_cmd_reboot(struct kgdb_state *ks)
+{
+ /* For now, only honor R0 */
+ if (strcmp(remcom_in_buffer, "R0") == 0) {
+ printk(KERN_CRIT "Executing emergency reboot\n");
+ strcpy(remcom_out_buffer, "OK");
+ put_packet(remcom_out_buffer);
+
+ /*
+ * Execution should not return from
+ * machine_emergency_restart()
+ */
+ machine_emergency_restart();
+ kgdb_connected = 0;
+
+ return 1;
+ }
+ return 0;
+}
+
+/* Handle the 'q' query packets */
+static void gdb_cmd_query(struct kgdb_state *ks)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+ int i;
+ int cpu;
+ int finished = 0;
+
+ switch (remcom_in_buffer[1]) {
+ case 's':
+ case 'f':
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
+ break;
+
+ i = 0;
+ remcom_out_buffer[0] = 'm';
+ ptr = remcom_out_buffer + 1;
+ if (remcom_in_buffer[1] == 'f') {
+ /* Each cpu is a shadow thread */
+ for_each_online_cpu(cpu) {
+ ks->thr_query = 0;
+ int_to_threadref(thref, -cpu - 2);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ i++;
+ }
+ }
+
+ do_each_thread(g, p) {
+ if (i >= ks->thr_query && !finished) {
+ int_to_threadref(thref, p->pid);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ ks->thr_query++;
+ if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+ finished = 1;
+ }
+ i++;
+ } while_each_thread(g, p);
+
+ *(--ptr) = '\0';
+ break;
+
+ case 'C':
+ /* Current thread id */
+ strcpy(remcom_out_buffer, "QC");
+ ks->threadid = shadow_pid(current->pid);
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(remcom_out_buffer + 2, thref);
+ break;
+ case 'T':
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
+ break;
+
+ ks->threadid = 0;
+ ptr = remcom_in_buffer + 17;
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!getthread(ks->linux_regs, ks->threadid)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ if ((int)ks->threadid > 0) {
+ kgdb_mem2hex(getthread(ks->linux_regs,
+ ks->threadid)->comm,
+ remcom_out_buffer, 16);
+ } else {
+ static char tmpstr[23 + BUF_THREAD_ID_SIZE];
+
+ sprintf(tmpstr, "shadowCPU%d",
+ (int)(-ks->threadid - 2));
+ kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
+ }
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case 'R':
+ if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
+ int len = strlen(remcom_in_buffer + 6);
+
+ if ((len % 2) != 0) {
+ strcpy(remcom_out_buffer, "E01");
+ break;
+ }
+ kgdb_hex2mem(remcom_in_buffer + 6,
+ remcom_out_buffer, len);
+ len = len / 2;
+ remcom_out_buffer[len++] = 0;
+
+ kdb_common_init_state(ks);
+ kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
+ strcpy(remcom_out_buffer, "OK");
+ }
+ break;
+#endif
+ }
+}
+
+/* Handle the 'H' task query packets */
+static void gdb_cmd_task(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ char *ptr;
+
+ switch (remcom_in_buffer[1]) {
+ case 'g':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_usethread = thread;
+ ks->kgdb_usethreadid = ks->threadid;
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ case 'c':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!ks->threadid) {
+ kgdb_contthread = NULL;
+ } else {
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_contthread = thread;
+ }
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ }
+}
+
+/* Handle the 'T' thread query packets */
+static void gdb_cmd_thread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ struct task_struct *thread;
+
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (thread)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, -EINVAL);
+}
+
+/* Handle the 'z' or 'Z' breakpoint remove or set packets */
+static void gdb_cmd_break(struct kgdb_state *ks)
+{
+ /*
+ * Since GDB-5.3, it's been drafted that '0' is a software
+ * breakpoint, '1' is a hardware breakpoint, so let's do that.
+ */
+ char *bpt_type = &remcom_in_buffer[1];
+ char *ptr = &remcom_in_buffer[2];
+ unsigned long addr;
+ unsigned long length;
+ int error = 0;
+
+ if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
+ /* Unsupported */
+ if (*bpt_type > '4')
+ return;
+ } else {
+ if (*bpt_type != '0' && *bpt_type != '1')
+ /* Unsupported. */
+ return;
+ }
+
+ /*
+ * Test if this is a hardware breakpoint, and
+ * if we support it:
+ */
+ if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
+ /* Unsupported. */
+ return;
+
+ if (*(ptr++) != ',') {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (!kgdb_hex2long(&ptr, &addr)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (*(ptr++) != ',' ||
+ !kgdb_hex2long(&ptr, &length)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+
+ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
+ error = dbg_set_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
+ error = dbg_remove_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'Z')
+ error = arch_kgdb_ops.set_hw_breakpoint(addr,
+ (int)length, *bpt_type - '0');
+ else if (remcom_in_buffer[0] == 'z')
+ error = arch_kgdb_ops.remove_hw_breakpoint(addr,
+ (int) length, *bpt_type - '0');
+
+ if (error == 0)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, error);
+}
+
+/* Handle the 'C' signal / exception passing packets */
+static int gdb_cmd_exception_pass(struct kgdb_state *ks)
+{
+ /* C09 == pass exception
+ * C15 == detach kgdb, pass exception
+ */
+ if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'c';
+
+ } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'D';
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ return 1;
+
+ } else {
+ gdbstub_msg_write("KGDB only knows signal 9 (pass)"
+ " and 15 (pass and disconnect)\n"
+ "Executing a continue without signal passing\n", 0);
+ remcom_in_buffer[0] = 'c';
+ }
+
+ /* Indicate fall through */
+ return -1;
+}
+
+/*
+ * This function performs all gdbserial command procesing
+ */
+int gdb_serial_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ int tmp;
+
+ /* Initialize comm buffer and globals. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
+
+ if (kgdb_connected) {
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+
+ /* Reply to host that an exception has occurred */
+ ptr = remcom_out_buffer;
+ *ptr++ = 'T';
+ ptr = hex_byte_pack(ptr, ks->signo);
+ ptr += strlen(strcpy(ptr, "thread:"));
+ int_to_threadref(thref, shadow_pid(current->pid));
+ ptr = pack_threadid(ptr, thref);
+ *ptr++ = ';';
+ put_packet(remcom_out_buffer);
+ }
+
+ while (1) {
+ error = 0;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ get_packet(remcom_in_buffer);
+
+ switch (remcom_in_buffer[0]) {
+ case '?': /* gdbserial status */
+ gdb_cmd_status(ks);
+ break;
+ case 'g': /* return the value of the CPU registers */
+ gdb_cmd_getregs(ks);
+ break;
+ case 'G': /* set the value of the CPU registers - return OK */
+ gdb_cmd_setregs(ks);
+ break;
+ case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
+ gdb_cmd_memread(ks);
+ break;
+ case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_memwrite(ks);
+ break;
+#if DBG_MAX_REG_NUM > 0
+ case 'p': /* pXX Return gdb register XX (in hex) */
+ gdb_cmd_reg_get(ks);
+ break;
+ case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+ gdb_cmd_reg_set(ks);
+ break;
+#endif /* DBG_MAX_REG_NUM > 0 */
+ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_binwrite(ks);
+ break;
+ /* kill or detach. KGDB should treat this like a
+ * continue.
+ */
+ case 'D': /* Debugger detach */
+ case 'k': /* Debugger detach via kill */
+ gdb_cmd_detachkill(ks);
+ goto default_handle;
+ case 'R': /* Reboot */
+ if (gdb_cmd_reboot(ks))
+ goto default_handle;
+ break;
+ case 'q': /* query command */
+ gdb_cmd_query(ks);
+ break;
+ case 'H': /* task related */
+ gdb_cmd_task(ks);
+ break;
+ case 'T': /* Query thread status */
+ gdb_cmd_thread(ks);
+ break;
+ case 'z': /* Break point remove */
+ case 'Z': /* Break point set */
+ gdb_cmd_break(ks);
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case '3': /* Escape into back into kdb */
+ if (remcom_in_buffer[1] == '\0') {
+ gdb_cmd_detachkill(ks);
+ return DBG_PASS_EVENT;
+ }
+#endif
+ case 'C': /* Exception passing */
+ tmp = gdb_cmd_exception_pass(ks);
+ if (tmp > 0)
+ goto default_handle;
+ if (tmp == 0)
+ break;
+ /* Fall through on tmp < 0 */
+ case 'c': /* Continue packet */
+ case 's': /* Single step packet */
+ if (kgdb_contthread && kgdb_contthread != current) {
+ /* Can't switch threads in kgdb */
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ dbg_activate_sw_breakpoints();
+ /* Fall through to default processing */
+ default:
+default_handle:
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ /*
+ * Leave cmd processing on error, detach,
+ * kill, continue, or single step.
+ */
+ if (error >= 0 || remcom_in_buffer[0] == 'D' ||
+ remcom_in_buffer[0] == 'k') {
+ error = 0;
+ goto kgdb_exit;
+ }
+
+ }
+
+ /* reply to the request */
+ put_packet(remcom_out_buffer);
+ }
+
+kgdb_exit:
+ if (ks->pass_exception)
+ error = 1;
+ return error;
+}
+
+int gdbstub_state(struct kgdb_state *ks, char *cmd)
+{
+ int error;
+
+ switch (cmd[0]) {
+ case 'e':
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ return error;
+ case 's':
+ case 'c':
+ strcpy(remcom_in_buffer, cmd);
+ return 0;
+ case '$':
+ strcpy(remcom_in_buffer, cmd);
+ gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
+ gdbstub_prev_in_buf_pos = 0;
+ return 0;
+ }
+ dbg_io_ops->write_char('+');
+ put_packet(remcom_out_buffer);
+ return 0;
+}
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+ unsigned char checksum, ch, buffer[3];
+ int loop;
+
+ if (!kgdb_connected)
+ return;
+ kgdb_connected = 0;
+
+ if (!dbg_io_ops || dbg_kdb_mode)
+ return;
+
+ buffer[0] = 'W';
+ buffer[1] = hex_asc_hi(status);
+ buffer[2] = hex_asc_lo(status);
+
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+
+ for (loop = 0; loop < 3; loop++) {
+ ch = buffer[loop];
+ checksum += ch;
+ dbg_io_ops->write_char(ch);
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+ /* make sure the output is flushed, lest the bootloader clobber it */
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 00000000000..396d12eda9e
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
+gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 00000000000..d4fc58f4b88
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+#
+
+CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
+obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
+obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
+
+clean-files := gen-kdb_cmds.c
+
+quiet_cmd_gen-kdb = GENKDB $@
+ cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
+ /^\#/{next} \
+ /^[ \t]*$$/{next} \
+ {gsub(/"/, "\\\"", $$0); \
+ print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
+ END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
+ $(filter-out %/Makefile,$^) > $@#
+
+$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
+ $(call cmd,gen-kdb)
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 00000000000..70a504601dc
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,553 @@
+/*
+ * Kernel Debugger Architecture Independent Breakpoint Handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdb.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include "kdb_private.h"
+
+/*
+ * Table of kdb_breakpoints
+ */
+kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
+
+static void kdb_setsinglestep(struct pt_regs *regs)
+{
+ KDB_STATE_SET(DOING_SS);
+}
+
+static char *kdb_rwtypes[] = {
+ "Instruction(i)",
+ "Instruction(Register)",
+ "Data Write",
+ "I/O",
+ "Data Access"
+};
+
+static char *kdb_bptype(kdb_bp_t *bp)
+{
+ if (bp->bp_type < 0 || bp->bp_type > 4)
+ return "";
+
+ return kdb_rwtypes[bp->bp_type];
+}
+
+static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
+{
+ int nextarg = *nextargp;
+ int diag;
+
+ bp->bph_length = 1;
+ if ((argc + 1) != nextarg) {
+ if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
+ bp->bp_type = BP_ACCESS_WATCHPOINT;
+ else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
+ bp->bp_type = BP_WRITE_WATCHPOINT;
+ else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
+ bp->bp_type = BP_HARDWARE_BREAKPOINT;
+ else
+ return KDB_ARGCOUNT;
+
+ bp->bph_length = 1;
+
+ nextarg++;
+
+ if ((argc + 1) != nextarg) {
+ unsigned long len;
+
+ diag = kdbgetularg((char *)argv[nextarg],
+ &len);
+ if (diag)
+ return diag;
+
+
+ if (len > 8)
+ return KDB_BADLENGTH;
+
+ bp->bph_length = len;
+ nextarg++;
+ }
+
+ if ((argc + 1) != nextarg)
+ return KDB_ARGCOUNT;
+ }
+
+ *nextargp = nextarg;
+ return 0;
+}
+
+static int _kdb_bp_remove(kdb_bp_t *bp)
+{
+ int ret = 1;
+ if (!bp->bp_installed)
+ return ret;
+ if (!bp->bp_type)
+ ret = dbg_remove_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0)
+ bp->bp_installed = 0;
+ return ret;
+}
+
+static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ if (KDB_DEBUG(BP))
+ kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
+
+ /*
+ * Setup single step
+ */
+ kdb_setsinglestep(regs);
+
+ /*
+ * Reset delay attribute
+ */
+ bp->bp_delay = 0;
+ bp->bp_delayed = 1;
+}
+
+static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ int ret;
+ /*
+ * Install the breakpoint, if it is not already installed.
+ */
+
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: bp_installed %d\n",
+ __func__, bp->bp_installed);
+ if (!KDB_STATE(SSBPT))
+ bp->bp_delay = 0;
+ if (bp->bp_installed)
+ return 1;
+ if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: delayed bp\n", __func__);
+ kdb_handle_bp(regs, bp);
+ return 0;
+ }
+ if (!bp->bp_type)
+ ret = dbg_set_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0) {
+ bp->bp_installed = 1;
+ } else {
+ kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
+ __func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!bp->bp_type) {
+ kdb_printf("Software breakpoints are unavailable.\n"
+ " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " OR use hw breaks: help bph\n");
+ }
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_bp_install
+ *
+ * Install kdb_breakpoints prior to returning from the
+ * kernel debugger. This allows the kdb_breakpoints to be set
+ * upon functions that are used internally by kdb, such as
+ * printk(). This function is only called once per kdb session.
+ */
+void kdb_bp_install(struct pt_regs *regs)
+{
+ int i;
+
+ for (i = 0; i < KDB_MAXBPT; i++) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_install(regs, bp);
+ }
+}
+
+/*
+ * kdb_bp_remove
+ *
+ * Remove kdb_breakpoints upon entry to the kernel debugger.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+void kdb_bp_remove(void)
+{
+ int i;
+
+ for (i = KDB_MAXBPT - 1; i >= 0; i--) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_remove(bp);
+ }
+}
+
+
+/*
+ * kdb_printbp
+ *
+ * Internal function to format and print a breakpoint entry.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+static void kdb_printbp(kdb_bp_t *bp, int i)
+{
+ kdb_printf("%s ", kdb_bptype(bp));
+ kdb_printf("BP #%d at ", i);
+ kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
+
+ if (bp->bp_enabled)
+ kdb_printf("\n is enabled");
+ else
+ kdb_printf("\n is disabled");
+
+ kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ bp->bp_addr, bp->bp_type, bp->bp_installed);
+
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_bp
+ *
+ * Handle the bp commands.
+ *
+ * [bp|bph] <addr-expression> [DATAR|DATAW]
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * bp Set breakpoint on all cpus. Only use hardware assist if need.
+ * bph Set breakpoint on all cpus. Force hardware register
+ */
+
+static int kdb_bp(int argc, const char **argv)
+{
+ int i, bpno;
+ kdb_bp_t *bp, *bp_check;
+ int diag;
+ char *symname = NULL;
+ long offset = 0ul;
+ int nextarg;
+ kdb_bp_t template = {0};
+
+ if (argc == 0) {
+ /*
+ * Display breakpoint table
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
+ bpno++, bp++) {
+ if (bp->bp_free)
+ continue;
+ kdb_printbp(bp, bpno);
+ }
+
+ return 0;
+ }
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
+ &offset, &symname);
+ if (diag)
+ return diag;
+ if (!template.bp_addr)
+ return KDB_BADINT;
+
+ /*
+ * Find an empty bp structure to allocate
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
+ if (bp->bp_free)
+ break;
+ }
+
+ if (bpno == KDB_MAXBPT)
+ return KDB_TOOMANYBPT;
+
+ if (strcmp(argv[0], "bph") == 0) {
+ template.bp_type = BP_HARDWARE_BREAKPOINT;
+ diag = kdb_parsebp(argc, argv, &nextarg, &template);
+ if (diag)
+ return diag;
+ } else {
+ template.bp_type = BP_BREAKPOINT;
+ }
+
+ /*
+ * Check for clashing breakpoints.
+ *
+ * Note, in this design we can't have hardware breakpoints
+ * enabled for both read and write on the same address.
+ */
+ for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp_check++) {
+ if (!bp_check->bp_free &&
+ bp_check->bp_addr == template.bp_addr) {
+ kdb_printf("You already have a breakpoint at "
+ kdb_bfd_vma_fmt0 "\n", template.bp_addr);
+ return KDB_DUPBPT;
+ }
+ }
+
+ template.bp_enabled = 1;
+
+ /*
+ * Actually allocate the breakpoint found earlier
+ */
+ *bp = template;
+ bp->bp_free = 0;
+
+ kdb_printbp(bp, bpno);
+
+ return 0;
+}
+
+/*
+ * kdb_bc
+ *
+ * Handles the 'bc', 'be', and 'bd' commands
+ *
+ * [bd|bc|be] <breakpoint-number>
+ * [bd|bc|be] *
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic for failure
+ * Locking:
+ * None.
+ * Remarks:
+ */
+static int kdb_bc(int argc, const char **argv)
+{
+ unsigned long addr;
+ kdb_bp_t *bp = NULL;
+ int lowbp = KDB_MAXBPT;
+ int highbp = 0;
+ int done = 0;
+ int i;
+ int diag = 0;
+
+ int cmd; /* KDBCMD_B? */
+#define KDBCMD_BC 0
+#define KDBCMD_BE 1
+#define KDBCMD_BD 2
+
+ if (strcmp(argv[0], "be") == 0)
+ cmd = KDBCMD_BE;
+ else if (strcmp(argv[0], "bd") == 0)
+ cmd = KDBCMD_BD;
+ else
+ cmd = KDBCMD_BC;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ if (strcmp(argv[1], "*") == 0) {
+ lowbp = 0;
+ highbp = KDB_MAXBPT;
+ } else {
+ diag = kdbgetularg(argv[1], &addr);
+ if (diag)
+ return diag;
+
+ /*
+ * For addresses less than the maximum breakpoint number,
+ * assume that the breakpoint number is desired.
+ */
+ if (addr < KDB_MAXBPT) {
+ bp = &kdb_breakpoints[addr];
+ lowbp = highbp = addr;
+ highbp++;
+ } else {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp++) {
+ if (bp->bp_addr == addr) {
+ lowbp = highbp = i;
+ highbp++;
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Now operate on the set of breakpoints matching the input
+ * criteria (either '*' for all, or an individual breakpoint).
+ */
+ for (bp = &kdb_breakpoints[lowbp], i = lowbp;
+ i < highbp;
+ i++, bp++) {
+ if (bp->bp_free)
+ continue;
+
+ done++;
+
+ switch (cmd) {
+ case KDBCMD_BC:
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " cleared\n",
+ i, bp->bp_addr);
+
+ bp->bp_addr = 0;
+ bp->bp_free = 1;
+
+ break;
+ case KDBCMD_BE:
+ bp->bp_enabled = 1;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " enabled",
+ i, bp->bp_addr);
+
+ kdb_printf("\n");
+ break;
+ case KDBCMD_BD:
+ if (!bp->bp_enabled)
+ break;
+
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " disabled\n",
+ i, bp->bp_addr);
+
+ break;
+ }
+ if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
+ bp->bp_delay = 0;
+ KDB_STATE_CLEAR(SSBPT);
+ }
+ }
+
+ return (!done) ? KDB_BPTNOTFOUND : 0;
+}
+
+/*
+ * kdb_ss
+ *
+ * Process the 'ss' (Single Step) command.
+ *
+ * ss
+ *
+ * Parameters:
+ * argc Argument count
+ * argv Argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * KDB_CMD_SS for success, a kdb error if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * Set the arch specific option to trigger a debug trap after the next
+ * instruction.
+ */
+
+static int kdb_ss(int argc, const char **argv)
+{
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ /*
+ * Set trace flag and go.
+ */
+ KDB_STATE_SET(DOING_SS);
+ return KDB_CMD_SS;
+}
+
+/* Initialize the breakpoint table and register breakpoint commands. */
+
+void __init kdb_initbptab(void)
+{
+ int i;
+ kdb_bp_t *bp;
+
+ /*
+ * First time initialization.
+ */
+ memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
+ bp->bp_free = 1;
+
+ kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
+ kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0, KDB_REPEAT_NONE);
+
+ kdb_register_repeat("ss", kdb_ss, "",
+ "Single Step", 1, KDB_REPEAT_NO_ARGS);
+ /*
+ * Architecture dependent initialization.
+ */
+}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 00000000000..fe15fff5df5
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
+/*
+ * Kernel Debugger Architecture Independent Stack Traceback
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kdb.h>
+#include <linux/nmi.h>
+#include "kdb_private.h"
+
+
+static void kdb_show_stack(struct task_struct *p, void *addr)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_trap_printk++;
+ kdb_set_current_task(p);
+ if (addr) {
+ show_stack((struct task_struct *)p, addr);
+ } else if (kdb_current_regs) {
+#ifdef CONFIG_X86
+ show_stack(p, &kdb_current_regs->sp);
+#else
+ show_stack(p, NULL);
+#endif
+ } else {
+ show_stack(p, NULL);
+ }
+ console_loglevel = old_lvl;
+ kdb_trap_printk--;
+}
+
+/*
+ * kdb_bt
+ *
+ * This function implements the 'bt' command. Print a stack
+ * traceback.
+ *
+ * bt [<address-expression>] (addr-exp is for alternate stacks)
+ * btp <pid> Kernel stack for <pid>
+ * btt <address-expression> Kernel stack for task structure at
+ * <address-expression>
+ * bta [DRSTCZEUIMA] All useful processes, optionally
+ * filtered by state
+ * btc [<cpu>] The current process on one cpu,
+ * default is all cpus
+ *
+ * bt <address-expression> refers to a address on the stack, that location
+ * is assumed to contain a return address.
+ *
+ * btt <address-expression> refers to the address of a struct task.
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * Backtrack works best when the code uses frame pointers. But even
+ * without frame pointers we should get a reasonable trace.
+ *
+ * mds comes in handy when examining the stack to do a manual traceback or
+ * to get a starting point for bt <address-expression>.
+ */
+
+static int
+kdb_bt1(struct task_struct *p, unsigned long mask,
+ int argcount, int btaprompt)
+{
+ char buffer[2];
+ if (kdb_getarea(buffer[0], (unsigned long)p) ||
+ kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ return KDB_BADADDR;
+ if (!kdb_task_state(p, mask))
+ return 0;
+ kdb_printf("Stack traceback for pid %d\n", p->pid);
+ kdb_ps1(p);
+ kdb_show_stack(p, NULL);
+ if (btaprompt) {
+ kdb_getstr(buffer, sizeof(buffer),
+ "Enter <q> to end, <cr> to continue:");
+ if (buffer[0] == 'q') {
+ kdb_printf("\n");
+ return 1;
+ }
+ }
+ touch_nmi_watchdog();
+ return 0;
+}
+
+int
+kdb_bt(int argc, const char **argv)
+{
+ int diag;
+ int argcount = 5;
+ int btaprompt = 1;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+
+ /* Prompt after each proc in bta */
+ kdbgetintenv("BTAPROMPT", &btaprompt);
+
+ if (strcmp(argv[0], "bta") == 0) {
+ struct task_struct *g, *p;
+ unsigned long cpu;
+ unsigned long mask = kdb_task_state_string(argc ? argv[1] :
+ NULL);
+ if (argc == 0)
+ kdb_ps_suppressed();
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ }
+ /* Now the inactive tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (task_curr(p))
+ continue;
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ } kdb_while_each_thread(g, p);
+ } else if (strcmp(argv[0], "btp") == 0) {
+ struct task_struct *p;
+ unsigned long pid;
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &pid);
+ if (diag)
+ return diag;
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p) {
+ kdb_set_current_task(p);
+ return kdb_bt1(p, ~0UL, argcount, 0);
+ }
+ kdb_printf("No process with pid == %ld found\n", pid);
+ return 0;
+ } else if (strcmp(argv[0], "btt") == 0) {
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &addr);
+ if (diag)
+ return diag;
+ kdb_set_current_task((struct task_struct *)addr);
+ return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ } else if (strcmp(argv[0], "btc") == 0) {
+ unsigned long cpu = ~0;
+ struct task_struct *save_current_task = kdb_current_task;
+ char buf[80];
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+ if (argc == 1) {
+ diag = kdbgetularg((char *)argv[1], &cpu);
+ if (diag)
+ return diag;
+ }
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ if (cpu != ~0) {
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("no process for cpu %ld\n", cpu);
+ return 0;
+ }
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ return 0;
+ }
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ touch_nmi_watchdog();
+ }
+ kdb_set_current_task(save_current_task);
+ return 0;
+ } else {
+ if (argc) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ kdb_show_stack(kdb_current_task, (void *)addr);
+ return 0;
+ } else {
+ return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 00000000000..9834ad303ab
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,31 @@
+# Initial commands for kdb, alter to suit your needs.
+# These commands are executed in kdb_init() context, no SMP, no
+# processes. Commands that require process data (including stack or
+# registers) are not reliable this early. set and bp commands should
+# be safe. Global breakpoint commands affect each cpu as it is booted.
+
+# Standard debugging information for first level support, just type archkdb
+# or archkdbcpu or archkdbshort at the kdb prompt.
+
+defcmd dumpcommon "" "Common kdb debugging"
+ set BTAPROMPT 0
+ set LINES 10000
+ -summary
+ -cpu
+ -ps
+ -dmesg 600
+ -bt
+endefcmd
+
+defcmd dumpall "" "First line debugging"
+ pid R
+ -dumpcommon
+ -bta
+endefcmd
+
+defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
+ pid R
+ -dumpcommon
+ -btc
+endefcmd
+
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 00000000000..8859ca34dcf
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,182 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include "kdb_private.h"
+#include "../debug_core.h"
+
+/*
+ * KDB interface to KGDB internals
+ */
+get_char_func kdb_poll_funcs[] = {
+ dbg_io_get_char,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(kdb_poll_funcs);
+
+int kdb_poll_idx = 1;
+EXPORT_SYMBOL_GPL(kdb_poll_idx);
+
+static struct kgdb_state *kdb_ks;
+
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
+int kdb_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ kdb_bp_t *bp;
+ unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ kdb_reason_t reason = KDB_REASON_OOPS;
+ kdb_dbtrap_t db_result = KDB_DB_NOBPT;
+ int i;
+
+ kdb_ks = ks;
+ if (KDB_STATE(REENTRY)) {
+ reason = KDB_REASON_SWITCH;
+ KDB_STATE_CLEAR(REENTRY);
+ addr = instruction_pointer(ks->linux_regs);
+ }
+ ks->pass_exception = 0;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ reason = KDB_REASON_KEYBOARD;
+
+ if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+ reason = KDB_REASON_SYSTEM_NMI;
+
+ else if (in_nmi())
+ reason = KDB_REASON_NMI;
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ if (addr != instruction_pointer(ks->linux_regs))
+ kgdb_arch_set_pc(ks->linux_regs, addr);
+ break;
+ }
+ }
+ if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if (bp->bp_free)
+ continue;
+ if (bp->bp_addr == addr) {
+ bp->bp_delay = 1;
+ bp->bp_delayed = 1;
+ /*
+ * SSBPT is set when the kernel debugger must single step a
+ * task in order to re-establish an instruction breakpoint
+ * which uses the instruction replacement mechanism. It is
+ * cleared by any action that removes the need to single-step
+ * the breakpoint.
+ */
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ KDB_STATE_SET(SSBPT);
+ break;
+ }
+ }
+ }
+
+ if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
+ ks->signo == SIGTRAP) {
+ reason = KDB_REASON_SSTEP;
+ db_result = KDB_DB_BPT;
+ }
+ /* Set initial kdb state variables */
+ KDB_STATE_CLEAR(KGDB_TRANS);
+ kdb_common_init_state(ks);
+ /* Remove any breakpoints as needed by kdb and clear single step */
+ kdb_bp_remove();
+ KDB_STATE_CLEAR(DOING_SS);
+ KDB_STATE_SET(PAGER);
+ /* zero out any offline cpu data */
+ for_each_present_cpu(i) {
+ if (!cpu_online(i)) {
+ kgdb_info[i].debuggerinfo = NULL;
+ kgdb_info[i].task = NULL;
+ }
+ }
+ if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
+ ks->pass_exception = 1;
+ KDB_FLAG_SET(CATASTROPHIC);
+ }
+ if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
+ KDB_STATE_CLEAR(SSBPT);
+ KDB_STATE_CLEAR(DOING_SS);
+ } else {
+ /* Start kdb main loop */
+ error = kdb_main_loop(KDB_REASON_ENTER, reason,
+ ks->err_code, db_result, ks->linux_regs);
+ }
+ /*
+ * Upon exit from the kdb main loop setup break points and restart
+ * the system based on the requested continue state
+ */
+ kdb_common_deinit_state();
+ KDB_STATE_CLEAR(PAGER);
+ kdbnearsym_cleanup();
+ if (error == KDB_CMD_KGDB) {
+ if (KDB_STATE(DOING_KGDB))
+ KDB_STATE_CLEAR(DOING_KGDB);
+ return DBG_PASS_EVENT;
+ }
+ kdb_bp_install(ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ /* Set the exit state to a single step or a continue */
+ if (KDB_STATE(DOING_SS))
+ gdbstub_state(ks, "s");
+ else
+ gdbstub_state(ks, "c");
+
+ KDB_FLAG_CLEAR(CATASTROPHIC);
+
+ /* Invoke arch specific exception handling prior to system resume */
+ kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
+ if (ks->pass_exception)
+ kgdb_info[ks->cpu].ret_state = 1;
+ if (error == KDB_CMD_CPU) {
+ KDB_STATE_SET(REENTRY);
+ /*
+ * Force clear the single step bit because kdb emulates this
+ * differently vs the gdbstub
+ */
+ kgdb_single_step = 0;
+ dbg_deactivate_sw_breakpoints();
+ return DBG_SWITCH_CPU_EVENT;
+ }
+ return kgdb_info[ks->cpu].ret_state;
+}
+
+void kdb_gdb_state_pass(char *buf)
+{
+ gdbstub_state(kdb_ks, buf);
+}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 00000000000..7c70812caea
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,852 @@
+/*
+ * Kernel Debugger Architecture Independent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/console.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kallsyms.h>
+#include "kdb_private.h"
+
+#define CMD_BUFLEN 256
+char kdb_prompt_str[CMD_BUFLEN];
+
+int kdb_trap_printk;
+
+static int kgdb_transition_check(char *buffer)
+{
+ if (buffer[0] != '+' && buffer[0] != '$') {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ } else {
+ int slen = strlen(buffer);
+ if (slen > 3 && buffer[slen - 3] == '#') {
+ kdb_gdb_state_pass(buffer);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int kdb_read_get_key(char *buffer, size_t bufsize)
+{
+#define ESCAPE_UDELAY 1000
+#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
+ char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
+ char *ped = escape_data;
+ int escape_delay = 0;
+ get_char_func *f, *f_escape = NULL;
+ int key;
+
+ for (f = &kdb_poll_funcs[0]; ; ++f) {
+ if (*f == NULL) {
+ /* Reset NMI watchdog once per poll loop */
+ touch_nmi_watchdog();
+ f = &kdb_poll_funcs[0];
+ }
+ if (escape_delay == 2) {
+ *ped = '\0';
+ ped = escape_data;
+ --escape_delay;
+ }
+ if (escape_delay == 1) {
+ key = *ped++;
+ if (!*ped)
+ --escape_delay;
+ break;
+ }
+ key = (*f)();
+ if (key == -1) {
+ if (escape_delay) {
+ udelay(ESCAPE_UDELAY);
+ --escape_delay;
+ }
+ continue;
+ }
+ if (bufsize <= 2) {
+ if (key == '\r')
+ key = '\n';
+ *buffer++ = key;
+ *buffer = '\0';
+ return -1;
+ }
+ if (escape_delay == 0 && key == '\e') {
+ escape_delay = ESCAPE_DELAY;
+ ped = escape_data;
+ f_escape = f;
+ }
+ if (escape_delay) {
+ *ped++ = key;
+ if (f_escape != f) {
+ escape_delay = 2;
+ continue;
+ }
+ if (ped - escape_data == 1) {
+ /* \e */
+ continue;
+ } else if (ped - escape_data == 2) {
+ /* \e<something> */
+ if (key != '[')
+ escape_delay = 2;
+ continue;
+ } else if (ped - escape_data == 3) {
+ /* \e[<something> */
+ int mapkey = 0;
+ switch (key) {
+ case 'A': /* \e[A, up arrow */
+ mapkey = 16;
+ break;
+ case 'B': /* \e[B, down arrow */
+ mapkey = 14;
+ break;
+ case 'C': /* \e[C, right arrow */
+ mapkey = 6;
+ break;
+ case 'D': /* \e[D, left arrow */
+ mapkey = 2;
+ break;
+ case '1': /* dropthrough */
+ case '3': /* dropthrough */
+ /* \e[<1,3,4>], may be home, del, end */
+ case '4':
+ mapkey = -1;
+ break;
+ }
+ if (mapkey != -1) {
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ }
+ continue;
+ } else if (ped - escape_data == 4) {
+ /* \e[<1,3,4><something> */
+ int mapkey = 0;
+ if (key == '~') {
+ switch (escape_data[2]) {
+ case '1': /* \e[1~, home */
+ mapkey = 1;
+ break;
+ case '3': /* \e[3~, del */
+ mapkey = 4;
+ break;
+ case '4': /* \e[4~, end */
+ mapkey = 5;
+ break;
+ }
+ }
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ continue;
+ }
+ }
+ break; /* A key to process */
+ }
+ return key;
+}
+
+/*
+ * kdb_read
+ *
+ * This function reads a string of characters, terminated by
+ * a newline, or by reaching the end of the supplied buffer,
+ * from the current kernel debugger console device.
+ * Parameters:
+ * buffer - Address of character buffer to receive input characters.
+ * bufsize - size, in bytes, of the character buffer
+ * Returns:
+ * Returns a pointer to the buffer containing the received
+ * character string. This string will be terminated by a
+ * newline character.
+ * Locking:
+ * No locks are required to be held upon entry to this
+ * function. It is not reentrant - it relies on the fact
+ * that while kdb is running on only one "master debug" cpu.
+ * Remarks:
+ *
+ * The buffer size must be >= 2. A buffer size of 2 means that the caller only
+ * wants a single key.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off), by multiple input sources and by the need to sometimes
+ * return after just one key. Escape sequence processing has to be done as
+ * states in the polling loop.
+ */
+
+static char *kdb_read(char *buffer, size_t bufsize)
+{
+ char *cp = buffer;
+ char *bufend = buffer+bufsize-2; /* Reserve space for newline
+ * and null byte */
+ char *lastchar;
+ char *p_tmp;
+ char tmp;
+ static char tmpbuffer[CMD_BUFLEN];
+ int len = strlen(buffer);
+ int len_tmp;
+ int tab = 0;
+ int count;
+ int i;
+ int diag, dtab_count;
+ int key;
+
+
+ diag = kdbgetintenv("DTABCOUNT", &dtab_count);
+ if (diag)
+ dtab_count = 30;
+
+ if (len > 0) {
+ cp += len;
+ if (*(buffer+len-1) == '\n')
+ cp--;
+ }
+
+ lastchar = cp;
+ *cp = '\0';
+ kdb_printf("%s", buffer);
+poll_again:
+ key = kdb_read_get_key(buffer, bufsize);
+ if (key == -1)
+ return buffer;
+ if (key != 9)
+ tab = 0;
+ switch (key) {
+ case 8: /* backspace */
+ if (cp > buffer) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp-1, tmpbuffer, lastchar - cp);
+ }
+ *(--lastchar) = '\0';
+ --cp;
+ kdb_printf("\b%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 13: /* enter */
+ *lastchar++ = '\n';
+ *lastchar++ = '\0';
+ if (!KDB_STATE(KGDB_TRANS)) {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ }
+ kdb_printf("\n");
+ return buffer;
+ case 4: /* Del */
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
+ memcpy(cp, tmpbuffer, lastchar - cp - 1);
+ *(--lastchar) = '\0';
+ kdb_printf("%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 1: /* Home */
+ if (cp > buffer) {
+ kdb_printf("\r");
+ kdb_printf(kdb_prompt_str);
+ cp = buffer;
+ }
+ break;
+ case 5: /* End */
+ if (cp < lastchar) {
+ kdb_printf("%s", cp);
+ cp = lastchar;
+ }
+ break;
+ case 2: /* Left */
+ if (cp > buffer) {
+ kdb_printf("\b");
+ --cp;
+ }
+ break;
+ case 14: /* Down */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 6: /* Right */
+ if (cp < lastchar) {
+ kdb_printf("%c", *cp);
+ ++cp;
+ }
+ break;
+ case 16: /* Up */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 9: /* Tab */
+ if (tab < 2)
+ ++tab;
+ p_tmp = buffer;
+ while (*p_tmp == ' ')
+ p_tmp++;
+ if (p_tmp > cp)
+ break;
+ memcpy(tmpbuffer, p_tmp, cp-p_tmp);
+ *(tmpbuffer + (cp-p_tmp)) = '\0';
+ p_tmp = strrchr(tmpbuffer, ' ');
+ if (p_tmp)
+ ++p_tmp;
+ else
+ p_tmp = tmpbuffer;
+ len = strlen(p_tmp);
+ count = kallsyms_symbol_complete(p_tmp,
+ sizeof(tmpbuffer) -
+ (p_tmp - tmpbuffer));
+ if (tab == 2 && count > 0) {
+ kdb_printf("\n%d symbols are found.", count);
+ if (count > dtab_count) {
+ count = dtab_count;
+ kdb_printf(" But only first %d symbols will"
+ " be printed.\nYou can change the"
+ " environment variable DTABCOUNT.",
+ count);
+ }
+ kdb_printf("\n");
+ for (i = 0; i < count; i++) {
+ if (kallsyms_symbol_next(p_tmp, i) < 0)
+ break;
+ kdb_printf("%s ", p_tmp);
+ *(p_tmp + len) = '\0';
+ }
+ if (i >= dtab_count)
+ kdb_printf("...");
+ kdb_printf("\n");
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ } else if (tab != 2 && count > 0) {
+ len_tmp = strlen(p_tmp);
+ strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
+ len_tmp = strlen(p_tmp);
+ strncpy(cp, p_tmp+len, len_tmp-len + 1);
+ len = len_tmp - len;
+ kdb_printf("%s", cp);
+ cp += len;
+ lastchar += len;
+ }
+ kdb_nextline = 1; /* reset output line number */
+ break;
+ default:
+ if (key >= 32 && lastchar < bufend) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp+1, tmpbuffer, lastchar - cp);
+ *++lastchar = '\0';
+ *cp = key;
+ kdb_printf("%s\r", cp);
+ ++cp;
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ } else {
+ *++lastchar = '\0';
+ *cp++ = key;
+ /* The kgdb transition check will hide
+ * printed characters if we think that
+ * kgdb is connecting, until the check
+ * fails */
+ if (!KDB_STATE(KGDB_TRANS)) {
+ if (kgdb_transition_check(buffer))
+ return buffer;
+ } else {
+ kdb_printf("%c", key);
+ }
+ }
+ /* Special escape to kgdb */
+ if (lastchar - buffer >= 5 &&
+ strcmp(lastchar - 5, "$?#3f") == 0) {
+ kdb_gdb_state_pass(lastchar - 5);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ if (lastchar - buffer >= 11 &&
+ strcmp(lastchar - 11, "$qSupported") == 0) {
+ kdb_gdb_state_pass(lastchar - 11);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ }
+ break;
+ }
+ goto poll_again;
+}
+
+/*
+ * kdb_getstr
+ *
+ * Print the prompt string and read a command from the
+ * input device.
+ *
+ * Parameters:
+ * buffer Address of buffer to receive command
+ * bufsize Size of buffer in bytes
+ * prompt Pointer to string to use as prompt string
+ * Returns:
+ * Pointer to command buffer.
+ * Locking:
+ * None.
+ * Remarks:
+ * For SMP kernels, the processor number will be
+ * substituted for %d, %x or %o in the prompt.
+ */
+
+char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+{
+ if (prompt && kdb_prompt_str != prompt)
+ strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
+ kdb_printf(kdb_prompt_str);
+ kdb_nextline = 1; /* Prompt and input resets line number */
+ return kdb_read(buffer, bufsize);
+}
+
+/*
+ * kdb_input_flush
+ *
+ * Get rid of any buffered console input.
+ *
+ * Parameters:
+ * none
+ * Returns:
+ * nothing
+ * Locking:
+ * none
+ * Remarks:
+ * Call this function whenever you want to flush input. If there is any
+ * outstanding input, it ignores all characters until there has been no
+ * data for approximately 1ms.
+ */
+
+static void kdb_input_flush(void)
+{
+ get_char_func *f;
+ int res;
+ int flush_delay = 1;
+ while (flush_delay) {
+ flush_delay--;
+empty:
+ touch_nmi_watchdog();
+ for (f = &kdb_poll_funcs[0]; *f; ++f) {
+ res = (*f)();
+ if (res != -1) {
+ flush_delay = 1;
+ goto empty;
+ }
+ }
+ if (flush_delay)
+ mdelay(1);
+ }
+}
+
+/*
+ * kdb_printf
+ *
+ * Print a string to the output device(s).
+ *
+ * Parameters:
+ * printf-like format and optional args.
+ * Returns:
+ * 0
+ * Locking:
+ * None.
+ * Remarks:
+ * use 'kdbcons->write()' to avoid polluting 'log_buf' with
+ * kdb output.
+ *
+ * If the user is doing a cmd args | grep srch
+ * then kdb_grepping_flag is set.
+ * In that case we need to accumulate full lines (ending in \n) before
+ * searching for the pattern.
+ */
+
+static char kdb_buffer[256]; /* A bit too big to go on stack */
+static char *next_avail = kdb_buffer;
+static int size_avail;
+static int suspend_grep;
+
+/*
+ * search arg1 to see if it contains arg2
+ * (kdmain.c provides flags for ^pat and pat$)
+ *
+ * return 1 for found, 0 for not found
+ */
+static int kdb_search_string(char *searched, char *searchfor)
+{
+ char firstchar, *cp;
+ int len1, len2;
+
+ /* not counting the newline at the end of "searched" */
+ len1 = strlen(searched)-1;
+ len2 = strlen(searchfor);
+ if (len1 < len2)
+ return 0;
+ if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
+ return 0;
+ if (kdb_grep_leading) {
+ if (!strncmp(searched, searchfor, len2))
+ return 1;
+ } else if (kdb_grep_trailing) {
+ if (!strncmp(searched+len1-len2, searchfor, len2))
+ return 1;
+ } else {
+ firstchar = *searchfor;
+ cp = searched;
+ while ((cp = strchr(cp, firstchar))) {
+ if (!strncmp(cp, searchfor, len2))
+ return 1;
+ cp++;
+ }
+ }
+ return 0;
+}
+
+int vkdb_printf(const char *fmt, va_list ap)
+{
+ int diag;
+ int linecount;
+ int colcount;
+ int logging, saved_loglevel = 0;
+ int saved_trap_printk;
+ int got_printf_lock = 0;
+ int retlen = 0;
+ int fnd, len;
+ char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
+ char *moreprompt = "more> ";
+ struct console *c = console_drivers;
+ static DEFINE_SPINLOCK(kdb_printf_lock);
+ unsigned long uninitialized_var(flags);
+
+ preempt_disable();
+ saved_trap_printk = kdb_trap_printk;
+ kdb_trap_printk = 0;
+
+ /* Serialize kdb_printf if multiple cpus try to write at once.
+ * But if any cpu goes recursive in kdb, just print the output,
+ * even if it is interleaved with any other text.
+ */
+ if (!KDB_STATE(PRINTF_LOCK)) {
+ KDB_STATE_SET(PRINTF_LOCK);
+ spin_lock_irqsave(&kdb_printf_lock, flags);
+ got_printf_lock = 1;
+ atomic_inc(&kdb_event);
+ } else {
+ __acquire(kdb_printf_lock);
+ }
+
+ diag = kdbgetintenv("LINES", &linecount);
+ if (diag || linecount <= 1)
+ linecount = 24;
+
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (diag)
+ logging = 0;
+
+ if (!kdb_grepping_flag || suspend_grep) {
+ /* normally, every vsnprintf starts a new buffer */
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ }
+ vsnprintf(next_avail, size_avail, fmt, ap);
+
+ /*
+ * If kdb_parse() found that the command was cmd xxx | grep yyy
+ * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
+ *
+ * Accumulate the print data up to a newline before searching it.
+ * (vsnprintf does null-terminate the string that it generates)
+ */
+
+ /* skip the search if prints are temporarily unconditional */
+ if (!suspend_grep && kdb_grepping_flag) {
+ cp = strchr(kdb_buffer, '\n');
+ if (!cp) {
+ /*
+ * Special cases that don't end with newlines
+ * but should be written without one:
+ * The "[nn]kdb> " prompt should
+ * appear at the front of the buffer.
+ *
+ * The "[nn]more " prompt should also be
+ * (MOREPROMPT -> moreprompt)
+ * written * but we print that ourselves,
+ * we set the suspend_grep flag to make
+ * it unconditional.
+ *
+ */
+ if (next_avail == kdb_buffer) {
+ /*
+ * these should occur after a newline,
+ * so they will be at the front of the
+ * buffer
+ */
+ cp2 = kdb_buffer;
+ len = strlen(kdb_prompt_str);
+ if (!strncmp(cp2, kdb_prompt_str, len)) {
+ /*
+ * We're about to start a new
+ * command, so we can go back
+ * to normal mode.
+ */
+ kdb_grepping_flag = 0;
+ goto kdb_printit;
+ }
+ }
+ /* no newline; don't search/write the buffer
+ until one is there */
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+
+ /*
+ * The newline is present; print through it or discard
+ * it, depending on the results of the search.
+ */
+ cp++; /* to byte after the newline */
+ replaced_byte = *cp; /* remember what/where it was */
+ cphold = cp;
+ *cp = '\0'; /* end the string for our search */
+
+ /*
+ * We now have a newline at the end of the string
+ * Only continue with this output if it contains the
+ * search string.
+ */
+ fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
+ if (!fnd) {
+ /*
+ * At this point the complete line at the start
+ * of kdb_buffer can be discarded, as it does
+ * not contain what the user is looking for.
+ * Shift the buffer left.
+ */
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+ /*
+ * at this point the string is a full line and
+ * should be printed, up to the null.
+ */
+ }
+kdb_printit:
+
+ /*
+ * Write to all consoles.
+ */
+ retlen = strlen(kdb_buffer);
+ if (!dbg_kdb_mode && kgdb_connected) {
+ gdbstub_msg_write(kdb_buffer, retlen);
+ } else {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = retlen;
+ cp = kdb_buffer;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, kdb_buffer, retlen);
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+ }
+ if (logging) {
+ saved_loglevel = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_SILENT;
+ printk(KERN_INFO "%s", kdb_buffer);
+ }
+
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
+
+ /* check for having reached the LINES number of printed lines */
+ if (kdb_nextline >= linecount) {
+ char buf1[16] = "";
+
+ /* Watch out for recursion here. Any routine that calls
+ * kdb_printf will come back through here. And kdb_read
+ * uses kdb_printf to echo on serial consoles ...
+ */
+ kdb_nextline = 1; /* In case of recursion */
+
+ /*
+ * Pause until cr.
+ */
+ moreprompt = kdbgetenv("MOREPROMPT");
+ if (moreprompt == NULL)
+ moreprompt = "more> ";
+
+ kdb_input_flush();
+ c = console_drivers;
+
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = strlen(moreprompt);
+ cp = moreprompt;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, moreprompt, strlen(moreprompt));
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+
+ if (logging)
+ printk("%s", moreprompt);
+
+ kdb_read(buf1, 2); /* '2' indicates to return
+ * immediately after getting one key. */
+ kdb_nextline = 1; /* Really set output line 1 */
+
+ /* empty and reset the buffer: */
+ kdb_buffer[0] = '\0';
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ /* user hit q or Q */
+ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
+ KDB_STATE_CLEAR(PAGER);
+ /* end of command output; back to normal mode */
+ kdb_grepping_flag = 0;
+ kdb_printf("\n");
+ } else if (buf1[0] == ' ') {
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '\n') {
+ kdb_nextline = linecount - 1;
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] && buf1[0] != '\n') {
+ /* user hit something other than enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\nOnly 'q' or 'Q' are processed at more "
+ "prompt, input ignored\n");
+ } else if (kdb_grepping_flag) {
+ /* user hit enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\n");
+ }
+ kdb_input_flush();
+ }
+
+ /*
+ * For grep searches, shift the printed string left.
+ * replaced_byte contains the character that was overwritten with
+ * the terminating null, and cphold points to the null.
+ * Then adjust the notion of available space in the buffer.
+ */
+ if (kdb_grepping_flag && !suspend_grep) {
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ }
+
+kdb_print_out:
+ suspend_grep = 0; /* end of what may have been a recursive call */
+ if (logging)
+ console_loglevel = saved_loglevel;
+ if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
+ got_printf_lock = 0;
+ spin_unlock_irqrestore(&kdb_printf_lock, flags);
+ KDB_STATE_CLEAR(PRINTF_LOCK);
+ atomic_dec(&kdb_event);
+ } else {
+ __release(kdb_printf_lock);
+ }
+ kdb_trap_printk = saved_trap_printk;
+ preempt_enable();
+ return retlen;
+}
+
+int kdb_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int r;
+
+ va_start(ap, fmt);
+ r = vkdb_printf(fmt, ap);
+ va_end(ap);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 00000000000..118527aa60e
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,263 @@
+/*
+ * Kernel Debugger Architecture Dependent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kdb.h>
+#include <linux/keyboard.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/io.h>
+
+/* Keyboard Controller Registers on normal PCs. */
+
+#define KBD_STATUS_REG 0x64 /* Status register (R) */
+#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
+
+/* Status Register Bits */
+
+#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
+#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+
+static int kbd_exists;
+static int kbd_last_ret;
+
+/*
+ * Check if the keyboard controller has a keypress for us.
+ * Some parts (Enter Release, LED change) are still blocking polled here,
+ * but hopefully they are all short.
+ */
+int kdb_get_kbd_char(void)
+{
+ int scancode, scanstatus;
+ static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
+ static int shift_key; /* Shift next keypress */
+ static int ctrl_key;
+ u_short keychar;
+
+ if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
+ (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
+ kbd_exists = 0;
+ return -1;
+ }
+ kbd_exists = 1;
+
+ if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ return -1;
+
+ /*
+ * Fetch the scancode
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Ignore mouse events.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ return -1;
+
+ /*
+ * Ignore release, trigger on make
+ * (except for shift keys, where we want to
+ * keep the shift state so long as the key is
+ * held down).
+ */
+
+ if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
+ /*
+ * Next key may use shift table
+ */
+ if ((scancode & 0x80) == 0)
+ shift_key = 1;
+ else
+ shift_key = 0;
+ return -1;
+ }
+
+ if ((scancode&0x7f) == 0x1d) {
+ /*
+ * Left ctrl key
+ */
+ if ((scancode & 0x80) == 0)
+ ctrl_key = 1;
+ else
+ ctrl_key = 0;
+ return -1;
+ }
+
+ if ((scancode & 0x80) != 0) {
+ if (scancode == 0x9c)
+ kbd_last_ret = 0;
+ return -1;
+ }
+
+ scancode &= 0x7f;
+
+ /*
+ * Translate scancode
+ */
+
+ if (scancode == 0x3a) {
+ /*
+ * Toggle caps lock
+ */
+ shift_lock ^= 1;
+
+#ifdef KDB_BLINK_LED
+ kdb_toggleled(0x4);
+#endif
+ return -1;
+ }
+
+ if (scancode == 0x0e) {
+ /*
+ * Backspace
+ */
+ return 8;
+ }
+
+ /* Special Key */
+ switch (scancode) {
+ case 0xF: /* Tab */
+ return 9;
+ case 0x53: /* Del */
+ return 4;
+ case 0x47: /* Home */
+ return 1;
+ case 0x4F: /* End */
+ return 5;
+ case 0x4B: /* Left */
+ return 2;
+ case 0x48: /* Up */
+ return 16;
+ case 0x50: /* Down */
+ return 14;
+ case 0x4D: /* Right */
+ return 6;
+ }
+
+ if (scancode == 0xe0)
+ return -1;
+
+ /*
+ * For Japanese 86/106 keyboards
+ * See comment in drivers/char/pc_keyb.c.
+ * - Masahiro Adegawa
+ */
+ if (scancode == 0x73)
+ scancode = 0x59;
+ else if (scancode == 0x7d)
+ scancode = 0x7c;
+
+ if (!shift_lock && !shift_key && !ctrl_key) {
+ keychar = plain_map[scancode];
+ } else if ((shift_lock || shift_key) && key_maps[1]) {
+ keychar = key_maps[1][scancode];
+ } else if (ctrl_key && key_maps[4]) {
+ keychar = key_maps[4][scancode];
+ } else {
+ keychar = 0x0020;
+ kdb_printf("Unknown state/scancode (%d)\n", scancode);
+ }
+ keychar &= 0x0fff;
+ if (keychar == '\t')
+ keychar = ' ';
+ switch (KTYP(keychar)) {
+ case KT_LETTER:
+ case KT_LATIN:
+ if (isprint(keychar))
+ break; /* printable characters */
+ /* drop through */
+ case KT_SPEC:
+ if (keychar == K_ENTER)
+ break;
+ /* drop through */
+ default:
+ return -1; /* ignore unprintables */
+ }
+
+ if (scancode == 0x1c) {
+ kbd_last_ret = 1;
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+ int scancode, scanstatus;
+
+ /*
+ * Nothing to clean up, since either
+ * ENTER was never pressed, or has already
+ * gotten cleaned up.
+ */
+ if (!kbd_last_ret)
+ return;
+
+ kbd_last_ret = 0;
+ /*
+ * Enter key. Need to absorb the break code here, lest it gets
+ * leaked out if we exit KDB as the result of processing 'g'.
+ *
+ * This has several interesting implications:
+ * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+ * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+ * only get a break code at the end of the repeated
+ * sequence. This means we can't propagate the repeated key
+ * press, and must swallow it away.
+ * + Need to handle possible PS/2 mouse input.
+ * + Need to handle mashed keys.
+ */
+
+ while (1) {
+ while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ cpu_relax();
+
+ /*
+ * Fetch the scancode.
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Skip mouse input.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ continue;
+
+ /*
+ * If we see 0xe0, this is either a break code for KP
+ * ENTER, or a repeat make for KP ENTER. Either way,
+ * since the second byte is equivalent to an ENTER,
+ * skip the 0xe0 and try again.
+ *
+ * If we see 0x1c, this must be a repeat ENTER or KP
+ * ENTER (and we swallowed 0xe0 before). Try again.
+ *
+ * We can also see make and break codes for other keys
+ * mashed before or after pressing ENTER. Thus, if we
+ * see anything other than 0x9c, we have to try again.
+ *
+ * Note, if you held some key as ENTER was depressed,
+ * that break code would get leaked out.
+ */
+ if (scancode != 0x9c)
+ continue;
+
+ return;
+ }
+}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 00000000000..2f7c760305c
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2879 @@
+/*
+ * Kernel Debugger Architecture Independent Main Code
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
+ * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/smp.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/nmi.h>
+#include <linux/time.h>
+#include <linux/ptrace.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/kdebug.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+#define GREP_LEN 256
+char kdb_grep_string[GREP_LEN];
+int kdb_grepping_flag;
+EXPORT_SYMBOL(kdb_grepping_flag);
+int kdb_grep_leading;
+int kdb_grep_trailing;
+
+/*
+ * Kernel debugger state flags
+ */
+int kdb_flags;
+atomic_t kdb_event;
+
+/*
+ * kdb_lock protects updates to kdb_initial_cpu. Used to
+ * single thread processors through the kernel debugger.
+ */
+int kdb_initial_cpu = -1; /* cpu number that owns kdb */
+int kdb_nextline = 1;
+int kdb_state; /* General KDB state */
+
+struct task_struct *kdb_current_task;
+EXPORT_SYMBOL(kdb_current_task);
+struct pt_regs *kdb_current_regs;
+
+const char *kdb_diemsg;
+static int kdb_go_count;
+#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
+static unsigned int kdb_continue_catastrophic =
+ CONFIG_KDB_CONTINUE_CATASTROPHIC;
+#else
+static unsigned int kdb_continue_catastrophic;
+#endif
+
+/* kdb_commands describes the available commands. */
+static kdbtab_t *kdb_commands;
+#define KDB_BASE_CMD_MAX 50
+static int kdb_max_commands = KDB_BASE_CMD_MAX;
+static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
+#define for_each_kdbcmd(cmd, num) \
+ for ((cmd) = kdb_base_commands, (num) = 0; \
+ num < kdb_max_commands; \
+ num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+
+typedef struct _kdbmsg {
+ int km_diag; /* kdb diagnostic */
+ char *km_msg; /* Corresponding message text */
+} kdbmsg_t;
+
+#define KDBMSG(msgnum, text) \
+ { KDB_##msgnum, text }
+
+static kdbmsg_t kdbmsgs[] = {
+ KDBMSG(NOTFOUND, "Command Not Found"),
+ KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
+ KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
+ "8 is only allowed on 64 bit systems"),
+ KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
+ KDBMSG(NOTENV, "Cannot find environment variable"),
+ KDBMSG(NOENVVALUE, "Environment variable should have value"),
+ KDBMSG(NOTIMP, "Command not implemented"),
+ KDBMSG(ENVFULL, "Environment full"),
+ KDBMSG(ENVBUFFULL, "Environment buffer full"),
+ KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
+#ifdef CONFIG_CPU_XSCALE
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
+#else
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
+#endif
+ KDBMSG(DUPBPT, "Duplicate breakpoint address"),
+ KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
+ KDBMSG(BADMODE, "Invalid IDMODE"),
+ KDBMSG(BADINT, "Illegal numeric value"),
+ KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
+ KDBMSG(BADREG, "Invalid register name"),
+ KDBMSG(BADCPUNUM, "Invalid cpu number"),
+ KDBMSG(BADLENGTH, "Invalid length field"),
+ KDBMSG(NOBP, "No Breakpoint exists"),
+ KDBMSG(BADADDR, "Invalid address"),
+};
+#undef KDBMSG
+
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
+
+
+/*
+ * Initial environment. This is all kept static and local to
+ * this file. We don't want to rely on the memory allocation
+ * mechanisms in the kernel, so we use a very limited allocate-only
+ * heap for new and altered environment variables. The entire
+ * environment is limited to a fixed number of entries (add more
+ * to __env[] if required) and a fixed amount of heap (add more to
+ * KDB_ENVBUFSIZE if required).
+ */
+
+static char *__env[] = {
+#if defined(CONFIG_SMP)
+ "PROMPT=[%d]kdb> ",
+#else
+ "PROMPT=kdb> ",
+#endif
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+};
+
+static const int __nenv = ARRAY_SIZE(__env);
+
+struct task_struct *kdb_curr_task(int cpu)
+{
+ struct task_struct *p = curr_task(cpu);
+#ifdef _TIF_MCA_INIT
+ if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
+ p = krp->p;
+#endif
+ return p;
+}
+
+/*
+ * kdbgetenv - This function will return the character string value of
+ * an environment variable.
+ * Parameters:
+ * match A character string representing an environment variable.
+ * Returns:
+ * NULL No environment variable matches 'match'
+ * char* Pointer to string value of environment variable.
+ */
+char *kdbgetenv(const char *match)
+{
+ char **ep = __env;
+ int matchlen = strlen(match);
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ char *e = *ep++;
+
+ if (!e)
+ continue;
+
+ if ((strncmp(match, e, matchlen) == 0)
+ && ((e[matchlen] == '\0')
+ || (e[matchlen] == '='))) {
+ char *cp = strchr(e, '=');
+ return cp ? ++cp : "";
+ }
+ }
+ return NULL;
+}
+
+/*
+ * kdballocenv - This function is used to allocate bytes for
+ * environment entries.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long representation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ * Remarks:
+ * We use a static environment buffer (envbuffer) to hold the values
+ * of dynamically generated environment variables (see kdb_set). Buffer
+ * space once allocated is never free'd, so over time, the amount of space
+ * (currently 512 bytes) will be exhausted if env variables are changed
+ * frequently.
+ */
+static char *kdballocenv(size_t bytes)
+{
+#define KDB_ENVBUFSIZE 512
+ static char envbuffer[KDB_ENVBUFSIZE];
+ static int envbufsize;
+ char *ep = NULL;
+
+ if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
+ ep = &envbuffer[envbufsize];
+ envbufsize += bytes;
+ }
+ return ep;
+}
+
+/*
+ * kdbgetulenv - This function will return the value of an unsigned
+ * long-valued environment variable.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+static int kdbgetulenv(const char *match, unsigned long *value)
+{
+ char *ep;
+
+ ep = kdbgetenv(match);
+ if (!ep)
+ return KDB_NOTENV;
+ if (strlen(ep) == 0)
+ return KDB_NOENVVALUE;
+
+ *value = simple_strtoul(ep, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * kdbgetintenv - This function will return the value of an
+ * integer-valued environment variable.
+ * Parameters:
+ * match A character string representing an integer-valued env variable
+ * Outputs:
+ * *value the integer representation of the environment variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetintenv(const char *match, int *value)
+{
+ unsigned long val;
+ int diag;
+
+ diag = kdbgetulenv(match, &val);
+ if (!diag)
+ *value = (int) val;
+ return diag;
+}
+
+/*
+ * kdbgetularg - This function will convert a numeric string into an
+ * unsigned long value.
+ * Parameters:
+ * arg A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of arg.
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetularg(const char *arg, unsigned long *value)
+{
+ char *endp;
+ unsigned long val;
+
+ val = simple_strtoul(arg, &endp, 0);
+
+ if (endp == arg) {
+ /*
+ * Also try base 16, for us folks too lazy to type the
+ * leading 0x...
+ */
+ val = simple_strtoul(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+ char *endp;
+ u64 val;
+
+ val = simple_strtoull(arg, &endp, 0);
+
+ if (endp == arg) {
+
+ val = simple_strtoull(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+/*
+ * kdb_set - This function implements the 'set' command. Alter an
+ * existing environment variable or create a new one.
+ */
+int kdb_set(int argc, const char **argv)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ /*
+ * we can be invoked two ways:
+ * set var=value argv[1]="var", argv[2]="value"
+ * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
+ * - if the latter, shift 'em down.
+ */
+ if (argc == 3) {
+ argv[2] = argv[3];
+ argc--;
+ }
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ /*
+ * Check for internal variables
+ */
+ if (strcmp(argv[1], "KDBDEBUG") == 0) {
+ unsigned int debugflags;
+ char *cp;
+
+ debugflags = simple_strtoul(argv[2], &cp, 0);
+ if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ kdb_printf("kdb: illegal debug flags '%s'\n",
+ argv[2]);
+ return 0;
+ }
+ kdb_flags = (kdb_flags &
+ ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ | (debugflags << KDB_DEBUG_FLAG_SHIFT);
+
+ return 0;
+ }
+
+ /*
+ * Tokenizer squashed the '=' sign. argv[1] is variable
+ * name, argv[2] = value.
+ */
+ varlen = strlen(argv[1]);
+ vallen = strlen(argv[2]);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", argv[1], argv[2]);
+
+ ep[varlen+vallen+1] = '\0';
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], argv[1], varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+static int kdb_check_regs(void)
+{
+ if (!kdb_current_regs) {
+ kdb_printf("No current kdb registers."
+ " You may need to select another task\n");
+ return KDB_BADREG;
+ }
+ return 0;
+}
+
+/*
+ * kdbgetaddrarg - This function is responsible for parsing an
+ * address-expression and returning the value of the expression,
+ * symbol name, and offset to the caller.
+ *
+ * The argument may consist of a numeric value (decimal or
+ * hexidecimal), a symbol name, a register name (preceded by the
+ * percent sign), an environment variable with a numeric value
+ * (preceded by a dollar sign) or a simple arithmetic expression
+ * consisting of a symbol name, +/-, and a numeric constant value
+ * (offset).
+ * Parameters:
+ * argc - count of arguments in argv
+ * argv - argument vector
+ * *nextarg - index to next unparsed argument in argv[]
+ * regs - Register state at time of KDB entry
+ * Outputs:
+ * *value - receives the value of the address-expression
+ * *offset - receives the offset specified, if any
+ * *name - receives the symbol name, if any
+ * *nextarg - index to next unparsed argument in argv[]
+ * Returns:
+ * zero is returned on success, a kdb diagnostic code is
+ * returned on error.
+ */
+int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
+ unsigned long *value, long *offset,
+ char **name)
+{
+ unsigned long addr;
+ unsigned long off = 0;
+ int positive;
+ int diag;
+ int found = 0;
+ char *symname;
+ char symbol = '\0';
+ char *cp;
+ kdb_symtab_t symtab;
+
+ /*
+ * Process arguments which follow the following syntax:
+ *
+ * symbol | numeric-address [+/- numeric-offset]
+ * %register
+ * $environment-variable
+ */
+
+ if (*nextarg > argc)
+ return KDB_ARGCOUNT;
+
+ symname = (char *)argv[*nextarg];
+
+ /*
+ * If there is no whitespace between the symbol
+ * or address and the '+' or '-' symbols, we
+ * remember the character and replace it with a
+ * null so the symbol/value can be properly parsed
+ */
+ cp = strpbrk(symname, "+-");
+ if (cp != NULL) {
+ symbol = *cp;
+ *cp++ = '\0';
+ }
+
+ if (symname[0] == '$') {
+ diag = kdbgetulenv(&symname[1], &addr);
+ if (diag)
+ return diag;
+ } else if (symname[0] == '%') {
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+ /* Implement register values with % at a later time as it is
+ * arch optional.
+ */
+ return KDB_NOTIMP;
+ } else {
+ found = kdbgetsymval(symname, &symtab);
+ if (found) {
+ addr = symtab.sym_start;
+ } else {
+ diag = kdbgetularg(argv[*nextarg], &addr);
+ if (diag)
+ return diag;
+ }
+ }
+
+ if (!found)
+ found = kdbnearsym(addr, &symtab);
+
+ (*nextarg)++;
+
+ if (name)
+ *name = symname;
+ if (value)
+ *value = addr;
+ if (offset && name && *name)
+ *offset = addr - symtab.sym_start;
+
+ if ((*nextarg > argc)
+ && (symbol == '\0'))
+ return 0;
+
+ /*
+ * check for +/- and offset
+ */
+
+ if (symbol == '\0') {
+ if ((argv[*nextarg][0] != '+')
+ && (argv[*nextarg][0] != '-')) {
+ /*
+ * Not our argument. Return.
+ */
+ return 0;
+ } else {
+ positive = (argv[*nextarg][0] == '+');
+ (*nextarg)++;
+ }
+ } else
+ positive = (symbol == '+');
+
+ /*
+ * Now there must be an offset!
+ */
+ if ((*nextarg > argc)
+ && (symbol == '\0')) {
+ return KDB_INVADDRFMT;
+ }
+
+ if (!symbol) {
+ cp = (char *)argv[*nextarg];
+ (*nextarg)++;
+ }
+
+ diag = kdbgetularg(cp, &off);
+ if (diag)
+ return diag;
+
+ if (!positive)
+ off = -off;
+
+ if (offset)
+ *offset += off;
+
+ if (value)
+ *value += off;
+
+ return 0;
+}
+
+static void kdb_cmderror(int diag)
+{
+ int i;
+
+ if (diag >= 0) {
+ kdb_printf("no error detected (diagnostic is %d)\n", diag);
+ return;
+ }
+
+ for (i = 0; i < __nkdb_err; i++) {
+ if (kdbmsgs[i].km_diag == diag) {
+ kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
+ return;
+ }
+ }
+
+ kdb_printf("Unknown diag %d\n", -diag);
+}
+
+/*
+ * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
+ * command which defines one command as a set of other commands,
+ * terminated by endefcmd. kdb_defcmd processes the initial
+ * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
+ * the following commands until 'endefcmd'.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+struct defcmd_set {
+ int count;
+ int usable;
+ char *name;
+ char *usage;
+ char *help;
+ char **command;
+};
+static struct defcmd_set *defcmd_set;
+static int defcmd_set_count;
+static int defcmd_in_progress;
+
+/* Forward references */
+static int kdb_exec_defcmd(int argc, const char **argv);
+
+static int kdb_defcmd2(const char *cmdstr, const char *argv0)
+{
+ struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
+ char **save_command = s->command;
+ if (strcmp(argv0, "endefcmd") == 0) {
+ defcmd_in_progress = 0;
+ if (!s->count)
+ s->usable = 0;
+ if (s->usable)
+ kdb_register(s->name, kdb_exec_defcmd,
+ s->usage, s->help, 0);
+ return 0;
+ }
+ if (!s->usable)
+ return KDB_NOTIMP;
+ s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
+ if (!s->command) {
+ kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+ cmdstr);
+ s->usable = 0;
+ return KDB_NOTIMP;
+ }
+ memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
+ s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
+ kfree(save_command);
+ return 0;
+}
+
+static int kdb_defcmd(int argc, const char **argv)
+{
+ struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ if (defcmd_in_progress) {
+ kdb_printf("kdb: nested defcmd detected, assuming missing "
+ "endefcmd\n");
+ kdb_defcmd2("endefcmd", "endefcmd");
+ }
+ if (argc == 0) {
+ int i;
+ for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
+ s->usage, s->help);
+ for (i = 0; i < s->count; ++i)
+ kdb_printf("%s", s->command[i]);
+ kdb_printf("endefcmd\n");
+ }
+ return 0;
+ }
+ if (argc != 3)
+ return KDB_ARGCOUNT;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
+ return KDB_NOTIMP;
+ }
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
+ memcpy(defcmd_set, save_defcmd_set,
+ defcmd_set_count * sizeof(*defcmd_set));
+ s = defcmd_set + defcmd_set_count;
+ memset(s, 0, sizeof(*s));
+ s->usable = 1;
+ s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
+ s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
+ s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
+ if (s->usage[0] == '"') {
+ strcpy(s->usage, argv[2]+1);
+ s->usage[strlen(s->usage)-1] = '\0';
+ }
+ if (s->help[0] == '"') {
+ strcpy(s->help, argv[3]+1);
+ s->help[strlen(s->help)-1] = '\0';
+ }
+ ++defcmd_set_count;
+ defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
+ return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
+}
+
+/*
+ * kdb_exec_defcmd - Execute the set of commands associated with this
+ * defcmd name.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+static int kdb_exec_defcmd(int argc, const char **argv)
+{
+ int i, ret;
+ struct defcmd_set *s;
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
+ if (strcmp(s->name, argv[0]) == 0)
+ break;
+ }
+ if (i == defcmd_set_count) {
+ kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
+ argv[0]);
+ return KDB_NOTIMP;
+ }
+ for (i = 0; i < s->count; ++i) {
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
+ ret = kdb_parse(s->command[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Command history */
+#define KDB_CMD_HISTORY_COUNT 32
+#define CMD_BUFLEN 200 /* kdb_printf: max printline
+ * size == 256 */
+static unsigned int cmd_head, cmd_tail;
+static unsigned int cmdptr;
+static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
+static char cmd_cur[CMD_BUFLEN];
+
+/*
+ * The "str" argument may point to something like | grep xyz
+ */
+static void parse_grep(const char *str)
+{
+ int len;
+ char *cp = (char *)str, *cp2;
+
+ /* sanity check: we should have been called with the \ first */
+ if (*cp != '|')
+ return;
+ cp++;
+ while (isspace(*cp))
+ cp++;
+ if (strncmp(cp, "grep ", 5)) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ cp += 5;
+ while (isspace(*cp))
+ cp++;
+ cp2 = strchr(cp, '\n');
+ if (cp2)
+ *cp2 = '\0'; /* remove the trailing newline */
+ len = strlen(cp);
+ if (len == 0) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ /* now cp points to a nonzero length search string */
+ if (*cp == '"') {
+ /* allow it be "x y z" by removing the "'s - there must
+ be two of them */
+ cp++;
+ cp2 = strchr(cp, '"');
+ if (!cp2) {
+ kdb_printf("invalid quoted string, see grephelp\n");
+ return;
+ }
+ *cp2 = '\0'; /* end the string where the 2nd " was */
+ }
+ kdb_grep_leading = 0;
+ if (*cp == '^') {
+ kdb_grep_leading = 1;
+ cp++;
+ }
+ len = strlen(cp);
+ kdb_grep_trailing = 0;
+ if (*(cp+len-1) == '$') {
+ kdb_grep_trailing = 1;
+ *(cp+len-1) = '\0';
+ }
+ len = strlen(cp);
+ if (!len)
+ return;
+ if (len >= GREP_LEN) {
+ kdb_printf("search string too long\n");
+ return;
+ }
+ strcpy(kdb_grep_string, cp);
+ kdb_grepping_flag++;
+ return;
+}
+
+/*
+ * kdb_parse - Parse the command line, search the command table for a
+ * matching command and invoke the command function. This
+ * function may be called recursively, if it is, the second call
+ * will overwrite argv and cbuf. It is the caller's
+ * responsibility to save their argv if they recursively call
+ * kdb_parse().
+ * Parameters:
+ * cmdstr The input command line to be parsed.
+ * regs The registers at the time kdb was entered.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Remarks:
+ * Limited to 20 tokens.
+ *
+ * Real rudimentary tokenization. Basically only whitespace
+ * is considered a token delimeter (but special consideration
+ * is taken of the '=' sign as used by the 'set' command).
+ *
+ * The algorithm used to tokenize the input string relies on
+ * there being at least one whitespace (or otherwise useless)
+ * character between tokens as the character immediately following
+ * the token is altered in-place to a null-byte to terminate the
+ * token string.
+ */
+
+#define MAXARGC 20
+
+int kdb_parse(const char *cmdstr)
+{
+ static char *argv[MAXARGC];
+ static int argc;
+ static char cbuf[CMD_BUFLEN+2];
+ char *cp;
+ char *cpp, quoted;
+ kdbtab_t *tp;
+ int i, escaped, ignore_errors = 0, check_grep;
+
+ /*
+ * First tokenize the command string.
+ */
+ cp = (char *)cmdstr;
+ kdb_grepping_flag = check_grep = 0;
+
+ if (KDB_FLAG(CMD_INTERRUPT)) {
+ /* Previous command was interrupted, newline must not
+ * repeat the command */
+ KDB_FLAG_CLEAR(CMD_INTERRUPT);
+ KDB_STATE_SET(PAGER);
+ argc = 0; /* no repeat */
+ }
+
+ if (*cp != '\n' && *cp != '\0') {
+ argc = 0;
+ cpp = cbuf;
+ while (*cp) {
+ /* skip whitespace */
+ while (isspace(*cp))
+ cp++;
+ if ((*cp == '\0') || (*cp == '\n') ||
+ (*cp == '#' && !defcmd_in_progress))
+ break;
+ /* special case: check for | grep pattern */
+ if (*cp == '|') {
+ check_grep++;
+ break;
+ }
+ if (cpp >= cbuf + CMD_BUFLEN) {
+ kdb_printf("kdb_parse: command buffer "
+ "overflow, command ignored\n%s\n",
+ cmdstr);
+ return KDB_NOTFOUND;
+ }
+ if (argc >= MAXARGC - 1) {
+ kdb_printf("kdb_parse: too many arguments, "
+ "command ignored\n%s\n", cmdstr);
+ return KDB_NOTFOUND;
+ }
+ argv[argc++] = cpp;
+ escaped = 0;
+ quoted = '\0';
+ /* Copy to next unquoted and unescaped
+ * whitespace or '=' */
+ while (*cp && *cp != '\n' &&
+ (escaped || quoted || !isspace(*cp))) {
+ if (cpp >= cbuf + CMD_BUFLEN)
+ break;
+ if (escaped) {
+ escaped = 0;
+ *cpp++ = *cp++;
+ continue;
+ }
+ if (*cp == '\\') {
+ escaped = 1;
+ ++cp;
+ continue;
+ }
+ if (*cp == quoted)
+ quoted = '\0';
+ else if (*cp == '\'' || *cp == '"')
+ quoted = *cp;
+ *cpp = *cp++;
+ if (*cpp == '=' && !quoted)
+ break;
+ ++cpp;
+ }
+ *cpp++ = '\0'; /* Squash a ws or '=' character */
+ }
+ }
+ if (!argc)
+ return 0;
+ if (check_grep)
+ parse_grep(cp);
+ if (defcmd_in_progress) {
+ int result = kdb_defcmd2(cmdstr, argv[0]);
+ if (!defcmd_in_progress) {
+ argc = 0; /* avoid repeat on endefcmd */
+ *(argv[0]) = '\0';
+ }
+ return result;
+ }
+ if (argv[0][0] == '-' && argv[0][1] &&
+ (argv[0][1] < '0' || argv[0][1] > '9')) {
+ ignore_errors = 1;
+ ++argv[0];
+ }
+
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+
+ if (tp->cmd_minlen
+ && (strlen(argv[0]) <= tp->cmd_minlen)) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ tp->cmd_minlen) == 0) {
+ break;
+ }
+ }
+
+ if (strcmp(argv[0], tp->cmd_name) == 0)
+ break;
+ }
+ }
+
+ /*
+ * If we don't find a command by this name, see if the first
+ * few characters of this match any of the known commands.
+ * e.g., md1c20 should match md.
+ */
+ if (i == kdb_max_commands) {
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ strlen(tp->cmd_name)) == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if (i < kdb_max_commands) {
+ int result;
+ KDB_STATE_SET(CMD);
+ result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ if (result && ignore_errors && result > KDB_CMD_GO)
+ result = 0;
+ KDB_STATE_CLEAR(CMD);
+ switch (tp->cmd_repeat) {
+ case KDB_REPEAT_NONE:
+ argc = 0;
+ if (argv[0])
+ *(argv[0]) = '\0';
+ break;
+ case KDB_REPEAT_NO_ARGS:
+ argc = 1;
+ if (argv[1])
+ *(argv[1]) = '\0';
+ break;
+ case KDB_REPEAT_WITH_ARGS:
+ break;
+ }
+ return result;
+ }
+
+ /*
+ * If the input with which we were presented does not
+ * map to an existing command, attempt to parse it as an
+ * address argument and display the result. Useful for
+ * obtaining the address of a variable, or the nearest symbol
+ * to an address contained in a register.
+ */
+ {
+ unsigned long value;
+ char *name = NULL;
+ long offset;
+ int nextarg = 0;
+
+ if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
+ &value, &offset, &name)) {
+ return KDB_NOTFOUND;
+ }
+
+ kdb_printf("%s = ", argv[0]);
+ kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
+ kdb_printf("\n");
+ return 0;
+ }
+}
+
+
+static int handle_ctrl_cmd(char *cmd)
+{
+#define CTRL_P 16
+#define CTRL_N 14
+
+ /* initial situation */
+ if (cmd_head == cmd_tail)
+ return 0;
+ switch (*cmd) {
+ case CTRL_P:
+ if (cmdptr != cmd_tail)
+ cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ case CTRL_N:
+ if (cmdptr != cmd_head)
+ cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_reboot - This function implements the 'reboot' command. Reboot
+ * the system immediately, or loop for ever on failure.
+ */
+static int kdb_reboot(int argc, const char **argv)
+{
+ emergency_restart();
+ kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
+ while (1)
+ cpu_relax();
+ /* NOTREACHED */
+ return 0;
+}
+
+static void kdb_dumpregs(struct pt_regs *regs)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_trap_printk++;
+ show_regs(regs);
+ kdb_trap_printk--;
+ kdb_printf("\n");
+ console_loglevel = old_lvl;
+}
+
+void kdb_set_current_task(struct task_struct *p)
+{
+ kdb_current_task = p;
+
+ if (kdb_task_has_cpu(p)) {
+ kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
+ return;
+ }
+ kdb_current_regs = NULL;
+}
+
+/*
+ * kdb_local - The main code for kdb. This routine is invoked on a
+ * specific processor, it is not global. The main kdb() routine
+ * ensures that only one processor at a time is in this routine.
+ * This code is called with the real reason code on the first
+ * entry to a kdb session, thereafter it is called with reason
+ * SWITCH, even if the user goes back to the original cpu.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * regs The exception frame at time of fault/breakpoint.
+ * db_result Result code from the break or debug point.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ * KDB_CMD_GO User typed 'go'.
+ * KDB_CMD_CPU User switched to another cpu.
+ * KDB_CMD_SS Single step.
+ */
+static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
+ kdb_dbtrap_t db_result)
+{
+ char *cmdbuf;
+ int diag;
+ struct task_struct *kdb_current =
+ kdb_curr_task(raw_smp_processor_id());
+
+ KDB_DEBUG_STATE("kdb_local 1", reason);
+ kdb_go_count = 0;
+ if (reason == KDB_REASON_DEBUG) {
+ /* special case below */
+ } else {
+ kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_current, kdb_current ? kdb_current->pid : 0);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ }
+
+ switch (reason) {
+ case KDB_REASON_DEBUG:
+ {
+ /*
+ * If re-entering kdb after a single step
+ * command, don't print the message.
+ */
+ switch (db_result) {
+ case KDB_DB_BPT:
+ kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_current, kdb_current->pid);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ case KDB_DB_SS:
+ break;
+ case KDB_DB_SSBPT:
+ KDB_DEBUG_STATE("kdb_local 4", reason);
+ return 1; /* kdba_db_trap did the work */
+ default:
+ kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
+ db_result);
+ break;
+ }
+
+ }
+ break;
+ case KDB_REASON_ENTER:
+ if (KDB_STATE(KEYBOARD))
+ kdb_printf("due to Keyboard Entry\n");
+ else
+ kdb_printf("due to KDB_ENTER()\n");
+ break;
+ case KDB_REASON_KEYBOARD:
+ KDB_STATE_SET(KEYBOARD);
+ kdb_printf("due to Keyboard Entry\n");
+ break;
+ case KDB_REASON_ENTER_SLAVE:
+ /* drop through, slaves only get released via cpu switch */
+ case KDB_REASON_SWITCH:
+ kdb_printf("due to cpu switch\n");
+ break;
+ case KDB_REASON_OOPS:
+ kdb_printf("Oops: %s\n", kdb_diemsg);
+ kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_SYSTEM_NMI:
+ kdb_printf("due to System NonMaskable Interrupt\n");
+ break;
+ case KDB_REASON_NMI:
+ kdb_printf("due to NonMaskable Interrupt @ "
+ kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_SSTEP:
+ case KDB_REASON_BREAK:
+ kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
+ reason == KDB_REASON_BREAK ?
+ "Breakpoint" : "SS trap", instruction_pointer(regs));
+ /*
+ * Determine if this breakpoint is one that we
+ * are interested in.
+ */
+ if (db_result != KDB_DB_BPT) {
+ kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
+ db_result);
+ KDB_DEBUG_STATE("kdb_local 6", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+ break;
+ case KDB_REASON_RECURSE:
+ kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ default:
+ kdb_printf("kdb: unexpected reason code: %d\n", reason);
+ KDB_DEBUG_STATE("kdb_local 8", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+
+ while (1) {
+ /*
+ * Initialize pager context.
+ */
+ kdb_nextline = 1;
+ KDB_STATE_CLEAR(SUPPRESS);
+
+ cmdbuf = cmd_cur;
+ *cmdbuf = '\0';
+ *(cmd_hist[cmd_head]) = '\0';
+
+do_full_getstr:
+#if defined(CONFIG_SMP)
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
+ raw_smp_processor_id());
+#else
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
+#endif
+ if (defcmd_in_progress)
+ strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
+
+ /*
+ * Fetch command from keyboard
+ */
+ cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
+ if (*cmdbuf != '\n') {
+ if (*cmdbuf < 32) {
+ if (cmdptr == cmd_head) {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ *(cmd_hist[cmd_head] +
+ strlen(cmd_hist[cmd_head])-1) = '\0';
+ }
+ if (!handle_ctrl_cmd(cmdbuf))
+ *(cmd_cur+strlen(cmd_cur)-1) = '\0';
+ cmdbuf = cmd_cur;
+ goto do_full_getstr;
+ } else {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ }
+
+ cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
+ if (cmd_head == cmd_tail)
+ cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
+ }
+
+ cmdptr = cmd_head;
+ diag = kdb_parse(cmdbuf);
+ if (diag == KDB_NOTFOUND) {
+ kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
+ diag = 0;
+ }
+ if (diag == KDB_CMD_GO
+ || diag == KDB_CMD_CPU
+ || diag == KDB_CMD_SS
+ || diag == KDB_CMD_KGDB)
+ break;
+
+ if (diag)
+ kdb_cmderror(diag);
+ }
+ KDB_DEBUG_STATE("kdb_local 9", diag);
+ return diag;
+}
+
+
+/*
+ * kdb_print_state - Print the state data for the current processor
+ * for debugging.
+ * Inputs:
+ * text Identifies the debug point
+ * value Any integer value to be printed, e.g. reason code.
+ */
+void kdb_print_state(const char *text, int value)
+{
+ kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
+ text, raw_smp_processor_id(), value, kdb_initial_cpu,
+ kdb_state);
+}
+
+/*
+ * kdb_main_loop - After initial setup and assignment of the
+ * controlling cpu, all cpus are in this loop. One cpu is in
+ * control and will issue the kdb prompt, the others will spin
+ * until 'go' or cpu switch.
+ *
+ * To get a consistent view of the kernel stacks for all
+ * processes, this routine is invoked from the main kdb code via
+ * an architecture specific routine. kdba_main_loop is
+ * responsible for making the kernel stacks consistent for all
+ * processes, there should be no difference between a blocked
+ * process and a running process as far as kdb is concerned.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * reason2 kdb's current reason code.
+ * Initially error but can change
+ * according to kdb state.
+ * db_result Result code from break or debug point.
+ * regs The exception frame at time of fault/breakpoint.
+ * should always be valid.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ */
+int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
+ kdb_dbtrap_t db_result, struct pt_regs *regs)
+{
+ int result = 1;
+ /* Stay in kdb() until 'go', 'ss[b]' or an error */
+ while (1) {
+ /*
+ * All processors except the one that is in control
+ * will spin here.
+ */
+ KDB_DEBUG_STATE("kdb_main_loop 1", reason);
+ while (KDB_STATE(HOLD_CPU)) {
+ /* state KDB is turned off by kdb_cpu to see if the
+ * other cpus are still live, each cpu in this loop
+ * turns it back on.
+ */
+ if (!KDB_STATE(KDB))
+ KDB_STATE_SET(KDB);
+ }
+
+ KDB_STATE_CLEAR(SUPPRESS);
+ KDB_DEBUG_STATE("kdb_main_loop 2", reason);
+ if (KDB_STATE(LEAVING))
+ break; /* Another cpu said 'go' */
+ /* Still using kdb, this processor is in control */
+ result = kdb_local(reason2, error, regs, db_result);
+ KDB_DEBUG_STATE("kdb_main_loop 3", result);
+
+ if (result == KDB_CMD_CPU)
+ break;
+
+ if (result == KDB_CMD_SS) {
+ KDB_STATE_SET(DOING_SS);
+ break;
+ }
+
+ if (result == KDB_CMD_KGDB) {
+ if (!KDB_STATE(DOING_KGDB))
+ kdb_printf("Entering please attach debugger "
+ "or use $D#44+ or $3#33\n");
+ break;
+ }
+ if (result && result != 1 && result != KDB_CMD_GO)
+ kdb_printf("\nUnexpected kdb_local return code %d\n",
+ result);
+ KDB_DEBUG_STATE("kdb_main_loop 4", reason);
+ break;
+ }
+ if (KDB_STATE(DOING_SS))
+ KDB_STATE_CLEAR(SSBPT);
+
+ /* Clean up any keyboard devices before leaving */
+ kdb_kbd_cleanup_state();
+
+ return result;
+}
+
+/*
+ * kdb_mdr - This function implements the guts of the 'mdr', memory
+ * read command.
+ * mdr <addr arg>,<byte count>
+ * Inputs:
+ * addr Start address
+ * count Number of bytes
+ * Returns:
+ * Always 0. Any errors are detected and printed by kdb_getarea.
+ */
+static int kdb_mdr(unsigned long addr, unsigned int count)
+{
+ unsigned char c;
+ while (count--) {
+ if (kdb_getarea(c, addr))
+ return 0;
+ kdb_printf("%02x", c);
+ addr++;
+ }
+ kdb_printf("\n");
+ return 0;
+}
+
+/*
+ * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
+ * 'md8' 'mdr' and 'mds' commands.
+ *
+ * md|mds [<addr arg> [<line count> [<radix>]]]
+ * mdWcN [<addr arg> [<line count> [<radix>]]]
+ * where W = is the width (1, 2, 4 or 8) and N is the count.
+ * for eg., md1c20 reads 20 bytes, 1 at a time.
+ * mdr <addr arg>,<byte count>
+ */
+static void kdb_md_line(const char *fmtstr, unsigned long addr,
+ int symbolic, int nosect, int bytesperword,
+ int num, int repeat, int phys)
+{
+ /* print just one line of data */
+ kdb_symtab_t symtab;
+ char cbuf[32];
+ char *c = cbuf;
+ int i;
+ unsigned long word;
+
+ memset(cbuf, '\0', sizeof(cbuf));
+ if (phys)
+ kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
+ else
+ kdb_printf(kdb_machreg_fmt0 " ", addr);
+
+ for (i = 0; i < num && repeat--; i++) {
+ if (phys) {
+ if (kdb_getphysword(&word, addr, bytesperword))
+ break;
+ } else if (kdb_getword(&word, addr, bytesperword))
+ break;
+ kdb_printf(fmtstr, word);
+ if (symbolic)
+ kdbnearsym(word, &symtab);
+ else
+ memset(&symtab, 0, sizeof(symtab));
+ if (symtab.sym_name) {
+ kdb_symbol_print(word, &symtab, 0);
+ if (!nosect) {
+ kdb_printf("\n");
+ kdb_printf(" %s %s "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt, symtab.mod_name,
+ symtab.sec_name, symtab.sec_start,
+ symtab.sym_start, symtab.sym_end);
+ }
+ addr += bytesperword;
+ } else {
+ union {
+ u64 word;
+ unsigned char c[8];
+ } wc;
+ unsigned char *cp;
+#ifdef __BIG_ENDIAN
+ cp = wc.c + 8 - bytesperword;
+#else
+ cp = wc.c;
+#endif
+ wc.word = word;
+#define printable_char(c) \
+ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
+ switch (bytesperword) {
+ case 8:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 4;
+ case 4:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 2;
+ case 2:
+ *c++ = printable_char(*cp++);
+ addr++;
+ case 1:
+ *c++ = printable_char(*cp++);
+ addr++;
+ break;
+ }
+#undef printable_char
+ }
+ }
+ kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
+ " ", cbuf);
+}
+
+static int kdb_md(int argc, const char **argv)
+{
+ static unsigned long last_addr;
+ static int last_radix, last_bytesperword, last_repeat;
+ int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
+ int nosect = 0;
+ char fmtchar, fmtstr[64];
+ unsigned long addr;
+ unsigned long word;
+ long offset = 0;
+ int symbolic = 0;
+ int valid = 0;
+ int phys = 0;
+
+ kdbgetintenv("MDCOUNT", &mdcount);
+ kdbgetintenv("RADIX", &radix);
+ kdbgetintenv("BYTESPERWORD", &bytesperword);
+
+ /* Assume 'md <addr>' and start with environment values */
+ repeat = mdcount * 16 / bytesperword;
+
+ if (strcmp(argv[0], "mdr") == 0) {
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ valid = 1;
+ } else if (isdigit(argv[0][2])) {
+ bytesperword = (int)(argv[0][2] - '0');
+ if (bytesperword == 0) {
+ bytesperword = last_bytesperword;
+ if (bytesperword == 0)
+ bytesperword = 4;
+ }
+ last_bytesperword = bytesperword;
+ repeat = mdcount * 16 / bytesperword;
+ if (!argv[0][3])
+ valid = 1;
+ else if (argv[0][3] == 'c' && argv[0][4]) {
+ char *p;
+ repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ valid = !*p;
+ }
+ last_repeat = repeat;
+ } else if (strcmp(argv[0], "md") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mds") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mdp") == 0) {
+ phys = valid = 1;
+ }
+ if (!valid)
+ return KDB_NOTFOUND;
+
+ if (argc == 0) {
+ if (last_addr == 0)
+ return KDB_ARGCOUNT;
+ addr = last_addr;
+ radix = last_radix;
+ bytesperword = last_bytesperword;
+ repeat = last_repeat;
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ }
+
+ if (argc) {
+ unsigned long val;
+ int diag, nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ if (argc > nextarg+2)
+ return KDB_ARGCOUNT;
+
+ if (argc >= nextarg) {
+ diag = kdbgetularg(argv[nextarg], &val);
+ if (!diag) {
+ mdcount = (int) val;
+ repeat = mdcount * 16 / bytesperword;
+ }
+ }
+ if (argc >= nextarg+1) {
+ diag = kdbgetularg(argv[nextarg+1], &val);
+ if (!diag)
+ radix = (int) val;
+ }
+ }
+
+ if (strcmp(argv[0], "mdr") == 0)
+ return kdb_mdr(addr, mdcount);
+
+ switch (radix) {
+ case 10:
+ fmtchar = 'd';
+ break;
+ case 16:
+ fmtchar = 'x';
+ break;
+ case 8:
+ fmtchar = 'o';
+ break;
+ default:
+ return KDB_BADRADIX;
+ }
+
+ last_radix = radix;
+
+ if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+
+ switch (bytesperword) {
+ case 8:
+ sprintf(fmtstr, "%%16.16l%c ", fmtchar);
+ break;
+ case 4:
+ sprintf(fmtstr, "%%8.8l%c ", fmtchar);
+ break;
+ case 2:
+ sprintf(fmtstr, "%%4.4l%c ", fmtchar);
+ break;
+ case 1:
+ sprintf(fmtstr, "%%2.2l%c ", fmtchar);
+ break;
+ default:
+ return KDB_BADWIDTH;
+ }
+
+ last_repeat = repeat;
+ last_bytesperword = bytesperword;
+
+ if (strcmp(argv[0], "mds") == 0) {
+ symbolic = 1;
+ /* Do not save these changes as last_*, they are temporary mds
+ * overrides.
+ */
+ bytesperword = KDB_WORD_SIZE;
+ repeat = mdcount;
+ kdbgetintenv("NOSECT", &nosect);
+ }
+
+ /* Round address down modulo BYTESPERWORD */
+
+ addr &= ~(bytesperword-1);
+
+ while (repeat > 0) {
+ unsigned long a;
+ int n, z, num = (symbolic ? 1 : (16 / bytesperword));
+
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
+ if (phys) {
+ if (kdb_getphysword(&word, a, bytesperword)
+ || word)
+ break;
+ } else if (kdb_getword(&word, a, bytesperword) || word)
+ break;
+ }
+ n = min(num, repeat);
+ kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
+ num, repeat, phys);
+ addr += bytesperword * n;
+ repeat -= n;
+ z = (z + num - 1) / num;
+ if (z > 2) {
+ int s = num * (z-2);
+ kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
+ " zero suppressed\n",
+ addr, addr + bytesperword * s - 1);
+ addr += bytesperword * s;
+ repeat -= s;
+ }
+ }
+ last_addr = addr;
+
+ return 0;
+}
+
+/*
+ * kdb_mm - This function implements the 'mm' command.
+ * mm address-expression new-value
+ * Remarks:
+ * mm works on machine words, mmW works on bytes.
+ */
+static int kdb_mm(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset = 0;
+ unsigned long contents;
+ int nextarg;
+ int width;
+
+ if (argv[0][2] && !isdigit(argv[0][2]))
+ return KDB_NOTFOUND;
+
+ if (argc < 2)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg > argc)
+ return KDB_ARGCOUNT;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg != argc + 1)
+ return KDB_ARGCOUNT;
+
+ width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
+ diag = kdb_putword(addr, contents, width);
+ if (diag)
+ return diag;
+
+ kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
+
+ return 0;
+}
+
+/*
+ * kdb_go - This function implements the 'go' command.
+ * go [address-expression]
+ */
+static int kdb_go(int argc, const char **argv)
+{
+ unsigned long addr;
+ int diag;
+ int nextarg;
+ long offset;
+
+ if (raw_smp_processor_id() != kdb_initial_cpu) {
+ kdb_printf("go must execute on the entry cpu, "
+ "please use \"cpu %d\" and then execute go\n",
+ kdb_initial_cpu);
+ return KDB_BADCPUNUM;
+ }
+ if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg,
+ &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else if (argc) {
+ return KDB_ARGCOUNT;
+ }
+
+ diag = KDB_CMD_GO;
+ if (KDB_FLAG(CATASTROPHIC)) {
+ kdb_printf("Catastrophic error detected\n");
+ kdb_printf("kdb_continue_catastrophic=%d, ",
+ kdb_continue_catastrophic);
+ if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
+ kdb_printf("type go a second time if you really want "
+ "to continue\n");
+ return 0;
+ }
+ if (kdb_continue_catastrophic == 2) {
+ kdb_printf("forcing reboot\n");
+ kdb_reboot(0, NULL);
+ }
+ kdb_printf("attempting to continue\n");
+ }
+ return diag;
+}
+
+/*
+ * kdb_rd - This function implements the 'rd' command.
+ */
+static int kdb_rd(int argc, const char **argv)
+{
+ int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+ int i;
+ char *rname;
+ int rsize;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (len)
+ return len;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ rsize = dbg_reg_def[i].size * 2;
+ if (rsize > 16)
+ rsize = 2;
+ if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+ len = 0;
+ kdb_printf("\n");
+ }
+ if (len)
+ len += kdb_printf(" ");
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %02x", rname, reg8);
+ break;
+ case 16:
+ rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %04x", rname, reg16);
+ break;
+ case 32:
+ rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %08x", rname, reg32);
+ break;
+ case 64:
+ rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %016llx", rname, reg64);
+ break;
+ default:
+ len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+ }
+ }
+ kdb_printf("\n");
+#else
+ if (len)
+ return len;
+
+ kdb_dumpregs(kdb_current_regs);
+#endif
+ return 0;
+}
+
+/*
+ * kdb_rm - This function implements the 'rm' (register modify) command.
+ * rm register-name new-contents
+ * Remarks:
+ * Allows register modification with the same restrictions as gdb
+ */
+static int kdb_rm(int argc, const char **argv)
+{
+#if DBG_MAX_REG_NUM > 0
+ int diag;
+ const char *rname;
+ int i;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ /*
+ * Allow presence or absence of leading '%' symbol.
+ */
+ rname = argv[1];
+ if (*rname == '%')
+ rname++;
+
+ diag = kdbgetu64arg(argv[2], &reg64);
+ if (diag)
+ return diag;
+
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+
+ diag = KDB_BADREG;
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+ diag = 0;
+ break;
+ }
+ }
+ if (!diag) {
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ reg8 = reg64;
+ dbg_set_reg(i, &reg8, kdb_current_regs);
+ break;
+ case 16:
+ reg16 = reg64;
+ dbg_set_reg(i, &reg16, kdb_current_regs);
+ break;
+ case 32:
+ reg32 = reg64;
+ dbg_set_reg(i, &reg32, kdb_current_regs);
+ break;
+ case 64:
+ dbg_set_reg(i, &reg64, kdb_current_regs);
+ break;
+ }
+ }
+ return diag;
+#else
+ kdb_printf("ERROR: Register set currently not implemented\n");
+ return 0;
+#endif
+}
+
+#if defined(CONFIG_MAGIC_SYSRQ)
+/*
+ * kdb_sr - This function implements the 'sr' (SYSRQ key) command
+ * which interfaces to the soi-disant MAGIC SYSRQ functionality.
+ * sr <magic-sysrq-code>
+ */
+static int kdb_sr(int argc, const char **argv)
+{
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ kdb_trap_printk++;
+ __handle_sysrq(*argv[1], false);
+ kdb_trap_printk--;
+
+ return 0;
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+/*
+ * kdb_ef - This function implements the 'regs' (display exception
+ * frame) command. This command takes an address and expects to
+ * find an exception frame at that address, formats and prints
+ * it.
+ * regs address-expression
+ * Remarks:
+ * Not done yet.
+ */
+static int kdb_ef(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset;
+ int nextarg;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ show_regs((struct pt_regs *)addr);
+ return 0;
+}
+
+#if defined(CONFIG_MODULES)
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+static int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u 0x%p ", mod->name,
+ mod->core_size, (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4ld ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%p", mod->module_core);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+/*
+ * kdb_env - This function implements the 'env' command. Display the
+ * current environment variables.
+ */
+
+static int kdb_env(int argc, const char **argv)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+
+ if (KDB_DEBUG(MASK))
+ kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+
+ return 0;
+}
+
+#ifdef CONFIG_PRINTK
+/*
+ * kdb_dmesg - This function implements the 'dmesg' command to display
+ * the contents of the syslog buffer.
+ * dmesg [lines] [adjust]
+ */
+static int kdb_dmesg(int argc, const char **argv)
+{
+ int diag;
+ int logging;
+ int lines = 0;
+ int adjust = 0;
+ int n = 0;
+ int skip = 0;
+ struct kmsg_dumper dumper = { .active = 1 };
+ size_t len;
+ char buf[201];
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+ if (argc) {
+ char *cp;
+ lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ lines = 0;
+ if (argc > 1) {
+ adjust = simple_strtoul(argv[2], &cp, 0);
+ if (*cp || adjust < 0)
+ adjust = 0;
+ }
+ }
+
+ /* disable LOGGING if set */
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (!diag && logging) {
+ const char *setargs[] = { "set", "LOGGING", "0" };
+ kdb_set(2, setargs);
+ }
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ n++;
+
+ if (lines < 0) {
+ if (adjust >= n)
+ kdb_printf("buffer only contains %d lines, nothing "
+ "printed\n", n);
+ else if (adjust - lines >= n)
+ kdb_printf("buffer only contains %d lines, last %d "
+ "lines printed\n", n, n - adjust);
+ skip = adjust;
+ lines = abs(lines);
+ } else if (lines > 0) {
+ skip = n - lines - adjust;
+ lines = abs(lines);
+ if (adjust >= n) {
+ kdb_printf("buffer only contains %d lines, "
+ "nothing printed\n", n);
+ skip = n;
+ } else if (skip < 0) {
+ lines += skip;
+ skip = 0;
+ kdb_printf("buffer only contains %d lines, first "
+ "%d lines printed\n", n, lines);
+ }
+ } else {
+ lines = n;
+ }
+
+ if (skip >= n || skip < 0)
+ return 0;
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ if (skip) {
+ skip--;
+ continue;
+ }
+ if (!lines--)
+ break;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ kdb_printf("%.*s\n", (int)len - 1, buf);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
+/*
+ * kdb_cpu - This function implements the 'cpu' command.
+ * cpu [<cpunum>]
+ * Returns:
+ * KDB_CMD_CPU for success, a kdb diagnostic if error
+ */
+static void kdb_cpu_status(void)
+{
+ int i, start_cpu, first_print = 1;
+ char state, prev_state = '?';
+
+ kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
+ kdb_printf("Available cpus: ");
+ for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i)) {
+ state = 'F'; /* cpu is offline */
+ } else {
+ state = ' '; /* cpu is responding to kdb */
+ if (kdb_task_state_char(KDB_TSK(i)) == 'I')
+ state = 'I'; /* idle task */
+ }
+ if (state != prev_state) {
+ if (prev_state != '?') {
+ if (!first_print)
+ kdb_printf(", ");
+ first_print = 0;
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ prev_state = state;
+ start_cpu = i;
+ }
+ }
+ /* print the trailing cpus, ignoring them if they are all offline */
+ if (prev_state != 'F') {
+ if (!first_print)
+ kdb_printf(", ");
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ kdb_printf("\n");
+}
+
+static int kdb_cpu(int argc, const char **argv)
+{
+ unsigned long cpunum;
+ int diag;
+
+ if (argc == 0) {
+ kdb_cpu_status();
+ return 0;
+ }
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetularg(argv[1], &cpunum);
+ if (diag)
+ return diag;
+
+ /*
+ * Validate cpunum
+ */
+ if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ return KDB_BADCPUNUM;
+
+ dbg_switch_cpu = cpunum;
+
+ /*
+ * Switch to other cpu
+ */
+ return KDB_CMD_CPU;
+}
+
+/* The user may not realize that ps/bta with no parameters does not print idle
+ * or sleeping system daemon processes, so tell them how many were suppressed.
+ */
+void kdb_ps_suppressed(void)
+{
+ int idle = 0, daemon = 0;
+ unsigned long mask_I = kdb_task_state_string("I"),
+ mask_M = kdb_task_state_string("M");
+ unsigned long cpu;
+ const struct task_struct *p, *g;
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask_I))
+ ++idle;
+ }
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_state(p, mask_M))
+ ++daemon;
+ } kdb_while_each_thread(g, p);
+ if (idle || daemon) {
+ if (idle)
+ kdb_printf("%d idle process%s (state I)%s\n",
+ idle, idle == 1 ? "" : "es",
+ daemon ? " and " : "");
+ if (daemon)
+ kdb_printf("%d sleeping system daemon (state M) "
+ "process%s", daemon,
+ daemon == 1 ? "" : "es");
+ kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
+ }
+}
+
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ * ps [DRSTCZEUIMA] All processes, optionally filtered by state
+ */
+void kdb_ps1(const struct task_struct *p)
+{
+ int cpu;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return;
+
+ cpu = kdb_process_cpu(p);
+ kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ (void *)p, p->pid, p->parent->pid,
+ kdb_task_has_cpu(p), kdb_process_cpu(p),
+ kdb_task_state_char(p),
+ (void *)(&p->thread),
+ p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p->comm);
+ if (kdb_task_has_cpu(p)) {
+ if (!KDB_TSK(cpu)) {
+ kdb_printf(" Error: no saved data for this cpu\n");
+ } else {
+ if (KDB_TSK(cpu) != p)
+ kdb_printf(" Error: does not match running "
+ "process table (0x%p)\n", KDB_TSK(cpu));
+ }
+ }
+}
+
+static int kdb_ps(int argc, const char **argv)
+{
+ struct task_struct *g, *p;
+ unsigned long mask, cpu;
+
+ if (argc == 0)
+ kdb_ps_suppressed();
+ kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
+ (int)(2*sizeof(void *))+2, "Task Addr",
+ (int)(2*sizeof(void *))+2, "Thread");
+ mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ }
+ kdb_printf("\n");
+ /* Now the real tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
+
+/*
+ * kdb_pid - This function implements the 'pid' command which switches
+ * the currently active process.
+ * pid [<pid> | R]
+ */
+static int kdb_pid(int argc, const char **argv)
+{
+ struct task_struct *p;
+ unsigned long val;
+ int diag;
+
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ if (strcmp(argv[1], "R") == 0) {
+ p = KDB_TSK(kdb_initial_cpu);
+ } else {
+ diag = kdbgetularg(argv[1], &val);
+ if (diag)
+ return KDB_BADINT;
+
+ p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
+ if (!p) {
+ kdb_printf("No task with pid=%d\n", (pid_t)val);
+ return 0;
+ }
+ }
+ kdb_set_current_task(p);
+ }
+ kdb_printf("KDB current process is %s(pid=%d)\n",
+ kdb_current_task->comm,
+ kdb_current_task->pid);
+
+ return 0;
+}
+
+static int kdb_kgdb(int argc, const char **argv)
+{
+ return KDB_CMD_KGDB;
+}
+
+/*
+ * kdb_help - This function implements the 'help' and '?' commands.
+ */
+static int kdb_help(int argc, const char **argv)
+{
+ kdbtab_t *kt;
+ int i;
+
+ kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
+ kdb_printf("-----------------------------"
+ "-----------------------------\n");
+ for_each_kdbcmd(kt, i) {
+ char *space = "";
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
+ }
+ return 0;
+}
+
+/*
+ * kdb_kill - This function implements the 'kill' commands.
+ */
+static int kdb_kill(int argc, const char **argv)
+{
+ long sig, pid;
+ char *endp;
+ struct task_struct *p;
+ struct siginfo info;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ sig = simple_strtol(argv[1], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (sig >= 0) {
+ kdb_printf("Invalid signal parameter.<-signal>\n");
+ return 0;
+ }
+ sig = -sig;
+
+ pid = simple_strtol(argv[2], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (pid <= 0) {
+ kdb_printf("Process ID must be large than 0.\n");
+ return 0;
+ }
+
+ /* Find the process. */
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (!p) {
+ kdb_printf("The specified process isn't found.\n");
+ return 0;
+ }
+ p = p->group_leader;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = pid; /* same capabilities as process being signalled */
+ info.si_uid = 0; /* kdb has root authority */
+ kdb_send_sig_info(p, &info);
+ return 0;
+}
+
+struct kdb_tm {
+ int tm_sec; /* seconds */
+ int tm_min; /* minutes */
+ int tm_hour; /* hours */
+ int tm_mday; /* day of the month */
+ int tm_mon; /* month */
+ int tm_year; /* year */
+};
+
+static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
+{
+ /* This will work from 1970-2099, 2100 is not a leap year */
+ static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
+ 31, 30, 31, 30, 31 };
+ memset(tm, 0, sizeof(*tm));
+ tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
+ tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
+ (2 * 365 + 1); /* shift base from 1970 to 1968 */
+ tm->tm_min = tm->tm_sec / 60 % 60;
+ tm->tm_hour = tm->tm_sec / 60 / 60;
+ tm->tm_sec = tm->tm_sec % 60;
+ tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
+ tm->tm_mday %= (4*365+1);
+ mon_day[1] = 29;
+ while (tm->tm_mday >= mon_day[tm->tm_mon]) {
+ tm->tm_mday -= mon_day[tm->tm_mon];
+ if (++tm->tm_mon == 12) {
+ tm->tm_mon = 0;
+ ++tm->tm_year;
+ mon_day[1] = 28;
+ }
+ }
+ ++tm->tm_mday;
+}
+
+/*
+ * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
+ * I cannot call that code directly from kdb, it has an unconditional
+ * cli()/sti() and calls routines that take locks which can stop the debugger.
+ */
+static void kdb_sysinfo(struct sysinfo *val)
+{
+ struct timespec uptime;
+ do_posix_clock_monotonic_gettime(&uptime);
+ memset(val, 0, sizeof(*val));
+ val->uptime = uptime.tv_sec;
+ val->loads[0] = avenrun[0];
+ val->loads[1] = avenrun[1];
+ val->loads[2] = avenrun[2];
+ val->procs = nr_threads-1;
+ si_meminfo(val);
+
+ return;
+}
+
+/*
+ * kdb_summary - This function implements the 'summary' command.
+ */
+static int kdb_summary(int argc, const char **argv)
+{
+ struct timespec now;
+ struct kdb_tm tm;
+ struct sysinfo val;
+
+ if (argc)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
+ kdb_printf("release %s\n", init_uts_ns.name.release);
+ kdb_printf("version %s\n", init_uts_ns.name.version);
+ kdb_printf("machine %s\n", init_uts_ns.name.machine);
+ kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
+ kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
+ kdb_printf("ccversion %s\n", __stringify(CCVERSION));
+
+ now = __current_kernel_time();
+ kdb_gmtime(&now, &tm);
+ kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
+ "tz_minuteswest %d\n",
+ 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec,
+ sys_tz.tz_minuteswest);
+
+ kdb_sysinfo(&val);
+ kdb_printf("uptime ");
+ if (val.uptime > (24*60*60)) {
+ int days = val.uptime / (24*60*60);
+ val.uptime %= (24*60*60);
+ kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
+ }
+ kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
+
+ /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+ kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
+ LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
+ LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
+ LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
+#undef LOAD_INT
+#undef LOAD_FRAC
+ /* Display in kilobytes */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
+ "Buffers: %8lu kB\n",
+ val.totalram, val.freeram, val.bufferram);
+ return 0;
+}
+
+/*
+ * kdb_per_cpu - This function implements the 'per_cpu' command.
+ */
+static int kdb_per_cpu(int argc, const char **argv)
+{
+ char fmtstr[64];
+ int cpu, diag, nextarg = 1;
+ unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
+
+ if (argc < 1 || argc > 3)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (argc >= 2) {
+ diag = kdbgetularg(argv[2], &bytesperword);
+ if (diag)
+ return diag;
+ }
+ if (!bytesperword)
+ bytesperword = KDB_WORD_SIZE;
+ else if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+ sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
+ if (argc >= 3) {
+ diag = kdbgetularg(argv[3], &whichcpu);
+ if (diag)
+ return diag;
+ if (!cpu_online(whichcpu)) {
+ kdb_printf("cpu %ld is not online\n", whichcpu);
+ return KDB_BADCPUNUM;
+ }
+ }
+
+ /* Most architectures use __per_cpu_offset[cpu], some use
+ * __per_cpu_offset(cpu), smp has no __per_cpu_offset.
+ */
+#ifdef __per_cpu_offset
+#define KDB_PCU(cpu) __per_cpu_offset(cpu)
+#else
+#ifdef CONFIG_SMP
+#define KDB_PCU(cpu) __per_cpu_offset[cpu]
+#else
+#define KDB_PCU(cpu) 0
+#endif
+#endif
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ if (whichcpu != ~0UL && whichcpu != cpu)
+ continue;
+ addr = symaddr + KDB_PCU(cpu);
+ diag = kdb_getword(&val, addr, bytesperword);
+ if (diag) {
+ kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
+ "read, diag=%d\n", cpu, addr, diag);
+ continue;
+ }
+ kdb_printf("%5d ", cpu);
+ kdb_md_line(fmtstr, addr,
+ bytesperword == KDB_WORD_SIZE,
+ 1, bytesperword, 1, 1, 0);
+ }
+#undef KDB_PCU
+ return 0;
+}
+
+/*
+ * display help for the use of cmd | grep pattern
+ */
+static int kdb_grep_help(int argc, const char **argv)
+{
+ kdb_printf("Usage of cmd args | grep pattern:\n");
+ kdb_printf(" Any command's output may be filtered through an ");
+ kdb_printf("emulated 'pipe'.\n");
+ kdb_printf(" 'grep' is just a key word.\n");
+ kdb_printf(" The pattern may include a very limited set of "
+ "metacharacters:\n");
+ kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
+ kdb_printf(" And if there are spaces in the pattern, you may "
+ "quote it:\n");
+ kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
+ " or \"^pat tern$\"\n");
+ return 0;
+}
+
+/*
+ * kdb_register_repeat - This function is used to register a kernel
+ * debugger command.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * repeat Does the command auto repeat on enter?
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+#define kdb_command_extend 50 /* arbitrary */
+int kdb_register_repeat(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_repeat_t repeat)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * Brute force method to determine duplicates
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kdb_printf("Duplicate kdb command registered: "
+ "%s, func %p help %s\n", cmd, func, help);
+ return 1;
+ }
+ }
+
+ /*
+ * Insert command into first available location in table
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name == NULL)
+ break;
+ }
+
+ if (i >= kdb_max_commands) {
+ kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
+ kdb_command_extend) * sizeof(*new), GFP_KDB);
+ if (!new) {
+ kdb_printf("Could not allocate new kdb_command "
+ "table\n");
+ return 1;
+ }
+ if (kdb_commands) {
+ memcpy(new, kdb_commands,
+ (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
+ kfree(kdb_commands);
+ }
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
+ kdb_command_extend * sizeof(*new));
+ kdb_commands = new;
+ kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
+ kdb_max_commands += kdb_command_extend;
+ }
+
+ kp->cmd_name = cmd;
+ kp->cmd_func = func;
+ kp->cmd_usage = usage;
+ kp->cmd_help = help;
+ kp->cmd_flags = 0;
+ kp->cmd_minlen = minlen;
+ kp->cmd_repeat = repeat;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kdb_register_repeat);
+
+
+/*
+ * kdb_register - Compatibility register function for commands that do
+ * not need to specify a repeat state. Equivalent to
+ * kdb_register_repeat with KDB_REPEAT_NONE.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+int kdb_register(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen)
+{
+ return kdb_register_repeat(cmd, func, usage, help, minlen,
+ KDB_REPEAT_NONE);
+}
+EXPORT_SYMBOL_GPL(kdb_register);
+
+/*
+ * kdb_unregister - This function is used to unregister a kernel
+ * debugger command. It is generally called when a module which
+ * implements kdb commands is unloaded.
+ * Inputs:
+ * cmd Command name
+ * Returns:
+ * zero for success, one command not registered.
+ */
+int kdb_unregister(char *cmd)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * find the command.
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kp->cmd_name = NULL;
+ return 0;
+ }
+ }
+
+ /* Couldn't find it. */
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kdb_unregister);
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ int i;
+ kdbtab_t *kp;
+
+ for_each_kdbcmd(kp, i)
+ kp->cmd_name = NULL;
+
+ kdb_register_repeat("md", kdb_md, "<vaddr>",
+ "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
+ KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("rd", kdb_rd, "",
+ "Display Registers", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("btt", kdb_bt, "<vaddr>",
+ "Backtrace process given its struct task address", 0,
+ KDB_REPEAT_NONE);
+ kdb_register_repeat("env", kdb_env, "",
+ "Show environment variables", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("set", kdb_set, "",
+ "Set environment variables", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("help", kdb_help, "",
+ "Display Help Message", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("?", kdb_help, "",
+ "Display Help Message", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
+#if defined(CONFIG_MODULES)
+ kdb_register_repeat("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0, KDB_REPEAT_NONE);
+#endif
+#if defined(CONFIG_MAGIC_SYSRQ)
+ kdb_register_repeat("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0, KDB_REPEAT_NONE);
+#endif
+#if defined(CONFIG_PRINTK)
+ kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0, KDB_REPEAT_NONE);
+#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
+ }
+ kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("summary", kdb_summary, "",
+ "Summarize the system", 4, KDB_REPEAT_NONE);
+ kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
+ "Display per_cpu variables", 3, KDB_REPEAT_NONE);
+ kdb_register_repeat("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0, KDB_REPEAT_NONE);
+}
+
+/* Execute any commands defined in kdb_cmds. */
+static void __init kdb_cmd_init(void)
+{
+ int i, diag;
+ for (i = 0; kdb_cmds[i]; ++i) {
+ diag = kdb_parse(kdb_cmds[i]);
+ if (diag)
+ kdb_printf("kdb command %s failed, kdb diag %d\n",
+ kdb_cmds[i], diag);
+ }
+ if (defcmd_in_progress) {
+ kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
+ kdb_parse("endefcmd");
+ }
+}
+
+/* Initialize kdb_printf, breakpoint tables and kdb state */
+void __init kdb_init(int lvl)
+{
+ static int kdb_init_lvl = KDB_NOT_INITIALIZED;
+ int i;
+
+ if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
+ return;
+ for (i = kdb_init_lvl; i < lvl; i++) {
+ switch (i) {
+ case KDB_NOT_INITIALIZED:
+ kdb_inittab(); /* Initialize Command Table */
+ kdb_initbptab(); /* Initialize Breakpoints */
+ break;
+ case KDB_INIT_EARLY:
+ kdb_cmd_init(); /* Build kdb_cmds tables */
+ break;
+ }
+ }
+ kdb_init_lvl = lvl;
+}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 00000000000..7afd3c8c41d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,260 @@
+#ifndef _KDBPRIVATE_H
+#define _KDBPRIVATE_H
+
+/*
+ * Kernel Debugger Architecture Independent Private Headers
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kgdb.h>
+#include "../debug_core.h"
+
+/* Kernel Debugger Command codes. Must not overlap with error codes. */
+#define KDB_CMD_GO (-1001)
+#define KDB_CMD_CPU (-1002)
+#define KDB_CMD_SS (-1003)
+#define KDB_CMD_KGDB (-1005)
+
+/* Internal debug flags */
+#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
+#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
+#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
+#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
+#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
+#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
+#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
+#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
+
+#define KDB_DEBUG(flag) (kdb_flags & \
+ (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
+#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
+ kdb_print_state(text, value)
+
+#if BITS_PER_LONG == 32
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%08lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%08lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%08x"
+#define kdb_f_count_fmt "%d"
+
+#elif BITS_PER_LONG == 64
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%016lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%016lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%016x"
+#define kdb_f_count_fmt "%ld"
+
+#endif
+
+/*
+ * KDB_MAXBPT describes the total number of breakpoints
+ * supported by this architecure.
+ */
+#define KDB_MAXBPT 16
+
+/* Symbol table format returned by kallsyms. */
+typedef struct __ksymtab {
+ unsigned long value; /* Address of symbol */
+ const char *mod_name; /* Module containing symbol or
+ * "kernel" */
+ unsigned long mod_start;
+ unsigned long mod_end;
+ const char *sec_name; /* Section containing symbol */
+ unsigned long sec_start;
+ unsigned long sec_end;
+ const char *sym_name; /* Full symbol name, including
+ * any version */
+ unsigned long sym_start;
+ unsigned long sym_end;
+ } kdb_symtab_t;
+extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
+
+/* Exported Symbols for kernel loadable modules to use. */
+extern int kdb_getarea_size(void *, unsigned long, size_t);
+extern int kdb_putarea_size(unsigned long, void *, size_t);
+
+/*
+ * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
+ * names, not pointers. The underlying *_size functions take pointers.
+ */
+#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
+#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
+
+extern int kdb_getphysword(unsigned long *word,
+ unsigned long addr, size_t size);
+extern int kdb_getword(unsigned long *, unsigned long, size_t);
+extern int kdb_putword(unsigned long, unsigned long, size_t);
+
+extern int kdbgetularg(const char *, unsigned long *);
+extern int kdbgetu64arg(const char *, u64 *);
+extern char *kdbgetenv(const char *);
+extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
+ long *, char **);
+extern int kdbgetsymval(const char *, kdb_symtab_t *);
+extern int kdbnearsym(unsigned long, kdb_symtab_t *);
+extern void kdbnearsym_cleanup(void);
+extern char *kdb_strdup(const char *str, gfp_t type);
+extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
+
+/* Routine for debugging the debugger state. */
+extern void kdb_print_state(const char *, int);
+
+extern int kdb_state;
+#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
+#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
+#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
+#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
+ * kdb control */
+#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
+#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
+#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
+ * after one ss, independent of
+ * DOING_SS */
+#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
+#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
+#define KDB_STATE_PAGER 0x00000400 /* pager is available */
+#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
+ * back to initial cpu */
+#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
+#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
+#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
+#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
+ * adjusted */
+#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
+#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
+ * keyboard on this cpu */
+#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
+#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
+#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
+#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
+ * specific use */
+
+#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
+#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
+#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
+
+extern int kdb_nextline; /* Current number of lines displayed */
+
+typedef struct _kdb_bp {
+ unsigned long bp_addr; /* Address breakpoint is present at */
+ unsigned int bp_free:1; /* This entry is available */
+ unsigned int bp_enabled:1; /* Breakpoint is active in register */
+ unsigned int bp_type:4; /* Uses hardware register */
+ unsigned int bp_installed:1; /* Breakpoint is installed */
+ unsigned int bp_delay:1; /* Do delayed bp handling */
+ unsigned int bp_delayed:1; /* Delayed breakpoint */
+ unsigned int bph_length; /* HW break length */
+} kdb_bp_t;
+
+#ifdef CONFIG_KGDB_KDB
+extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
+
+/* The KDB shell command table */
+typedef struct _kdbtab {
+ char *cmd_name; /* Command name */
+ kdb_func_t cmd_func; /* Function to execute command */
+ char *cmd_usage; /* Usage String for this command */
+ char *cmd_help; /* Help message for this command */
+ short cmd_flags; /* Parsing flags */
+ short cmd_minlen; /* Minimum legal # command
+ * chars required */
+ kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
+} kdbtab_t;
+
+extern int kdb_bt(int, const char **); /* KDB display back trace */
+
+/* KDB breakpoint management functions */
+extern void kdb_initbptab(void);
+extern void kdb_bp_install(struct pt_regs *);
+extern void kdb_bp_remove(void);
+
+typedef enum {
+ KDB_DB_BPT, /* Breakpoint */
+ KDB_DB_SS, /* Single-step trap */
+ KDB_DB_SSBPT, /* Single step over breakpoint */
+ KDB_DB_NOBPT /* Spurious breakpoint */
+} kdb_dbtrap_t;
+
+extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
+ int, kdb_dbtrap_t, struct pt_regs *);
+
+/* Miscellaneous functions and data areas */
+extern int kdb_grepping_flag;
+extern char kdb_grep_string[];
+extern int kdb_grep_leading;
+extern int kdb_grep_trailing;
+extern char *kdb_cmds[];
+extern unsigned long kdb_task_state_string(const char *);
+extern char kdb_task_state_char (const struct task_struct *);
+extern unsigned long kdb_task_state(const struct task_struct *p,
+ unsigned long mask);
+extern void kdb_ps_suppressed(void);
+extern void kdb_ps1(const struct task_struct *p);
+extern void kdb_print_nameval(const char *name, unsigned long val);
+extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
+extern void kdb_meminfo_proc_show(void);
+extern char *kdb_getstr(char *, size_t, char *);
+extern void kdb_gdb_state_pass(char *buf);
+
+/* Defines for kdb_symbol_print */
+#define KDB_SP_SPACEB 0x0001 /* Space before string */
+#define KDB_SP_SPACEA 0x0002 /* Space after string */
+#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
+#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
+#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
+#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
+#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
+
+#define KDB_TSK(cpu) kgdb_info[cpu].task
+#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
+
+extern struct task_struct *kdb_curr_task(int);
+
+#define kdb_task_has_cpu(p) (task_curr(p))
+
+/* Simplify coexistence with NPTL */
+#define kdb_do_each_thread(g, p) do_each_thread(g, p)
+#define kdb_while_each_thread(g, p) while_each_thread(g, p)
+
+#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+
+extern void *debug_kmalloc(size_t size, gfp_t flags);
+extern void debug_kfree(void *);
+extern void debug_kusage(void);
+
+extern void kdb_set_current_task(struct task_struct *);
+extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
+#ifdef CONFIG_MODULES
+extern struct list_head *kdb_modules;
+#endif /* CONFIG_MODULES */
+
+extern char kdb_prompt_str[];
+
+#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
+
+#endif /* CONFIG_KGDB_KDB */
+#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 00000000000..d35cc2d3a4c
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
+/*
+ * Kernel Debugger Architecture Independent Support Functions
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
+ */
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kallsyms.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/ptrace.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/kdb.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+/*
+ * kdbgetsymval - Return the address of the given symbol.
+ *
+ * Parameters:
+ * symname Character string containing symbol name
+ * symtab Structure to receive results
+ * Returns:
+ * 0 Symbol not found, symtab zero filled
+ * 1 Symbol mapped to module/symbol/section, data in symtab
+ */
+int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
+{
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
+ symtab);
+ memset(symtab, 0, sizeof(*symtab));
+ symtab->sym_start = kallsyms_lookup_name(symname);
+ if (symtab->sym_start) {
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 1, "
+ "symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
+ return 1;
+ }
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 0\n");
+ return 0;
+}
+EXPORT_SYMBOL(kdbgetsymval);
+
+static char *kdb_name_table[100]; /* arbitrary size */
+
+/*
+ * kdbnearsym - Return the name of the symbol with the nearest address
+ * less than 'addr'.
+ *
+ * Parameters:
+ * addr Address to check for symbol near
+ * symtab Structure to receive results
+ * Returns:
+ * 0 No sections contain this address, symtab zero filled
+ * 1 Address mapped to module/symbol/section, data in symtab
+ * Remarks:
+ * 2.6 kallsyms has a "feature" where it unpacks the name into a
+ * string. If that string is reused before the caller expects it
+ * then the caller sees its string change without warning. To
+ * avoid cluttering up the main kdb code with lots of kdb_strdup,
+ * tests and kfree calls, kdbnearsym maintains an LRU list of the
+ * last few unique strings. The list is sized large enough to
+ * hold active strings, no kdb caller of kdbnearsym makes more
+ * than ~20 later calls before using a saved value.
+ */
+int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
+{
+ int ret = 0;
+ unsigned long symbolsize = 0;
+ unsigned long offset = 0;
+#define knt1_size 128 /* must be >= kallsyms table size */
+ char *knt1 = NULL;
+
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ memset(symtab, 0, sizeof(*symtab));
+
+ if (addr < 4096)
+ goto out;
+ knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
+ if (!knt1) {
+ kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
+ addr);
+ goto out;
+ }
+ symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
+ (char **)(&symtab->mod_name), knt1);
+ if (offset > 8*1024*1024) {
+ symtab->sym_name = NULL;
+ addr = offset = symbolsize = 0;
+ }
+ symtab->sym_start = addr - offset;
+ symtab->sym_end = symtab->sym_start + symbolsize;
+ ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
+
+ if (ret) {
+ int i;
+ /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
+ * set but the buffer passed into kallsyms_lookup is not used,
+ * so it contains garbage. The caller has to work out which
+ * buffer needs to be saved.
+ *
+ * What was Rusty smoking when he wrote that code?
+ */
+ if (symtab->sym_name != knt1) {
+ strncpy(knt1, symtab->sym_name, knt1_size);
+ knt1[knt1_size-1] = '\0';
+ }
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i] &&
+ strcmp(kdb_name_table[i], knt1) == 0)
+ break;
+ }
+ if (i >= ARRAY_SIZE(kdb_name_table)) {
+ debug_kfree(kdb_name_table[0]);
+ memcpy(kdb_name_table, kdb_name_table+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-1));
+ } else {
+ debug_kfree(knt1);
+ knt1 = kdb_name_table[i];
+ memcpy(kdb_name_table+i, kdb_name_table+i+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-i-1));
+ }
+ i = ARRAY_SIZE(kdb_name_table) - 1;
+ kdb_name_table[i] = knt1;
+ symtab->sym_name = kdb_name_table[i];
+ knt1 = NULL;
+ }
+
+ if (symtab->mod_name == NULL)
+ symtab->mod_name = "kernel";
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
+ "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
+ symtab->sym_start, symtab->mod_name, symtab->sym_name,
+ symtab->sym_name);
+
+out:
+ debug_kfree(knt1);
+ return ret;
+}
+
+void kdbnearsym_cleanup(void)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i]) {
+ debug_kfree(kdb_name_table[i]);
+ kdb_name_table[i] = NULL;
+ }
+ }
+}
+
+static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
+
+/*
+ * kallsyms_symbol_complete
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * max_len maximum length that can be returned
+ * Returns:
+ * Number of symbols which match the given prefix.
+ * Notes:
+ * prefix_name is changed to contain the longest unique prefix that
+ * starts with this prefix (tab completion).
+ */
+int kallsyms_symbol_complete(char *prefix_name, int max_len)
+{
+ loff_t pos = 0;
+ int prefix_len = strlen(prefix_name), prev_len = 0;
+ int i, number = 0;
+ const char *name;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strcpy(ks_namebuf, name);
+ /* Work out the longest name that matches the prefix */
+ if (++number == 1) {
+ prev_len = min_t(int, max_len-1,
+ strlen(ks_namebuf));
+ memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
+ ks_namebuf_prev[prev_len] = '\0';
+ continue;
+ }
+ for (i = 0; i < prev_len; i++) {
+ if (ks_namebuf[i] != ks_namebuf_prev[i]) {
+ prev_len = i;
+ ks_namebuf_prev[i] = '\0';
+ break;
+ }
+ }
+ }
+ }
+ if (prev_len > prefix_len)
+ memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
+ return number;
+}
+
+/*
+ * kallsyms_symbol_next
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * flag 0 means search from the head, 1 means continue search.
+ * Returns:
+ * 1 if a symbol matches the given prefix.
+ * 0 if no string found
+ */
+int kallsyms_symbol_next(char *prefix_name, int flag)
+{
+ int prefix_len = strlen(prefix_name);
+ static loff_t pos;
+ const char *name;
+
+ if (!flag)
+ pos = 0;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strncpy(prefix_name, name, strlen(name)+1);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * kdb_symbol_print - Standard method for printing a symbol name and offset.
+ * Inputs:
+ * addr Address to be printed.
+ * symtab Address of symbol data, if NULL this routine does its
+ * own lookup.
+ * punc Punctuation for string, bit field.
+ * Remarks:
+ * The string and its punctuation is only printed if the address
+ * is inside the kernel, except that the value is always printed
+ * when requested.
+ */
+void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
+ unsigned int punc)
+{
+ kdb_symtab_t symtab, *symtab_p2;
+ if (symtab_p) {
+ symtab_p2 = (kdb_symtab_t *)symtab_p;
+ } else {
+ symtab_p2 = &symtab;
+ kdbnearsym(addr, symtab_p2);
+ }
+ if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
+ return;
+ if (punc & KDB_SP_SPACEB)
+ kdb_printf(" ");
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(kdb_machreg_fmt0, addr);
+ if (symtab_p2->sym_name) {
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(" ");
+ if (punc & KDB_SP_PAREN)
+ kdb_printf("(");
+ if (strcmp(symtab_p2->mod_name, "kernel"))
+ kdb_printf("[%s]", symtab_p2->mod_name);
+ kdb_printf("%s", symtab_p2->sym_name);
+ if (addr != symtab_p2->sym_start)
+ kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
+ if (punc & KDB_SP_SYMSIZE)
+ kdb_printf("/0x%lx",
+ symtab_p2->sym_end - symtab_p2->sym_start);
+ if (punc & KDB_SP_PAREN)
+ kdb_printf(")");
+ }
+ if (punc & KDB_SP_SPACEA)
+ kdb_printf(" ");
+ if (punc & KDB_SP_NEWLINE)
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_strdup - kdb equivalent of strdup, for disasm code.
+ * Inputs:
+ * str The string to duplicate.
+ * type Flags to kmalloc for the new string.
+ * Returns:
+ * Address of the new string, NULL if storage could not be allocated.
+ * Remarks:
+ * This is not in lib/string.c because it uses kmalloc which is not
+ * available when string.o is used in boot loaders.
+ */
+char *kdb_strdup(const char *str, gfp_t type)
+{
+ int n = strlen(str)+1;
+ char *s = kmalloc(n, type);
+ if (!s)
+ return NULL;
+ return strcpy(s, str);
+}
+
+/*
+ * kdb_getarea_size - Read an area of data. The kdb equivalent of
+ * copy_from_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * res Pointer to the area to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getarea_size(void *res, unsigned long addr, size_t size)
+{
+ int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_putarea_size - Write an area of data. The kdb equivalent of
+ * copy_to_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * addr Address of the area to write to.
+ * res Pointer to the area holding the data.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putarea_size(unsigned long addr, void *res, size_t size)
+{
+ int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_getphys - Read data from a physical address. Validate the
+ * address is in range, use kmap_atomic() to get data
+ * similar to kdb_getarea() - but for phys addresses
+ * Inputs:
+ * res Pointer to the word to receive the result
+ * addr Physical address of the area to copy
+ * size Size of the area
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+static int kdb_getphys(void *res, unsigned long addr, size_t size)
+{
+ unsigned long pfn;
+ void *vaddr;
+ struct page *page;
+
+ pfn = (addr >> PAGE_SHIFT);
+ if (!pfn_valid(pfn))
+ return 1;
+ page = pfn_to_page(pfn);
+ vaddr = kmap_atomic(page);
+ memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
+ kunmap_atomic(vaddr);
+
+ return 0;
+}
+
+/*
+ * kdb_getphysword
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+
+ switch (size) {
+ case 1:
+ diag = kdb_getphys(&w1, addr, sizeof(w1));
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getphys(&w2, addr, sizeof(w2));
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getphys(&w4, addr, sizeof(w4));
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getphys(&w8, addr, sizeof(w8));
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
+ * data as numbers.
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+ switch (size) {
+ case 1:
+ diag = kdb_getarea(w1, addr);
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getarea(w2, addr);
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getarea(w4, addr);
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getarea(w8, addr);
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_putword - Write a binary value. Unlike kdb_putarea, this
+ * treats data as numbers.
+ * Inputs:
+ * addr Address of the area to write to..
+ * word The value to set.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putword(unsigned long addr, unsigned long word, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ switch (size) {
+ case 1:
+ w1 = word;
+ diag = kdb_putarea(addr, w1);
+ break;
+ case 2:
+ w2 = word;
+ diag = kdb_putarea(addr, w2);
+ break;
+ case 4:
+ w4 = word;
+ diag = kdb_putarea(addr, w4);
+ break;
+ case 8:
+ if (size <= sizeof(word)) {
+ w8 = word;
+ diag = kdb_putarea(addr, w8);
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_task_state_string - Convert a string containing any of the
+ * letters DRSTCZEUIMA to a mask for the process state field and
+ * return the value. If no argument is supplied, return the mask
+ * that corresponds to environment variable PS, DRSTCZEU by
+ * default.
+ * Inputs:
+ * s String to convert
+ * Returns:
+ * Mask for process state.
+ * Notes:
+ * The mask folds data from several sources into a single long value, so
+ * be careful not to overlap the bits. TASK_* bits are in the LSB,
+ * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
+ * is no overlap between TASK_* and EXIT_* but that may not always be
+ * true, so EXIT_* bits are shifted left 16 bits before being stored in
+ * the mask.
+ */
+
+/* unrunnable is < 0 */
+#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
+#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
+#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
+#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
+
+unsigned long kdb_task_state_string(const char *s)
+{
+ long res = 0;
+ if (!s) {
+ s = kdbgetenv("PS");
+ if (!s)
+ s = "DRSTCZEU"; /* default value for ps */
+ }
+ while (*s) {
+ switch (*s) {
+ case 'D':
+ res |= TASK_UNINTERRUPTIBLE;
+ break;
+ case 'R':
+ res |= RUNNING;
+ break;
+ case 'S':
+ res |= TASK_INTERRUPTIBLE;
+ break;
+ case 'T':
+ res |= TASK_STOPPED;
+ break;
+ case 'C':
+ res |= TASK_TRACED;
+ break;
+ case 'Z':
+ res |= EXIT_ZOMBIE << 16;
+ break;
+ case 'E':
+ res |= EXIT_DEAD << 16;
+ break;
+ case 'U':
+ res |= UNRUNNABLE;
+ break;
+ case 'I':
+ res |= IDLE;
+ break;
+ case 'M':
+ res |= DAEMON;
+ break;
+ case 'A':
+ res = ~0UL;
+ break;
+ default:
+ kdb_printf("%s: unknown flag '%c' ignored\n",
+ __func__, *s);
+ break;
+ }
+ ++s;
+ }
+ return res;
+}
+
+/*
+ * kdb_task_state_char - Return the character that represents the task state.
+ * Inputs:
+ * p struct task for the process
+ * Returns:
+ * One character to represent the task state.
+ */
+char kdb_task_state_char (const struct task_struct *p)
+{
+ int cpu;
+ char state;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return 'E';
+
+ cpu = kdb_process_cpu(p);
+ state = (p->state == 0) ? 'R' :
+ (p->state < 0) ? 'U' :
+ (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+ (p->state & TASK_STOPPED) ? 'T' :
+ (p->state & TASK_TRACED) ? 'C' :
+ (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
+ (p->exit_state & EXIT_DEAD) ? 'E' :
+ (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ if (is_idle_task(p)) {
+ /* Idle task. Is it really idle, apart from the kdb
+ * interrupt? */
+ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
+ if (cpu != kdb_initial_cpu)
+ state = 'I'; /* idle task */
+ }
+ } else if (!p->mm && state == 'S') {
+ state = 'M'; /* sleeping system daemon */
+ }
+ return state;
+}
+
+/*
+ * kdb_task_state - Return true if a process has the desired state
+ * given by the mask.
+ * Inputs:
+ * p struct task for the process
+ * mask mask from kdb_task_state_string to select processes
+ * Returns:
+ * True if the process matches at least one criteria defined by the mask.
+ */
+unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+{
+ char state[] = { kdb_task_state_char(p), '\0' };
+ return (mask & kdb_task_state_string(state)) != 0;
+}
+
+/*
+ * kdb_print_nameval - Print a name and its value, converting the
+ * value to a symbol lookup if possible.
+ * Inputs:
+ * name field name to print
+ * val value of field
+ */
+void kdb_print_nameval(const char *name, unsigned long val)
+{
+ kdb_symtab_t symtab;
+ kdb_printf(" %-11.11s ", name);
+ if (kdbnearsym(val, &symtab))
+ kdb_symbol_print(val, &symtab,
+ KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
+ else
+ kdb_printf("0x%lx\n", val);
+}
+
+/* Last ditch allocator for debugging, so we can still debug even when
+ * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
+ * for space usage, not for speed. One smallish memory pool, the free
+ * chain is always in ascending address order to allow coalescing,
+ * allocations are done in brute force best fit.
+ */
+
+struct debug_alloc_header {
+ u32 next; /* offset of next header from start of pool */
+ u32 size;
+ void *caller;
+};
+
+/* The memory returned by this allocator must be aligned, which means
+ * so must the header size. Do not assume that sizeof(struct
+ * debug_alloc_header) is a multiple of the alignment, explicitly
+ * calculate the overhead of this header, including the alignment.
+ * The rest of this code must not use sizeof() on any header or
+ * pointer to a header.
+ */
+#define dah_align 8
+#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
+
+static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
+static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
+static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
+
+/* Locking is awkward. The debug code is called from all contexts,
+ * including non maskable interrupts. A normal spinlock is not safe
+ * in NMI context. Try to get the debug allocator lock, if it cannot
+ * be obtained after a second then give up. If the lock could not be
+ * previously obtained on this cpu then only try once.
+ *
+ * sparse has no annotation for "this function _sometimes_ acquires a
+ * lock", so fudge the acquire/release notation.
+ */
+static DEFINE_SPINLOCK(dap_lock);
+static int get_dap_lock(void)
+ __acquires(dap_lock)
+{
+ static int dap_locked = -1;
+ int count;
+ if (dap_locked == smp_processor_id())
+ count = 1;
+ else
+ count = 1000;
+ while (1) {
+ if (spin_trylock(&dap_lock)) {
+ dap_locked = -1;
+ return 1;
+ }
+ if (!count--)
+ break;
+ udelay(1000);
+ }
+ dap_locked = smp_processor_id();
+ __acquire(dap_lock);
+ return 0;
+}
+
+void *debug_kmalloc(size_t size, gfp_t flags)
+{
+ unsigned int rem, h_offset;
+ struct debug_alloc_header *best, *bestprev, *prev, *h;
+ void *p = NULL;
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return NULL;
+ }
+ h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first_call) {
+ h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
+ dah_first_call = 0;
+ }
+ size = ALIGN(size, dah_align);
+ prev = best = bestprev = NULL;
+ while (1) {
+ if (h->size >= size && (!best || h->size < best->size)) {
+ best = h;
+ bestprev = prev;
+ if (h->size == size)
+ break;
+ }
+ if (!h->next)
+ break;
+ prev = h;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
+ }
+ if (!best)
+ goto out;
+ rem = best->size - size;
+ /* The pool must always contain at least one header */
+ if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
+ goto out;
+ if (rem >= dah_overhead) {
+ best->size = size;
+ h_offset = ((char *)best - debug_alloc_pool) +
+ dah_overhead + best->size;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
+ h->size = rem - dah_overhead;
+ h->next = best->next;
+ } else
+ h_offset = best->next;
+ best->caller = __builtin_return_address(0);
+ dah_used += best->size;
+ dah_used_max = max(dah_used, dah_used_max);
+ if (bestprev)
+ bestprev->next = h_offset;
+ else
+ dah_first = h_offset;
+ p = (char *)best + dah_overhead;
+ memset(p, POISON_INUSE, best->size - 1);
+ *((char *)p + best->size - 1) = POISON_END;
+out:
+ spin_unlock(&dap_lock);
+ return p;
+}
+
+void debug_kfree(void *p)
+{
+ struct debug_alloc_header *h;
+ unsigned int h_offset;
+ if (!p)
+ return;
+ if ((char *)p < debug_alloc_pool ||
+ (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
+ kfree(p);
+ return;
+ }
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return; /* memory leak, cannot be helped */
+ }
+ h = (struct debug_alloc_header *)((char *)p - dah_overhead);
+ memset(p, POISON_FREE, h->size - 1);
+ *((char *)p + h->size - 1) = POISON_END;
+ h->caller = NULL;
+ dah_used -= h->size;
+ h_offset = (char *)h - debug_alloc_pool;
+ if (h_offset < dah_first) {
+ h->next = dah_first;
+ dah_first = h_offset;
+ } else {
+ struct debug_alloc_header *prev;
+ unsigned int prev_offset;
+ prev = (struct debug_alloc_header *)(debug_alloc_pool +
+ dah_first);
+ while (1) {
+ if (!prev->next || prev->next > h_offset)
+ break;
+ prev = (struct debug_alloc_header *)
+ (debug_alloc_pool + prev->next);
+ }
+ prev_offset = (char *)prev - debug_alloc_pool;
+ if (prev_offset + dah_overhead + prev->size == h_offset) {
+ prev->size += dah_overhead + h->size;
+ memset(h, POISON_FREE, dah_overhead - 1);
+ *((char *)h + dah_overhead - 1) = POISON_END;
+ h = prev;
+ h_offset = prev_offset;
+ } else {
+ h->next = prev->next;
+ prev->next = h_offset;
+ }
+ }
+ if (h_offset + dah_overhead + h->size == h->next) {
+ struct debug_alloc_header *next;
+ next = (struct debug_alloc_header *)
+ (debug_alloc_pool + h->next);
+ h->size += dah_overhead + next->size;
+ h->next = next->next;
+ memset(next, POISON_FREE, dah_overhead - 1);
+ *((char *)next + dah_overhead - 1) = POISON_END;
+ }
+ spin_unlock(&dap_lock);
+}
+
+void debug_kusage(void)
+{
+ struct debug_alloc_header *h_free, *h_used;
+#ifdef CONFIG_IA64
+ /* FIXME: using dah for ia64 unwind always results in a memory leak.
+ * Fix that memory leak first, then set debug_kusage_one_time = 1 for
+ * all architectures.
+ */
+ static int debug_kusage_one_time;
+#else
+ static int debug_kusage_one_time = 1;
+#endif
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return;
+ }
+ h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first == 0 &&
+ (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
+ dah_first_call))
+ goto out;
+ if (!debug_kusage_one_time)
+ goto out;
+ debug_kusage_one_time = 0;
+ kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
+ __func__, dah_first);
+ if (dah_first) {
+ h_used = (struct debug_alloc_header *)debug_alloc_pool;
+ kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
+ h_used->size);
+ }
+ do {
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+ h_free = (struct debug_alloc_header *)
+ (debug_alloc_pool + h_free->next);
+ } while (h_free->next);
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ if ((char *)h_used - debug_alloc_pool !=
+ sizeof(debug_alloc_pool_aligned))
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+out:
+ spin_unlock(&dap_lock);
+}
+
+/* Maintain a small stack of kdb_flags to allow recursion without disturbing
+ * the global kdb state.
+ */
+
+static int kdb_flags_stack[4], kdb_flags_index;
+
+void kdb_save_flags(void)
+{
+ BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
+ kdb_flags_stack[kdb_flags_index++] = kdb_flags;
+}
+
+void kdb_restore_flags(void)
+{
+ BUG_ON(kdb_flags_index <= 0);
+ kdb_flags = kdb_flags_stack[--kdb_flags_index];
+}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e..54996b71e66 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,11 +15,14 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/taskstats.h>
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
+#include <linux/module.h>
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
struct kmem_cache *delayacct_cache;
static int __init delayacct_setup_disable(char *str)
@@ -103,20 +106,17 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
-
- /* Though tsk->delays accessed later, early exit avoids
- * unnecessary returning of other data
- */
- if (!tsk->delays)
- goto done;
+ cputime_t utime, stime, stimescaled, utimescaled;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -152,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
-done:
return 0;
}
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f2..00000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Coherent per-device memory handling.
- * Borrowed from i386
- */
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-
-struct dma_coherent_mem {
- void *virt_base;
- u32 device_base;
- int size;
- int flags;
- unsigned long *bitmap;
-};
-
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
- dma_addr_t device_addr, size_t size, int flags)
-{
- void __iomem *mem_base = NULL;
- int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
- goto out;
- if (!size)
- goto out;
- if (dev->dma_mem)
- goto out;
-
- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
- mem_base = ioremap(bus_addr, size);
- if (!mem_base)
- goto out;
-
- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dev->dma_mem)
- goto out;
- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dev->dma_mem->bitmap)
- goto free1_out;
-
- dev->dma_mem->virt_base = mem_base;
- dev->dma_mem->device_base = device_addr;
- dev->dma_mem->size = pages;
- dev->dma_mem->flags = flags;
-
- if (flags & DMA_MEMORY_MAP)
- return DMA_MEMORY_MAP;
-
- return DMA_MEMORY_IO;
-
- free1_out:
- kfree(dev->dma_mem);
- out:
- if (mem_base)
- iounmap(mem_base);
- return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dev->dma_mem = NULL;
- iounmap(mem->virt_base);
- kfree(mem->bitmap);
- kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- int pos, err;
-
- size += device_addr & ~PAGE_MASK;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-/**
- * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
- *
- * @dev: device from which we allocate memory
- * @size: size of requested memory area
- * @dma_handle: This will be filled with the correct dma handle
- * @ret: This pointer will be filled with the virtual address
- * to allocated area.
- *
- * This function should be only called from per-arch dma_alloc_coherent()
- * to support allocation from per-device coherent memory pools.
- *
- * Returns 0 if dma_alloc_coherent should continue with allocating from
- * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
- */
-int dma_alloc_from_coherent(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle, void **ret)
-{
- struct dma_coherent_mem *mem;
- int order = get_order(size);
- int pageno;
-
- if (!dev)
- return 0;
- mem = dev->dma_mem;
- if (!mem)
- return 0;
-
- *ret = NULL;
-
- if (unlikely(size > (mem->size << PAGE_SHIFT)))
- goto err;
-
- pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
- if (unlikely(pageno < 0))
- goto err;
-
- /*
- * Memory was found in the per-device area.
- */
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
- *ret = mem->virt_base + (pageno << PAGE_SHIFT);
- memset(*ret, 0, size);
-
- return 1;
-
-err:
- /*
- * In the case where the allocation can not be satisfied from the
- * per-device area, try to fall back to generic memory if the
- * constraints allow it.
- */
- return mem->flags & DMA_MEMORY_EXCLUSIVE;
-}
-EXPORT_SYMBOL(dma_alloc_from_coherent);
-
-/**
- * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
- * @dev: device from which the memory was allocated
- * @order: the order of pages allocated
- * @vaddr: virtual address of allocated pages
- *
- * This checks whether the memory was allocated from the per-device
- * coherent memory pool and if so, releases that memory.
- *
- * Returns 1 if we correctly released the memory, or 0 if
- * dma_release_coherent() should proceed with releasing memory from
- * generic pools.
- */
-int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
- if (mem && vaddr >= mem->virt_base && vaddr <
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
- bitmap_release_region(mem->bitmap, page, order);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c530..6c6262f86c1 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
* [It also happened to remove the sizeof(char *) == sizeof(int)
* assumption introduced because of those /proc/dma patches. -- Hennus]
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
-#include <asm/system.h>
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 00000000000..e556751d15d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,24 @@
+#include <linux/elf.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/binfmts.h>
+
+Elf_Half __weak elf_core_extra_phdrs(void)
+{
+ return 0;
+}
+
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+{
+ return 1;
+}
+
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
+{
+ return 1;
+}
+
+size_t __weak elf_core_extra_data_size(void)
+{
+ return 0;
+}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 00000000000..103f5d147b2
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,9 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_core.o = -pg
+endif
+
+obj-y := core.o ring_buffer.o callchain.o
+
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 00000000000..97b67df8fbf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,209 @@
+/*
+ * Performance events callchain code, extracted from core.c:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct callchain_cpus_entries {
+ struct rcu_head rcu_head;
+ struct perf_callchain_entry *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+ struct callchain_cpus_entries *entries;
+ int cpu;
+
+ entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+
+ kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+ struct callchain_cpus_entries *entries;
+
+ entries = callchain_cpus_entries;
+ rcu_assign_pointer(callchain_cpus_entries, NULL);
+ call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+ int cpu;
+ int size;
+ struct callchain_cpus_entries *entries;
+
+ /*
+ * We can't use the percpu allocation API for data that can be
+ * accessed from NMI. Use a temporary manual per cpu allocation
+ * until that gets sorted out.
+ */
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+ entries = kzalloc(size, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+ for_each_possible_cpu(cpu) {
+ entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!entries->cpu_entries[cpu])
+ goto fail;
+ }
+
+ rcu_assign_pointer(callchain_cpus_entries, entries);
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+ kfree(entries);
+
+ return -ENOMEM;
+}
+
+int get_callchain_buffers(void)
+{
+ int err = 0;
+ int count;
+
+ mutex_lock(&callchain_mutex);
+
+ count = atomic_inc_return(&nr_callchain_events);
+ if (WARN_ON_ONCE(count < 1)) {
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (count > 1) {
+ /* If the allocation failed, give up */
+ if (!callchain_cpus_entries)
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ err = alloc_callchain_buffers();
+exit:
+ if (err)
+ atomic_dec(&nr_callchain_events);
+
+ mutex_unlock(&callchain_mutex);
+
+ return err;
+}
+
+void put_callchain_buffers(void)
+{
+ if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+ release_callchain_buffers();
+ mutex_unlock(&callchain_mutex);
+ }
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+ int cpu;
+ struct callchain_cpus_entries *entries;
+
+ *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ if (*rctx == -1)
+ return NULL;
+
+ entries = rcu_dereference(callchain_cpus_entries);
+ if (!entries)
+ return NULL;
+
+ cpu = smp_processor_id();
+
+ return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+ put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+ int rctx;
+ struct perf_callchain_entry *entry;
+
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
+
+ entry = get_callchain_entry(&rctx);
+ if (rctx == -1)
+ return NULL;
+
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = 0;
+
+ if (kernel && !user_mode(regs)) {
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(entry, regs);
+ }
+
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
new file mode 100644