diff options
Diffstat (limited to 'init')
| -rw-r--r-- | init/Kconfig | 1828 | ||||
| -rw-r--r-- | init/Makefile | 27 | ||||
| -rw-r--r-- | init/calibrate.c | 287 | ||||
| -rw-r--r-- | init/do_mounts.c | 422 | ||||
| -rw-r--r-- | init/do_mounts.h | 19 | ||||
| -rw-r--r-- | init/do_mounts_devfs.c | 137 | ||||
| -rw-r--r-- | init/do_mounts_initrd.c | 83 | ||||
| -rw-r--r-- | init/do_mounts_md.c | 96 | ||||
| -rw-r--r-- | init/do_mounts_rd.c | 278 | ||||
| -rw-r--r-- | init/init_task.c | 26 | ||||
| -rw-r--r-- | init/initramfs.c | 410 | ||||
| -rw-r--r-- | init/main.c | 964 | ||||
| -rw-r--r-- | init/noinitramfs.c | 52 | ||||
| -rw-r--r-- | init/version.c | 37 |
14 files changed, 3367 insertions, 1299 deletions
diff --git a/init/Kconfig b/init/Kconfig index 3dcbd5bfd49..9d76b99af1b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1,71 +1,71 @@ -menu "Code maturity level options" +config ARCH + string + option env="ARCH" + +config KERNELVERSION + string + option env="KERNELVERSION" + +config DEFCONFIG_LIST + string + depends on !UML + option defconfig_list + default "/lib/modules/$UNAME_RELEASE/.config" + default "/etc/kernel-config" + default "/boot/config-$UNAME_RELEASE" + default "$ARCH_DEFCONFIG" + default "arch/$ARCH/defconfig" + +config CONSTRUCTORS + bool + depends on !UML -config EXPERIMENTAL - bool "Prompt for development and/or incomplete code/drivers" - ---help--- - Some of the various things that Linux supports (such as network - drivers, file systems, network protocols, etc.) can be in a state - of development where the functionality, stability, or the level of - testing is not yet high enough for general use. This is usually - known as the "alpha-test" phase among developers. If a feature is - currently in alpha-test, then the developers usually discourage - uninformed widespread use of this feature by the general public to - avoid "Why doesn't this work?" type mail messages. However, active - testing and use of these systems is welcomed. Just be aware that it - may not meet the normal level of reliability or it may fail to work - in some special cases. Detailed bug reports from people familiar - with the kernel internals are usually welcomed by the developers - (before submitting bug reports, please read the documents - <file:README>, <file:MAINTAINERS>, <file:REPORTING-BUGS>, - <file:Documentation/BUG-HUNTING>, and - <file:Documentation/oops-tracing.txt> in the kernel source). - - This option will also make obsoleted drivers available. These are - drivers that have been replaced by something else, and/or are - scheduled to be removed in a future kernel release. - - Unless you intend to help test and develop a feature or driver that - falls into this category, or you have a situation that requires - using these features, you should probably say N here, which will - cause the configurator to present you with fewer choices. If - you say Y here, you will be offered the choice of using features or - drivers that are currently considered to be in the alpha-test phase. - -config CLEAN_COMPILE - bool "Select only drivers expected to compile cleanly" if EXPERIMENTAL - default y - help - Select this option if you don't even want to see the option - to configure known-broken drivers. - - If unsure, say Y +config IRQ_WORK + bool + +config BUILDTIME_EXTABLE_SORT + bool + +menu "General setup" config BROKEN bool - depends on !CLEAN_COMPILE - default y config BROKEN_ON_SMP bool depends on BROKEN || !SMP default y -config LOCK_KERNEL - bool - depends on SMP || PREEMPT - default y - config INIT_ENV_ARG_LIMIT int - default 32 if !USERMODE - default 128 if USERMODE + default 32 if !UML + default 128 if UML help Maximum of each of the number of arguments and environment variables passed to init from the kernel command line. -endmenu -menu "General setup" +config CROSS_COMPILE + string "Cross-compiler tool prefix" + help + Same as running 'make CROSS_COMPILE=prefix-' but stored for + default make runs in this kernel build directory. You don't + need to set this unless you want the configured kernel build + directory to select the cross-compiler automatically. + +config COMPILE_TEST + bool "Compile also drivers which will not load" + default n + help + Some drivers can be compiled on a different platform than they are + intended to be run on. Despite they cannot be loaded there (or even + when they load they cannot be used due to missing HW support), + developers still, opposing to distributors, might want to build such + drivers to compile-test them. + + If you are a developer and want to build everything available, say Y + here. If you are a user/distributor, say N here to exclude useless + drivers to be distributed. config LOCALVERSION string "Local version - append to kernel release" @@ -82,30 +82,144 @@ config LOCALVERSION_AUTO default y help This will try to automatically determine if the current tree is a - release tree by looking for git tags that - belong to the current top of tree revision. + release tree by looking for git tags that belong to the current + top of tree revision. A string of the format -gxxxxxxxx will be added to the localversion - if a git based tree is found. The string generated by this will be + if a git-based tree is found. The string generated by this will be appended after any matching localversion* files, and after the value - set in CONFIG_LOCALVERSION + set in CONFIG_LOCALVERSION. + + (The actual string used here is the first eight characters produced + by running the command: + + $ git rev-parse --verify HEAD + + which is done within the script "scripts/setlocalversion".) + +config HAVE_KERNEL_GZIP + bool + +config HAVE_KERNEL_BZIP2 + bool + +config HAVE_KERNEL_LZMA + bool + +config HAVE_KERNEL_XZ + bool + +config HAVE_KERNEL_LZO + bool + +config HAVE_KERNEL_LZ4 + bool + +choice + prompt "Kernel compression mode" + default KERNEL_GZIP + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 + help + The linux kernel is a kind of self-extracting executable. + Several compression algorithms are available, which differ + in efficiency, compression and decompression speed. + Compression speed is only relevant when building a kernel. + Decompression speed is relevant at each boot. + + If you have any problems with bzip2 or lzma compressed + kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older + version of this functionality (bzip2 only), for 2.4, was + supplied by Christian Ludwig) + + High compression options are mostly useful for users, who + are low on disk space (embedded systems), but for whom ram + size matters less. + + If in doubt, select 'gzip' + +config KERNEL_GZIP + bool "Gzip" + depends on HAVE_KERNEL_GZIP + help + The old and tried gzip compression. It provides a good balance + between compression ratio and decompression speed. + +config KERNEL_BZIP2 + bool "Bzip2" + depends on HAVE_KERNEL_BZIP2 + help + Its compression ratio and speed is intermediate. + Decompression speed is slowest among the choices. The kernel + size is about 10% smaller with bzip2, in comparison to gzip. + Bzip2 uses a large amount of memory. For modern kernels you + will need at least 8MB RAM or more for booting. + +config KERNEL_LZMA + bool "LZMA" + depends on HAVE_KERNEL_LZMA + help + This compression algorithm's ratio is best. Decompression speed + is between gzip and bzip2. Compression is slowest. + The kernel size is about 33% smaller with LZMA in comparison to gzip. + +config KERNEL_XZ + bool "XZ" + depends on HAVE_KERNEL_XZ + help + XZ uses the LZMA2 algorithm and instruction set specific + BCJ filters which can improve compression ratio of executable + code. The size of the kernel is about 30% smaller with XZ in + comparison to gzip. On architectures for which there is a BCJ + filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ + will create a few percent smaller kernel than plain LZMA. + + The speed is about the same as with LZMA: The decompression + speed of XZ is better than that of bzip2 but worse than gzip + and LZO. Compression is slow. + +config KERNEL_LZO + bool "LZO" + depends on HAVE_KERNEL_LZO + help + Its compression ratio is the poorest among the choices. The kernel + size is about 10% bigger than gzip; however its speed + (both compression and decompression) is the fastest. - Note: This requires Perl, and a git repository, but not necessarily - the git or cogito tools to be installed. +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. + A preliminary version of LZ4 de/compression tool is available at + <https://code.google.com/p/lz4/>. + + Its compression ratio is worse than LZO. The size of the kernel + is about 8% bigger than LZO. But the decompression speed is + faster than LZO. + +endchoice + +config DEFAULT_HOSTNAME + string "Default hostname" + default "(none)" + help + This option determines the default system hostname before userspace + calls sethostname(2). The kernel traditionally uses "(none)" here, + but you may wish to use a different default here to make a minimal + system more usable with less configuration. config SWAP bool "Support for paging of anonymous memory (swap)" - depends on MMU + depends on MMU && BLOCK default y help This option allows you to choose whether you want to have support - for socalled swap devices or swap files in your kernel that are + for so called swap devices or swap files in your kernel that are used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. config SYSVIPC bool "System V IPC" - depends on MMU ---help--- Inter Process Communication is a suite of library functions and system calls which let processes (running programs) synchronize and @@ -119,17 +233,21 @@ config SYSVIPC section 6.4 of the Linux Programmer's Guide, available from <http://www.tldp.org/guides.html>. +config SYSVIPC_SYSCTL + bool + depends on SYSVIPC + depends on SYSCTL + default y + config POSIX_MQUEUE bool "POSIX Message Queues" - depends on NET && EXPERIMENTAL + depends on NET ---help--- POSIX variant of message queues is a part of IPC. In POSIX message queues every message has a priority which decides about succession of receiving it by a process. If you want to compile and run programs written e.g. for Solaris with use of its POSIX message - queues (functions mq_*) say Y here. To use this feature you will - also need mqueue library, available from - <http://www.mat.uni.torun.pl/~wrona/posix_ipc/> + queues (functions mq_*) say Y here. POSIX message queues are visible as a filesystem called 'mqueue' and can be mounted somewhere if you want to do filesystem @@ -137,6 +255,143 @@ config POSIX_MQUEUE If unsure, say Y. +config POSIX_MQUEUE_SYSCTL + bool + depends on POSIX_MQUEUE + depends on SYSCTL + default y + +config CROSS_MEMORY_ATTACH + bool "Enable process_vm_readv/writev syscalls" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to to another process's address space. + See the man page for more details. + +config FHANDLE + bool "open by fhandle syscalls" + select EXPORTFS + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + +config USELIB + bool "uselib syscall" + default y + help + This option enables the uselib syscall, a system call used in the + dynamic linker from libc5 and earlier. glibc does not use this + system call. If you intend to run programs built on libc5 or + earlier, you may need to enable this syscall. Current systems + running glibc can safely disable this. + +config AUDIT + bool "Auditing support" + depends on NET + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for + logging of avc messages output). Does not do system-call + auditing without CONFIG_AUDITSYSCALL. + +config HAVE_ARCH_AUDITSYSCALL + bool + +config AUDITSYSCALL + bool "Enable system-call auditing support" + depends on AUDIT && HAVE_ARCH_AUDITSYSCALL + default y if SECURITY_SELINUX + help + Enable low-overhead system-call auditing infrastructure that + can be used independently or with another kernel subsystem, + such as SELinux. + +config AUDIT_WATCH + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY + +config AUDIT_TREE + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY + +source "kernel/irq/Kconfig" +source "kernel/time/Kconfig" + +menu "CPU/Task time and stats accounting" + +config VIRT_CPU_ACCOUNTING + bool + +choice + prompt "Cputime accounting" + default TICK_CPU_ACCOUNTING if !PPC64 + default VIRT_CPU_ACCOUNTING_NATIVE if PPC64 + +# Kind of a stub config for the pure tick based cputime accounting +config TICK_CPU_ACCOUNTING + bool "Simple tick based cputime accounting" + depends on !S390 && !NO_HZ_FULL + help + This is the basic tick based cputime accounting that maintains + statistics about user, system and idle time spent on per jiffies + granularity. + + If unsure, say Y. + +config VIRT_CPU_ACCOUNTING_NATIVE + bool "Deterministic task and CPU time accounting" + depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL + select VIRT_CPU_ACCOUNTING + help + Select this option to enable more accurate task and CPU time + accounting. This is done by reading a CPU counter on each + kernel entry and exit and on transitions within the kernel + between system, softirq and hardirq state, so there is a + small performance impact. In the case of s390 or IBM POWER > 5, + this also enables accounting of stolen time on logically-partitioned + systems. + +config VIRT_CPU_ACCOUNTING_GEN + bool "Full dynticks CPU time accounting" + depends on HAVE_CONTEXT_TRACKING + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + select VIRT_CPU_ACCOUNTING + select CONTEXT_TRACKING + help + Select this option to enable task and CPU time accounting on full + dynticks systems. This accounting is implemented by watching every + kernel-user boundaries using the context tracking subsystem. + The accounting is thus performed at the expense of some significant + overhead. + + For now this is only useful if you are working on the full + dynticks subsystem development. + + If unsure, say N. + +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL + help + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + +endchoice + config BSD_PROCESS_ACCT bool "BSD Process Accounting" help @@ -160,72 +415,376 @@ config BSD_PROCESS_ACCT_V3 process and it's parent. Note that this file format is incompatible with previous v0/v1/v2 file formats, so you will need updated tools for processing it. A preliminary version of these tools is available - at <http://www.physik3.uni-rostock.de/tim/kernel/utils/acct/>. + at <http://www.gnu.org/software/acct/>. -config SYSCTL - bool "Sysctl support" - ---help--- - The sysctl interface provides a means of dynamically changing - certain kernel parameters and variables on the fly without requiring - a recompile of the kernel or reboot of the system. The primary - interface consists of a system call, but if you say Y to "/proc - file system support", a tree of modifiable sysctl entries will be - generated beneath the /proc/sys directory. They are explained in the - files in <file:Documentation/sysctl/>. Note that enabling this - option will enlarge the kernel by at least 8 KB. - - As it is generally a good thing, you should say Y here unless - building a kernel for install/rescue disks or your system is very - limited in memory. - -config AUDIT - bool "Auditing support" +config TASKSTATS + bool "Export task/process statistics through netlink" depends on NET - default y if SECURITY_SELINUX + default n help - Enable auditing infrastructure that can be used with another - kernel subsystem, such as SELinux (which requires this for - logging of avc messages output). Does not do system-call - auditing without CONFIG_AUDITSYSCALL. + Export selected statistics for tasks/processes through the + generic netlink interface. Unlike BSD process accounting, the + statistics are available during the lifetime of tasks/processes as + responses to commands. Like BSD accounting, they are sent to user + space on task exit. -config AUDITSYSCALL - bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML || SPARC64) - default y if SECURITY_SELINUX + Say N if unsure. + +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting" + depends on TASKSTATS help - Enable low-overhead system-call auditing infrastructure that - can be used independently or with another kernel subsystem, - such as SELinux. + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. -config HOTPLUG - bool "Support for hot-pluggable devices" if !ARCH_S390 - default ARCH_S390 + Say N if unsure. + +config TASK_XACCT + bool "Enable extended accounting over taskstats" + depends on TASKSTATS help - This option is provided for the case where no in-kernel-tree - modules require HOTPLUG functionality, but a module built - outside the kernel tree does. Such modules require Y here. + Collect extended task accounting data and send the data + to userland for processing over the taskstats interface. -config KOBJECT_UEVENT - bool "Kernel Userspace Events" - depends on NET - default y + Say N if unsure. + +config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting" + depends on TASK_XACCT + help + Collect information on the number of bytes of storage I/O which this + task has caused. + + Say N if unsure. + +endmenu # "CPU/Task time and stats accounting" + +menu "RCU Subsystem" + +choice + prompt "RCU Implementation" + default TREE_RCU + +config TREE_RCU + bool "Tree-based hierarchical RCU" + depends on !PREEMPT && SMP + select IRQ_WORK + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. + +config TREE_PREEMPT_RCU + bool "Preemptible tree-based hierarchical RCU" + depends on PREEMPT + select IRQ_WORK + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + + Select this option if you are unsure. + +config TINY_RCU + bool "UP-only small-memory-footprint RCU" + depends on !PREEMPT && !SMP help - This option enables the kernel userspace event layer, which is a - simple mechanism for kernel-to-user communication over a netlink - socket. - The goal of the kernel userspace events layer is to provide a simple - and efficient events system, that notifies userspace about kobject - state changes. This will enable applications to just listen for - events instead of polling system devices and files. - Hotplug events (kobject addition and removal) are also available on - the netlink socket in addition to the execution of /sbin/hotplug if - CONFIG_HOTPLUG is enabled. + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. - Say Y, unless you are building a system requiring minimal memory - consumption. +endchoice + +config PREEMPT_RCU + def_bool TREE_PREEMPT_RCU + help + This option enables preemptible-RCU code that is common between + the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + +config RCU_STALL_COMMON + def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) + help + This option enables RCU CPU stall code that is common between + the TINY and TREE variants of RCU. The purpose is to allow + the tiny variants to disable RCU CPU stall warnings, while + making these warnings mandatory for the tree variants. + +config CONTEXT_TRACKING + bool + +config RCU_USER_QS + bool "Consider userspace as in RCU extended quiescent state" + depends on HAVE_CONTEXT_TRACKING && SMP + select CONTEXT_TRACKING + help + This option sets hooks on kernel / userspace boundaries and + puts RCU in extended quiescent state when the CPU runs in + userspace. It means that when a CPU runs in userspace, it is + excluded from the global RCU state machine and thus doesn't + try to keep the timer tick on for RCU. + + Unless you want to hack and help the development of the full + dynticks mode, you shouldn't enable this option. It also + adds unnecessary overhead. + + If unsure say N + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU || TREE_PREEMPT_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 RCU_FANOUT if 64BIT + range 2 RCU_FANOUT if !64BIT + depends on TREE_RCU || TREE_PREEMPT_RCU + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems. + + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU || TREE_PREEMPT_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say N if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ_COMMON && SMP + default n + help + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. + + Say N if you are unsure. + +config TREE_RCU_TRACE + def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) + select DEBUG_FS + help + This option provides tracing for the TREE_RCU and + TREE_PREEMPT_RCU implementations, permitting Makefile to + trivially select kernel/rcutree_trace.c. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_PRIO + int "Real-time priority to boost RCU readers to" + range 1 99 + depends on RCU_BOOST + default 1 + help + This option specifies the real-time priority to which long-term + preempted RCU readers are to be boosted. If you are working + with a real-time application that has one or more CPU-bound + threads running at a real-time priority level, you should set + RCU_BOOST_PRIO to a priority higher then the highest-priority + real-time CPU-bound thread. The default RCU_BOOST_PRIO value + of 1 is appropriate in the common case, which is real-time + applications that do not have any CPU-bound threads. + + Some real-time applications might not have a single real-time + thread that saturates a given CPU, but instead might have + multiple real-time threads that, taken together, fully utilize + that CPU. In this case, you should set RCU_BOOST_PRIO to + a priority higher than the lowest-priority thread that is + conspiring to prevent the CPU from running any non-real-time + tasks. For example, if one thread at priority 10 and another + thread at priority 5 are between themselves fully consuming + the CPU time on a given CPU, then RCU_BOOST_PRIO should be + set to priority 6 or higher. + + Specify the real-time priority, or take the default if unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || TREE_PREEMPT_RCU + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want to help to debug reduced OS jitter. + Say N here if you are unsure. + +choice + prompt "Build-forced no-CBs CPUs" + default RCU_NOCB_CPU_NONE + help + This option allows no-CBs CPUs (whose RCU callbacks are invoked + from kthreads rather than from softirq context) to be specified + at build time. Additional no-CBs CPUs may be specified by + the rcu_nocbs= boot parameter. + +config RCU_NOCB_CPU_NONE + bool "No build_forced no-CBs CPUs" + depends on RCU_NOCB_CPU && !NO_HZ_FULL + help + This option does not force any of the CPUs to be no-CBs CPUs. + Only CPUs designated by the rcu_nocbs= boot parameter will be + no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU + kthreads whose names begin with "rcuo". All other CPUs will + invoke their own RCU callbacks in softirq context. + + Select this option if you want to choose no-CBs CPUs at + boot time, for example, to allow testing of different no-CBs + configurations without having to rebuild the kernel each time. + +config RCU_NOCB_CPU_ZERO + bool "CPU 0 is a build_forced no-CBs CPU" + depends on RCU_NOCB_CPU && !NO_HZ_FULL + help + This option forces CPU 0 to be a no-CBs CPU, so that its RCU + callbacks are invoked by a per-CPU kthread whose name begins + with "rcuo". Additional CPUs may be designated as no-CBs + CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs. + All other CPUs will invoke their own RCU callbacks in softirq + context. + + Select this if CPU 0 needs to be a no-CBs CPU for real-time + or energy-efficiency reasons, but the real reason it exists + is to ensure that randconfig testing covers mixed systems. + +config RCU_NOCB_CPU_ALL + bool "All CPUs are build_forced no-CBs CPUs" + depends on RCU_NOCB_CPU + help + This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs= + boot parameter will be ignored. All CPUs' RCU callbacks will + be executed in the context of per-CPU rcuo kthreads created for + this purpose. Assuming that the kthreads whose names start with + "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter + on the remaining CPUs, but might decrease memory locality during + RCU-callback invocation, thus potentially degrading throughput. + + Select this if all CPUs need to be no-CBs CPUs for real-time + or energy-efficiency reasons. + +endchoice + +endmenu # "RCU Subsystem" config IKCONFIG - bool "Kernel .config support" + tristate "Kernel .config support" ---help--- This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation @@ -243,9 +802,119 @@ config IKCONFIG_PROC This option enables access to the kernel configuration file through /proc/config.gz. +config LOG_BUF_SHIFT + int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + range 12 21 + default 17 + help + Select kernel log buffer size as a power of 2. + Examples: + 17 => 128 KB + 16 => 64 KB + 15 => 32 KB + 14 => 16 KB + 13 => 8 KB + 12 => 4 KB + +# +# Architectures with an unreliable sched_clock() should select this: +# +config HAVE_UNSTABLE_SCHED_CLOCK + bool + +config GENERIC_SCHED_CLOCK + bool + +# +# For architectures that want to enable the support for NUMA-affine scheduler +# balancing logic: +# +config ARCH_SUPPORTS_NUMA_BALANCING + bool + +# +# For architectures that know their GCC __int128 support is sound +# +config ARCH_SUPPORTS_INT128 + bool + +# For architectures that (ab)use NUMA to represent different memory regions +# all cpu-local but of different latencies, such as SuperH. +# +config ARCH_WANT_NUMA_VARIABLE_LOCALITY + bool + +# +# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE +config ARCH_WANTS_PROT_NUMA_PROT_NONE + bool + +config ARCH_USES_NUMA_PROT_NONE + bool + default y + depends on ARCH_WANTS_PROT_NUMA_PROT_NONE + depends on NUMA_BALANCING + +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, automatic NUMA balancing will be enabled if running on a NUMA + machine. + +config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when + it has references to the node the task is running on. + + This system will be inactive on UMA systems. + +menuconfig CGROUPS + boolean "Control Group support" + select KERNFS + help + This option adds support for grouping sets of processes together, for + use with process control subsystems such as Cpusets, CFS, memory + controls or device isolation. + See + - Documentation/scheduler/sched-design-CFS.txt (CFS) + - Documentation/cgroups/ (features for grouping, isolation + and resource control) + + Say N if unsure. + +if CGROUPS + +config CGROUP_DEBUG + bool "Example debug cgroup subsystem" + default n + help + This option enables a simple cgroup subsystem that + exports useful debugging information about the cgroups + framework. + + Say N if unsure. + +config CGROUP_FREEZER + bool "Freezer cgroup subsystem" + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. + +config CGROUP_DEVICE + bool "Device controller for cgroups" + help + Provides a cgroup implementing whitelists for devices which + a process in the cgroup can mknod or open. + config CPUSETS bool "Cpuset support" - depends on SMP help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -254,18 +923,435 @@ config CPUSETS Say N if unsure. +config PROC_PID_CPUSET + bool "Include legacy /proc/<pid>/cpuset file" + depends on CPUSETS + default y + +config CGROUP_CPUACCT + bool "Simple CPU accounting cgroup subsystem" + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a cgroup. + +config RESOURCE_COUNTERS + bool "Resource counters" + help + This option enables controller independent resource accounting + infrastructure that works with cgroups. + +config MEMCG + bool "Memory Resource Controller for Control Groups" + depends on RESOURCE_COUNTERS + select EVENTFD + help + Provides a memory resource controller that manages both anonymous + memory and page cache. (See Documentation/cgroups/memory.txt) + + Note that setting this option increases fixed memory overhead + associated with each page of memory in the system. By this, + 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory + usage tracking struct at boot. Total amount of this is printed out + at boot. + + Only enable when you're ok with these trade offs and really + sure you need the memory resource controller. Even when you enable + this, you can set "cgroup_disable=memory" at your boot option to + disable memory resource controller and you can avoid overheads. + (and lose benefits of memory resource controller) + +config MEMCG_SWAP + bool "Memory Resource Controller Swap Extension" + depends on MEMCG && SWAP + help + Add swap management feature to memory resource controller. When you + enable this, you can limit mem+swap usage per cgroup. In other words, + when you disable this, memory resource controller has no cares to + usage of swap...a process can exhaust all of the swap. This extension + is useful when you want to avoid exhaustion swap but this itself + adds more overheads and consumes memory for remembering information. + Especially if you use 32bit system or small memory system, please + be careful about enabling this. When memory resource controller + is disabled by boot option, this will be automatically disabled and + there will be no overhead from this. Even when you set this config=y, + if boot option "swapaccount=0" is set, swap will not be accounted. + Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page + size is 4096bytes, 512k per 1Gbytes of swap. +config MEMCG_SWAP_ENABLED + bool "Memory Resource Controller Swap Extension enabled by default" + depends on MEMCG_SWAP + default y + help + Memory Resource Controller Swap Extension comes with its price in + a bigger memory consumption. General purpose distribution kernels + which want to enable the feature but keep it disabled by default + and let the user enable it by swapaccount=1 boot command line + parameter should have this option unselected. + For those who want to have the feature enabled by default should + select this option (if, for some reason, they need to disable it + then swapaccount=0 does the trick). +config MEMCG_KMEM + bool "Memory Resource Controller Kernel Memory accounting" + depends on MEMCG + depends on SLUB || SLAB + help + The Kernel Memory extension for Memory Resource Controller can limit + the amount of memory used by kernel objects in the system. Those are + fundamentally different from the entities handled by the standard + Memory Controller, which are page-based, and can be swapped. Users of + the kmem extension can use it to guarantee that no group of processes + will ever exhaust kernel resources alone. + + WARNING: Current implementation lacks reclaim support. That means + allocation attempts will fail when close to the limit even if there + are plenty of kmem available for reclaim. That makes this option + unusable in real life so DO NOT SELECT IT unless for development + purposes. + +config CGROUP_HUGETLB + bool "HugeTLB Resource Controller for Control Groups" + depends on RESOURCE_COUNTERS && HUGETLB_PAGE + default n + help + Provides a cgroup Resource Controller for HugeTLB pages. + When you enable this, you can put a per cgroup limit on HugeTLB usage. + The limit is enforced during page fault. Since HugeTLB doesn't + support page reclaim, enforcing the limit at page fault time implies + that, the application will get SIGBUS signal if it tries to access + HugeTLB pages beyond its limit. This requires the application to know + beforehand how much HugeTLB pages it would require for its use. The + control group is tracked in the third page lru pointer. This means + that we cannot use the controller with huge page less than 3 pages. + +config CGROUP_PERF + bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" + depends on PERF_EVENTS && CGROUPS + help + This option extends the per-cpu mode to restrict monitoring to + threads which belong to the cgroup specified and run on the + designated cpu. + + Say N if unsure. + +menuconfig CGROUP_SCHED + bool "Group CPU scheduler" + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on FAIR_GROUP_SCHED + default n + help + This option allows users to define CPU bandwidth rates (limits) for + tasks running within the fair group scheduler. Groups with no limit + set are considered to be unconstrained and will run with no + restriction. + See tip/Documentation/scheduler/sched-bwc.txt for more information. + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to task groups. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. + +endif #CGROUP_SCHED + +config BLK_CGROUP + bool "Block IO controller" + depends on BLOCK + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. + + This option only enables generic Block IO controller infrastructure. + One needs to also enable actual IO controlling logic/policy. For + enabling proportional weight division of disk bandwidth in CFQ, set + CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BLK_DEV_THROTTLING=y. + + See Documentation/cgroups/blkio-controller.txt for more information. + +config DEBUG_BLK_CGROUP + bool "Enable Block IO controller debugging" + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. + +endif # CGROUPS + +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" if EXPERT + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + +menuconfig NAMESPACES + bool "Namespaces support" if EXPERT + default !EXPERT + help + Provides the way to make tasks work with different objects using + the same id. For example same IPC id may refer to different objects + or same user id or pid may refer to different tasks when used in + different namespaces. + +if NAMESPACES + +config UTS_NS + bool "UTS namespace" + default y + help + In this namespace tasks see different info provided with the + uname() system call + +config IPC_NS + bool "IPC namespace" + depends on (SYSVIPC || POSIX_MQUEUE) + default y + help + In this namespace tasks work with IPC ids which correspond to + different IPC objects in different namespaces. + +config USER_NS + bool "User namespace" + default n + help + This allows containers, i.e. vservers, to use user namespaces + to provide different user info for different servers. + + When user namespaces are enabled in the kernel it is + recommended that the MEMCG and MEMCG_KMEM options also be + enabled and that user-space use the memory control groups to + limit the amount of memory a memory unprivileged users can + use. + + If unsure, say N. + +config PID_NS + bool "PID Namespaces" + default y + help + Support process id namespaces. This allows having multiple + processes with the same pid as long as they are in different + pid namespaces. This is a building block of containers. + +config NET_NS + bool "Network namespace" + depends on NET + default y + help + Allow user space to create what appear to be multiple instances + of the network stack. + +endif # NAMESPACES + +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + +config SYSFS_DEPRECATED + bool "Enable deprecated sysfs features to support old userspace tools" + depends on SYSFS + default n + help + This option adds code that switches the layout of the "block" class + devices, to not show up in /sys/class/block/, but only in + /sys/block/. + + This switch is only active when the sysfs.deprecated=1 boot option is + passed or the SYSFS_DEPRECATED_V2 option is set. + + This option allows new kernels to run on old distributions and tools, + which might get confused by /sys/class/block/. Since 2007/2008 all + major distributions and tools handle this just fine. + + Recent distributions and userspace tools after 2009/2010 depend on + the existence of /sys/class/block/, and will not work with this + option enabled. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. + +config SYSFS_DEPRECATED_V2 + bool "Enable deprecated sysfs features by default" + default n + depends on SYSFS + depends on SYSFS_DEPRECATED + help + Enable deprecated sysfs by default. + + See the CONFIG_SYSFS_DEPRECATED option for more details about this + option. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. Even then, odds are you would not need it + enabled, you can always pass the boot option if absolutely necessary. + +config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + help + This option enables support for relay interface support in + certain file systems (such as debugfs). + It is designed to provide an efficient mechanism for tools and + facilities to relay large amounts of data from kernel space to + user space. + + If unsure, say N. + +config BLK_DEV_INITRD + bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" + depends on BROKEN || !FRV + help + The initial RAM filesystem is a ramfs which is loaded by the + boot loader (loadlin or lilo) and that is mounted as root + before the normal boot procedure. It is typically used to + load modules needed to mount the "real" root file system, + etc. See <file:Documentation/initrd.txt> for details. + + If RAM disk support (BLK_DEV_RAM) is also included, this + also enables initial RAM disk (initrd) support and adds + 15 Kbytes (more on some other architectures) to the kernel size. + + If unsure say Y. + +if BLK_DEV_INITRD + source "usr/Kconfig" -menuconfig EMBEDDED - bool "Configure standard kernel features (for small systems)" +endif + +config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size" + help + Enabling this option will pass "-Os" instead of "-O2" to gcc + resulting in a smaller kernel. + + If unsure, say N. + +config SYSCTL + bool + +config ANON_INODES + bool + +config HAVE_UID16 + bool + +config SYSCTL_EXCEPTION_TRACE + bool + help + Enable support for /proc/sys/debug/exception-trace. + +config SYSCTL_ARCH_UNALIGN_NO_WARN + bool + help + Enable support for /proc/sys/kernel/ignore-unaligned-usertrap + Allows arch to define/use @no_unaligned_warning to possibly warn + about unaligned access emulation going on under the hood. + +config SYSCTL_ARCH_UNALIGN_ALLOW + bool + help + Enable support for /proc/sys/kernel/unaligned-trap + Allows arches to define/use @unaligned_enabled to runtime toggle + the unaligned access emulation. + see arch/parisc/kernel/unaligned.c for reference + +config HAVE_PCSPKR_PLATFORM + bool + +menuconfig EXPERT + bool "Configure standard kernel features (expert users)" + # Unhide debug options, to make the on-by-default options visible + select DEBUG_KERNEL help This option allows certain base kernel options and settings to be disabled or tweaked. This is for specialized environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config UID16 + bool "Enable 16-bit UID system calls" if EXPERT + depends on HAVE_UID16 + default y + help + This enables the legacy 16-bit UID syscall wrappers. + +config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + ---help--- + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. + +config SYSFS_SYSCALL + bool "Sysfs syscall support" if EXPERT + default y + ---help--- + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. + + If unsure say Y here. + +config SYSCTL_SYSCALL + bool "Sysctl syscall support" if EXPERT + depends on PROC_SYSCTL + default n + select SYSCTL + ---help--- + sys_sysctl uses binary paths that have been found challenging + to properly maintain and use. The interface in /proc/sys + using paths with ascii names is now the primary path to this + information. + + Almost nothing using the binary sysctl interface so if you are + trying to save some space it is probably safe to disable this, + making your kernel marginally smaller. + + If unsure say N here. + config KALLSYMS - bool "Load all symbols for debugging/kksymoops" if EMBEDDED + bool "Load all symbols for debugging/ksymoops" if EXPERT default y help Say Y here to let the kernel print out symbolic crash information and @@ -276,28 +1362,23 @@ config KALLSYMS_ALL bool "Include all symbols in kallsyms" depends on DEBUG_KERNEL && KALLSYMS help - Normally kallsyms only contains the symbols of functions, for nicer - OOPS messages. Some debuggers can use kallsyms for other - symbols too: say Y here to include all symbols, if you need them - and you don't care about adding 300k to the size of your kernel. - - Say N. + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). -config KALLSYMS_EXTRA_PASS - bool "Do an extra kallsyms pass" - depends on KALLSYMS - help - If kallsyms is not working correctly, the build will fail with - inconsistent kallsyms data. If that occurs, log a bug report and - turn on KALLSYMS_EXTRA_PASS which should result in a stable build. - Always say N here unless you find a bug in kallsyms, which must be - reported. KALLSYMS_EXTRA_PASS is only a temporary workaround while - you wait for kallsyms to be fixed. + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + Say N unless you really need all symbols. config PRINTK default y - bool "Enable support for printk" if EMBEDDED + bool "Enable support for printk" if EXPERT + select IRQ_WORK help This option enables normal printk support. Removing it eliminates most of the message strings from the kernel image @@ -306,7 +1387,7 @@ config PRINTK strongly discouraged. config BUG - bool "BUG() support" if EMBEDDED + bool "BUG() support" if EXPERT default y help Disabling this option eliminates support for BUG and WARN, reducing @@ -315,43 +1396,87 @@ config BUG option for embedded systems with no facilities for reporting errors. Just say Y. +config ELF_CORE + depends on COREDUMP + default y + bool "Enable ELF core dumps" if EXPERT + help + Enable support for generating core dumps. Disabling saves about 4k. + + +config PCSPKR_PLATFORM + bool "Enable PC-Speaker support" if EXPERT + depends on HAVE_PCSPKR_PLATFORM + select I8253_LOCK + default y + help + This option allows to disable the internal PC-Speaker + support, saving some memory. + config BASE_FULL default y - bool "Enable full-sized data structures for core" if EMBEDDED + bool "Enable full-sized data structures for core" if EXPERT help Disabling this option reduces the size of miscellaneous core kernel data structures. This saves memory on small machines, but may reduce performance. config FUTEX - bool "Enable futex support" if EMBEDDED + bool "Enable futex support" if EXPERT default y + select RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config HAVE_FUTEX_CMPXCHG + bool + help + Architectures should select this if futex_atomic_cmpxchg_inatomic() + is implemented and always working. This removes a couple of runtime + checks. + config EPOLL - bool "Enable eventpoll support" if EMBEDDED + bool "Enable eventpoll support" if EXPERT default y + select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. -config CC_OPTIMIZE_FOR_SIZE - bool "Optimize for size" if EMBEDDED - default y if ARM || H8300 +config SIGNALFD + bool "Enable signalfd() system call" if EXPERT + select ANON_INODES + default y help - Enabling this option will pass "-Os" instead of "-O2" to gcc - resulting in a smaller kernel. + Enable the signalfd() system call that allows to receive signals + on a file descriptor. - WARNING: some versions of gcc may generate incorrect code with this - option. If problems are observed, a gcc upgrade may be needed. + If unsure, say Y. - If unsure, say N. +config TIMERFD + bool "Enable timerfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the timerfd() system call that allows to receive timer + events on a file descriptor. + + If unsure, say Y. + +config EVENTFD + bool "Enable eventfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the eventfd() system call that allows to receive both + kernel notification (ie. KAIO) or userspace notifications. + + If unsure, say Y. config SHMEM - bool "Use full shmem filesystem" if EMBEDDED + bool "Use full shmem filesystem" if EXPERT default y depends on MMU help @@ -361,58 +1486,235 @@ config SHMEM option replaces shmem and tmpfs with the much simpler ramfs code, which may be appropriate on small systems without swap. -config CC_ALIGN_FUNCTIONS - int "Function alignment" if EMBEDDED - default 0 +config AIO + bool "Enable AIO support" if EXPERT + default y + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling + this option saves about 7k. + +config PCI_QUIRKS + default y + bool "Enable PCI quirk workarounds" if EXPERT + depends on PCI + help + This enables workarounds for various PCI chipset + bugs/quirks. Disable this only if your target machine is + unaffected by PCI quirks. + +config EMBEDDED + bool "Embedded system" + option allnoconfig_y + select EXPERT + help + This option should be enabled if compiling the kernel for + an embedded system so certain expert options are available + for configuration. + +config HAVE_PERF_EVENTS + bool + help + See tools/perf/design.txt for details. + +config PERF_USE_VMALLOC + bool + help + See tools/perf/design.txt for details + +menu "Kernel Performance Events And Counters" + +config PERF_EVENTS + bool "Kernel performance events and counters" + default y if PROFILING + depends on HAVE_PERF_EVENTS + select ANON_INODES + select IRQ_WORK + help + Enable kernel support for various performance events provided + by software and hardware. + + Software events are supported either built-in or via the + use of generic tracepoints. + + Most modern CPUs support performance events via performance + counter registers. These registers count the number of certain + types of hw events: such as instructions executed, cachemisses + suffered, or branches mis-predicted - without slowing down the + kernel or applications. These registers can also trigger interrupts + when a threshold number of events have passed - and can thus be + used to profile the code that runs on that CPU. + + The Linux Performance Event subsystem provides an abstraction of + these software and hardware event capabilities, available via a + system call and used by the "perf" utility in tools/perf/. It + provides per task and per CPU counters, and it provides event + capabilities on top of those. + + Say Y if unsure. + +config DEBUG_PERF_USE_VMALLOC + default n + bool "Debug: use vmalloc to back perf mmap() buffers" + depends on PERF_EVENTS && DEBUG_KERNEL + select PERF_USE_VMALLOC + help + Use vmalloc memory to back perf mmap() buffers. + + Mostly useful for debugging the vmalloc code on platforms + that don't require it. + + Say N if unsure. + +endmenu + +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. + +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +choice + prompt "Choose SLAB allocator" + default SLUB + help + This option allows to select a slab allocator. + +config SLAB + bool "SLAB" help - Align the start of functions to the next power-of-two greater than n, - skipping up to n bytes. For instance, 32 aligns functions - to the next 32-byte boundary, but 24 would align to the next - 32-byte boundary only if this can be done by skipping 23 bytes or less. - Zero means use compiler's default. + The regular slab allocator that is established and known to work + well in all environments. It organizes cache hot objects in + per cpu and per node queues. -config CC_ALIGN_LABELS - int "Label alignment" if EMBEDDED - default 0 +config SLUB + bool "SLUB (Unqueued Allocator)" help - Align all branch targets to a power-of-two boundary, skipping - up to n bytes like ALIGN_FUNCTIONS. This option can easily - make code slower, because it must insert dummy operations for - when the branch target is reached in the usual flow of the code. - Zero means use compiler's default. + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). + Per cpu caching is realized using slabs of objects instead + of queues of objects. SLUB can use memory efficiently + and has enhanced diagnostics. SLUB is the default choice for + a slab allocator. + +config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. + +endchoice -config CC_ALIGN_LOOPS - int "Loop alignment" if EMBEDDED - default 0 +config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP + bool "SLUB per cpu partial cache" help - Align loops to a power-of-two boundary, skipping up to n bytes. - Zero means use compiler's default. + Per cpu partial caches accellerate objects allocation and freeing + that is local to a processor at the price of more indeterminism + in the latency of the free. On overflow these caches will be cleared + which requires the taking of locks that may cause latency spikes. + Typically one would choose no for a realtime system. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has it's contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/nommu-mmap.txt for more information. + +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. -config CC_ALIGN_JUMPS - int "Jump alignment" if EMBEDDED - default 0 +config PROFILING + bool "Profiling support" help - Align branch targets to a power-of-two boundary, for branch - targets where the targets can only be reached by jumping, - skipping up to n bytes like ALIGN_FUNCTIONS. In this case, - no dummy operations need be executed. - Zero means use compiler's default. + Say Y here to enable the extended profiling support mechanisms used + by profilers such as OProfile. + +# +# Place an empty function call at each tracepoint site. Can be +# dynamically changed for a probe function. +# +config TRACEPOINTS + bool + +source "arch/Kconfig" endmenu # General setup -config TINY_SHMEM - default !SHMEM +config HAVE_GENERIC_DMA_COHERENT bool + default n + +config SLABINFO + bool + depends on PROC_FS + depends on SLAB || SLUB_DEBUG + default y + +config RT_MUTEXES + boolean config BASE_SMALL int default 0 if BASE_FULL default 1 if !BASE_FULL -menu "Loadable module support" - -config MODULES +menuconfig MODULES bool "Enable loadable module support" + option modules help Kernel modules are small pieces of compiled code which can be inserted in the running kernel, rather than being @@ -431,18 +1733,27 @@ config MODULES If unsure, say Y. +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + config MODULE_UNLOAD bool "Module unloading" - depends on MODULES help Without this option you will not be able to unload any modules (note that some modules may not be unloadable - anyway), which makes your kernel slightly smaller and - simpler. If unsure, say Y. + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. config MODULE_FORCE_UNLOAD bool "Forced module unloading" - depends on MODULE_UNLOAD && EXPERIMENTAL + depends on MODULE_UNLOAD help This option allows you to force a module to unload, even if the kernel believes it is unsafe: the kernel will remove the module @@ -450,18 +1761,8 @@ config MODULE_FORCE_UNLOAD rmmod). This is mainly for kernel developers and desperate users. If unsure, say N. -config OBSOLETE_MODPARM - bool - default y - depends on MODULES - help - You need this option to use module parameters on modules which - have not been converted to the new module parameter system yet. - If unsure, say Y. - config MODVERSIONS - bool "Module versioning support (EXPERIMENTAL)" - depends on MODULES && EXPERIMENTAL + bool "Module versioning support" help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules @@ -472,7 +1773,6 @@ config MODVERSIONS config MODULE_SRCVERSION_ALL bool "Source checksum for all modules" - depends on MODULES help Modules which contain a MODULE_VERSION get an extra "srcversion" field inserted into their modinfo section, which contains a @@ -482,17 +1782,97 @@ config MODULE_SRCVERSION_ALL the version). With this option, such a "srcversion" field will be created for all modules. If unsure, say N. -config KMOD - bool "Automatic kernel module loading" +config MODULE_SIG + bool "Module signature verification" depends on MODULES + select SYSTEM_TRUSTED_KEYRING + select KEYS + select CRYPTO + select ASYMMETRIC_KEY_TYPE + select ASYMMETRIC_PUBLIC_KEY_SUBTYPE + select PUBLIC_KEY_ALGO_RSA + select ASN1 + select OID_REGISTRY + select X509_CERTIFICATE_PARSER + help + Check modules for valid signatures upon load: the signature + is simply appended to the module. For more information see + Documentation/module-signing.txt. + + !!!WARNING!!! If you enable this option, you MUST make sure that the + module DOES NOT get stripped after being signed. This includes the + debuginfo strip done by some packagers (such as rpmbuild) and + inclusion into an initramfs that wants the module size reduced. + +config MODULE_SIG_FORCE + bool "Require modules to be validly signed" + depends on MODULE_SIG help - Normally when you have selected some parts of the kernel to - be created as kernel modules, you must load them (using the - "modprobe" command) before you can use them. If you say Y - here, some parts of the kernel will be able to load modules - automatically: when a part of the kernel needs a module, it - runs modprobe with the appropriate arguments, thereby - loading the module if it is available. If unsure, say Y. + Reject unsigned modules or signed modules for which we don't have a + key. Without this, such modules will simply taint the kernel. + +config MODULE_SIG_ALL + bool "Automatically sign all modules" + default y + depends on MODULE_SIG + help + Sign all modules during make modules_install. Without this option, + modules must be signed manually, using the scripts/sign-file tool. + +comment "Do not forget to sign required modules with scripts/sign-file" + depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL + +choice + prompt "Which hash algorithm should modules be signed with?" + depends on MODULE_SIG + help + This determines which sort of hashing algorithm will be used during + signature generation. This algorithm _must_ be built into the kernel + directly so that signature verification can take place. It is not + possible to load a signed module containing the algorithm to check + the signature on that module. + +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + +config MODULE_SIG_SHA224 + bool "Sign modules with SHA-224" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA256 + bool "Sign modules with SHA-256" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA384 + bool "Sign modules with SHA-384" + select CRYPTO_SHA512 + +config MODULE_SIG_SHA512 + bool "Sign modules with SHA-512" + select CRYPTO_SHA512 + +endchoice + +config MODULE_SIG_HASH + string + depends on MODULE_SIG + default "sha1" if MODULE_SIG_SHA1 + default "sha224" if MODULE_SIG_SHA224 + default "sha256" if MODULE_SIG_SHA256 + default "sha384" if MODULE_SIG_SHA384 + default "sha512" if MODULE_SIG_SHA512 + +endif # MODULES + +config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_mask and + cpu_possible_mask, some of them chose to initialize cpu_possible_mask + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers pursuing me down dark alleys. config STOP_MACHINE bool @@ -500,4 +1880,28 @@ config STOP_MACHINE depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU help Need stop_machine() primitive. -endmenu + +source "block/Kconfig" + +config PREEMPT_NOTIFIERS + bool + +config PADATA + depends on SMP + bool + +# Can be selected by architectures with broken toolchains +# that get confused by correct const<->read_only section +# mappings +config BROKEN_RODATA + bool + +config ASN1 + tristate + help + Build a simple ASN.1 grammar compiler that produces a bytecode output + that can be interpreted by the ASN.1 stream decoder and used to + inform it as to what tags are to be expected in a stream and what + functions to call on what tags. + +source "kernel/Kconfig.locks" diff --git a/init/Makefile b/init/Makefile index a2300078f2b..7bc47ee31c3 100644 --- a/init/Makefile +++ b/init/Makefile @@ -2,28 +2,35 @@ # Makefile for the linux kernel. # -obj-y := main.o version.o mounts.o initramfs.o +obj-y := main.o version.o mounts.o +ifneq ($(CONFIG_BLK_DEV_INITRD),y) +obj-y += noinitramfs.o +else +obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o +endif obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o +ifneq ($(CONFIG_ARCH_INIT_TASK),y) +obj-y += init_task.o +endif + mounts-y := do_mounts.o -mounts-$(CONFIG_DEVFS_FS) += do_mounts_devfs.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o -# files to be removed upon make clean -clean-files := ../include/linux/compile.h - # dependencies on generated files need to be listed explicitly - -$(obj)/version.o: include/linux/compile.h +$(obj)/version.o: include/generated/compile.h # compile.h changes depending on hostname, generation number, etc, # so we regenerate it always. # mkcompile_h will make sure to only update the # actual file if its content has changed. -include/linux/compile.h: FORCE - @echo ' CHK $@' + chk_compile.h = : + quiet_chk_compile.h = echo ' CHK $@' +silent_chk_compile.h = : +include/generated/compile.h: FORCE + @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" diff --git a/init/calibrate.c b/init/calibrate.c index d206c7548fe..520702db9ac 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -4,13 +4,15 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include <linux/sched.h> +#include <linux/jiffies.h> #include <linux/delay.h> #include <linux/init.h> +#include <linux/timex.h> +#include <linux/smp.h> +#include <linux/percpu.h> -#include <asm/timex.h> - -static unsigned long preset_lpj; +unsigned long lpj_fine; +unsigned long preset_lpj; static int __init lpj_setup(char *str) { preset_lpj = simple_strtoul(str,NULL,0); @@ -29,14 +31,17 @@ __setup("lpj=", lpj_setup); #define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) #define MAX_DIRECT_CALIBRATION_RETRIES 5 -static unsigned long __devinit calibrate_delay_direct(void) +static unsigned long calibrate_delay_direct(void) { unsigned long pre_start, start, post_start; unsigned long pre_end, end, post_end; unsigned long start_jiffies; - unsigned long tsc_rate_min, tsc_rate_max; - unsigned long good_tsc_sum = 0; - unsigned long good_tsc_count = 0; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; + unsigned long measured_times[MAX_DIRECT_CALIBRATION_RETRIES]; + int max = -1; /* index of measured_times with max/min values or not set */ + int min = -1; int i; if (read_current_timer(&pre_start) < 0 ) @@ -65,7 +70,7 @@ static unsigned long __devinit calibrate_delay_direct(void) pre_start = 0; read_current_timer(&start); start_jiffies = jiffies; - while (jiffies <= (start_jiffies + 1)) { + while (time_before_eq(jiffies, start_jiffies + 1)) { pre_start = start; read_current_timer(&start); } @@ -73,101 +78,227 @@ static unsigned long __devinit calibrate_delay_direct(void) pre_end = 0; end = post_start; - while (jiffies <= - (start_jiffies + 1 + DELAY_CALIBRATION_TICKS)) { + while (time_before_eq(jiffies, start_jiffies + 1 + + DELAY_CALIBRATION_TICKS)) { pre_end = end; read_current_timer(&end); } read_current_timer(&post_end); - tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; - tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; /* - * If the upper limit and lower limit of the tsc_rate is + * If the upper limit and lower limit of the timer_rate is * >= 12.5% apart, redo calibration. */ - if (pre_start != 0 && pre_end != 0 && - (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { - good_tsc_count++; - good_tsc_sum += tsc_rate_max; - } + if (start >= post_end) + printk(KERN_NOTICE "calibrate_delay_direct() ignoring " + "timer_rate as we had a TSC wrap around" + " start=%lu >=post_end=%lu\n", + start, post_end); + if (start < post_end && pre_start != 0 && pre_end != 0 && + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; + measured_times[i] = timer_rate_max; + if (max < 0 || timer_rate_max > measured_times[max]) + max = i; + if (min < 0 || timer_rate_max < measured_times[min]) + min = i; + } else + measured_times[i] = 0; + } - if (good_tsc_count) - return (good_tsc_sum/good_tsc_count); + /* + * Find the maximum & minimum - if they differ too much throw out the + * one with the largest difference from the mean and try again... + */ + while (good_timer_count > 1) { + unsigned long estimate; + unsigned long maxdiff; + + /* compute the estimate */ + estimate = (good_timer_sum/good_timer_count); + maxdiff = estimate >> 3; + + /* if range is within 12% let's take it */ + if ((measured_times[max] - measured_times[min]) < maxdiff) + return estimate; + + /* ok - drop the worse value and try again... */ + good_timer_sum = 0; + good_timer_count = 0; + if ((measured_times[max] - estimate) < + (estimate - measured_times[min])) { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "min bogoMips estimate %d = %lu\n", + min, measured_times[min]); + measured_times[min] = 0; + min = max; + } else { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "max bogoMips estimate %d = %lu\n", + max, measured_times[max]); + measured_times[max] = 0; + max = min; + } + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + if (measured_times[i] == 0) + continue; + good_timer_count++; + good_timer_sum += measured_times[i]; + if (measured_times[i] < measured_times[min]) + min = i; + if (measured_times[i] > measured_times[max]) + max = i; + } + + } - printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " - "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); + printk(KERN_NOTICE "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform " + "interrupts. Consider using \"lpj=\" boot option.\n"); return 0; } #else -static unsigned long __devinit calibrate_delay_direct(void) {return 0;} +static unsigned long calibrate_delay_direct(void) +{ + return 0; +} #endif /* * This is the number of bits of precision for the loops_per_jiffy. Each - * bit takes on average 1.5/HZ seconds. This (like the original) is a little - * better than 1% + * time we refine our estimate after the first takes 1.5/HZ seconds, so try + * to start with a good estimate. + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as + * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 -void __devinit calibrate_delay(void) +static unsigned long calibrate_delay_converge(void) { - unsigned long ticks, loopbit; - int lps_precision = LPS_PREC; - - if (preset_lpj) { - loops_per_jiffy = preset_lpj; - printk("Calibrating delay loop (skipped)... " - "%lu.%02lu BogoMIPS preset\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100); - } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk("Calibrating delay using timer specific routine.. "); - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); - } else { - loops_per_jiffy = (1<<12); - - printk(KERN_DEBUG "Calibrating delay loop... "); - while ((loops_per_jiffy <<= 1) != 0) { - /* wait for "start of" clock tick */ - ticks = jiffies; - while (ticks == jiffies) - /* nothing */; - /* Go .. */ - ticks = jiffies; - __delay(loops_per_jiffy); - ticks = jiffies - ticks; - if (ticks) - break; - } + /* First stage - slowly accelerate to find initial bounds */ + unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit; + int trials = 0, band = 0, trial_in_band = 0; - /* - * Do a binary approximation to get loops_per_jiffy set to - * equal one clock (up to lps_precision bits) - */ - loops_per_jiffy >>= 1; - loopbit = loops_per_jiffy; - while (lps_precision-- && (loopbit >>= 1)) { - loops_per_jiffy |= loopbit; - ticks = jiffies; - while (ticks == jiffies) - /* nothing */; - ticks = jiffies; - __delay(loops_per_jiffy); - if (jiffies != ticks) /* longer than 1 tick */ - loops_per_jiffy &= ~loopbit; + lpj = (1<<12); + + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + /* Go .. */ + ticks = jiffies; + do { + if (++trial_in_band == (1<<band)) { + ++band; + trial_in_band = 0; } + __delay(lpj * band); + trials += band; + } while (ticks == jiffies); + /* + * We overshot, so retreat to a clear underestimate. Then estimate + * the largest likely undershoot. This defines our chop bounds. + */ + trials -= band; + loopadd_base = lpj * band; + lpj_base = lpj * trials; + +recalibrate: + lpj = lpj_base; + loopadd = loopadd_base; + + /* + * Do a binary approximation to get lpj set to + * equal one clock (up to LPS_PREC bits) + */ + chop_limit = lpj >> LPS_PREC; + while (loopadd > chop_limit) { + lpj += loopadd; + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + ticks = jiffies; + __delay(lpj); + if (jiffies != ticks) /* longer than 1 tick */ + lpj -= loopadd; + loopadd >>= 1; + } + /* + * If we incremented every single time possible, presume we've + * massively underestimated initially, and retry with a higher + * start, and larger range. (Only seen on x86_64, due to SMIs) + */ + if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) { + lpj_base = lpj; + loopadd_base <<= 2; + goto recalibrate; + } + + return lpj; +} - /* Round the value and print it */ - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); +static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 }; + +/* + * Check if cpu calibration delay is already known. For example, + * some processors with multi-core sockets may have all cores + * with the same calibration delay. + * + * Architectures should override this function if a faster calibration + * method is available. + */ +unsigned long __attribute__((weak)) calibrate_delay_is_known(void) +{ + return 0; +} + +void calibrate_delay(void) +{ + unsigned long lpj; + static bool printed; + int this_cpu = smp_processor_id(); + + if (per_cpu(cpu_loops_per_jiffy, this_cpu)) { + lpj = per_cpu(cpu_loops_per_jiffy, this_cpu); + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "already calibrated this CPU"); + } else if (preset_lpj) { + lpj = preset_lpj; + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "preset value.. "); + } else if ((!printed) && lpj_fine) { + lpj = lpj_fine; + pr_info("Calibrating delay loop (skipped), " + "value calculated using timer frequency.. "); + } else if ((lpj = calibrate_delay_is_known())) { + ; + } else if ((lpj = calibrate_delay_direct()) != 0) { + if (!printed) + pr_info("Calibrating delay using timer " + "specific routine.. "); + } else { + if (!printed) + pr_info("Calibrating delay loop... "); + lpj = calibrate_delay_converge(); } + per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj; + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", + lpj/(500000/HZ), + (lpj/(5000/HZ)) % 100, lpj); + loops_per_jiffy = lpj; + printed = true; } diff --git a/init/do_mounts.c b/init/do_mounts.c index b27c1106440..82f22885c87 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -1,3 +1,13 @@ +/* + * Many of the syscalls used in this file expect some of the arguments + * to be __user pointers not __kernel pointers. To limit the sparse + * noise, turn off sparse checking for this file. + */ +#ifdef __CHECKER__ +#undef __CHECKER__ +#warning "Sparse checking disabled for this file" +#endif + #include <linux/module.h> #include <linux/sched.h> #include <linux/ctype.h> @@ -7,7 +17,17 @@ #include <linux/root_dev.h> #include <linux/security.h> #include <linux/delay.h> +#include <linux/genhd.h> #include <linux/mount.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/initrd.h> +#include <linux/async.h> +#include <linux/fs_struct.h> +#include <linux/slab.h> +#include <linux/ramfs.h> +#include <linux/shmem_fs.h> #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> @@ -15,15 +35,13 @@ #include "do_mounts.h" -extern int get_filesystem_list(char * buf); - int __initdata rd_doload; /* 1 = load RAM disk, 0 = don't load */ -int root_mountflags = MS_RDONLY | MS_VERBOSE; -char * __initdata root_device_name; +int root_mountflags = MS_RDONLY | MS_SILENT; +static char * __initdata root_device_name; static char __initdata saved_root_name[64]; +static int root_wait; -/* this is initialized in init/main.c */ dev_t ROOT_DEV; static int __init load_ramdisk(char *str) @@ -52,68 +70,114 @@ static int __init readwrite(char *str) __setup("ro", readonly); __setup("rw", readwrite); -static dev_t try_name(char *name, int part) -{ - char path[64]; - char buf[32]; - int range; - dev_t res; - char *s; +#ifdef CONFIG_BLOCK +struct uuidcmp { + const char *uuid; int len; - int fd; - unsigned int maj, min; +}; - /* read device number from .../dev */ +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to the desired struct uuidcmp to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_uuid(struct device *dev, const void *data) +{ + const struct uuidcmp *cmp = data; + struct hd_struct *part = dev_to_part(dev); - sprintf(path, "/sys/block/%s/dev", name); - fd = sys_open(path, 0, 0); - if (fd < 0) - goto fail; - len = sys_read(fd, buf, 32); - sys_close(fd); - if (len <= 0 || len == 32 || buf[len - 1] != '\n') - goto fail; - buf[len - 1] = '\0'; - if (sscanf(buf, "%u:%u", &maj, &min) == 2) { - /* - * Try the %u:%u format -- see print_dev_t() - */ - res = MKDEV(maj, min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto fail; + if (!part->info) + goto no_match; + + if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: char array containing ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t devt_from_partuuid(const char *uuid_str) +{ + dev_t res = 0; + struct uuidcmp cmp; + struct device *dev = NULL; + struct gendisk *disk; + struct hd_struct *part; + int offset = 0; + bool clear_root_wait = false; + char *slash; + + cmp.uuid = uuid_str; + + slash = strchr(uuid_str, '/'); + /* Check for optional partition number offset attributes. */ + if (slash) { + char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(slash + 1, + "PARTNROFF=%d%c", &offset, &c) != 1) { + clear_root_wait = true; + goto done; + } + cmp.len = slash - uuid_str; } else { - /* - * Nope. Try old-style "0321" - */ - res = new_decode_dev(simple_strtoul(buf, &s, 16)); - if (*s) - goto fail; + cmp.len = strlen(uuid_str); } - /* if it's there and we are not looking for a partition - that's it */ - if (!part) - return res; + if (!cmp.len) { + clear_root_wait = true; + goto done; + } - /* otherwise read range from .../range */ - sprintf(path, "/sys/block/%s/range", name); - fd = sys_open(path, 0, 0); - if (fd < 0) - goto fail; - len = sys_read(fd, buf, 32); - sys_close(fd); - if (len <= 0 || len == 32 || buf[len - 1] != '\n') - goto fail; - buf[len - 1] = '\0'; - range = simple_strtoul(buf, &s, 10); - if (*s) - goto fail; + dev = class_find_device(&block_class, NULL, &cmp, + &match_dev_by_uuid); + if (!dev) + goto done; - /* if partition is within range - we got it */ - if (part < range) - return res + part; -fail: - return 0; + res = dev->devt; + + /* Attempt to find the partition by offset. */ + if (!offset) + goto no_offset; + + res = 0; + disk = part_to_disk(dev_to_part(dev)); + part = disk_get_part(disk, dev_to_part(dev)->partno + offset); + if (part) { + res = part_devt(part); + put_device(part_to_dev(part)); + } + +no_offset: + put_device(dev); +done: + if (clear_root_wait) { + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + } + return res; } +#endif /* * Convert a name into device number. We accept the following variants: @@ -125,13 +189,21 @@ fail: * of partition - device number of disk plus the partition number * 5) /dev/<disk_name>p<decimal> - same as the above, that form is * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 8) <major>:<minor> major and minor number of the device separated by + * a colon. * - * If name doesn't have fall into the categories above, we return 0. - * Sysfs is used to check if something is a disk name - it has - * all known disks under bus/block/devices. If the disk name - * contains slashes, name of sysfs node has them replaced with - * bangs. try_name() does the actual checks, assuming that sysfs - * is mounted on rootfs /sys. + * If name doesn't have fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. */ dev_t name_to_dev_t(char *name) @@ -141,10 +213,14 @@ dev_t name_to_dev_t(char *name) dev_t res = 0; int part; -#ifdef CONFIG_SYSFS - int mkdir_err = sys_mkdir("/sys", 0700); - if (sys_mount("sysfs", "/sys", "sysfs", 0, NULL) < 0) - goto out; +#ifdef CONFIG_BLOCK + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } #endif if (strncmp(name, "/dev/", 5) != 0) { @@ -161,6 +237,7 @@ dev_t name_to_dev_t(char *name) } goto done; } + name += 5; res = Root_NFS; if (strcmp(name, "nfs") == 0) @@ -175,35 +252,38 @@ dev_t name_to_dev_t(char *name) for (p = s; *p; p++) if (*p == '/') *p = '!'; - res = try_name(s, 0); + res = blk_lookup_devt(s, 0); if (res) goto done; + /* + * try non-existent, but valid partition, which may only exist + * after revalidating the disk, like partitioned md devices + */ while (p > s && isdigit(p[-1])) p--; if (p == s || !*p || *p == '0') goto fail; + + /* try disk name without <part number> */ part = simple_strtoul(p, NULL, 10); *p = '\0'; - res = try_name(s, part); + res = blk_lookup_devt(s, part); if (res) goto done; + /* try disk name without p<part number> */ if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') goto fail; p[-1] = '\0'; - res = try_name(s, part); + res = blk_lookup_devt(s, part); + if (res) + goto done; + +fail: + return 0; done: -#ifdef CONFIG_SYSFS - sys_umount("/sys", 0); -out: - if (!mkdir_err) - sys_rmdir("/sys"); -#endif return res; -fail: - res = 0; - goto done; } static int __init root_dev_setup(char *line) @@ -214,6 +294,16 @@ static int __init root_dev_setup(char *line) __setup("root=", root_dev_setup); +static int __init rootwait_setup(char *str) +{ + if (*str) + return 0; + root_wait = 1; + return 1; +} + +__setup("rootwait", rootwait_setup); + static char * __initdata root_mount_data; static int __init root_data_setup(char *str) { @@ -268,24 +358,33 @@ static void __init get_fs_names(char *page) static int __init do_mount_root(char *name, char *fs, int flags, void *data) { + struct super_block *s; int err = sys_mount(name, "/root", fs, flags, data); if (err) return err; sys_chdir("/root"); - ROOT_DEV = current->fs->pwdmnt->mnt_sb->s_dev; - printk("VFS: Mounted root (%s filesystem)%s.\n", - current->fs->pwdmnt->mnt_sb->s_type->name, - current->fs->pwdmnt->mnt_sb->s_flags & MS_RDONLY ? - " readonly" : ""); + s = current->fs->pwd.dentry->d_sb; + ROOT_DEV = s->s_dev; + printk(KERN_INFO + "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", + s->s_type->name, + s->s_flags & MS_RDONLY ? " readonly" : "", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); return 0; } void __init mount_block_root(char *name, int flags) { - char *fs_names = __getname(); + struct page *page = alloc_page(GFP_KERNEL | + __GFP_NOTRACK_FALSE_POSITIVE); + char *fs_names = page_address(page); char *p; +#ifdef CONFIG_BLOCK char b[BDEVNAME_SIZE]; +#else + const char *b = name; +#endif get_fs_names(fs_names); retry: @@ -303,28 +402,73 @@ retry: /* * Allow the user to distinguish between failed sys_open * and bad superblock on root device. + * and give them a list of the available devices */ +#ifdef CONFIG_BLOCK __bdevname(ROOT_DEV, b); - printk("VFS: Cannot open root device \"%s\" or %s\n", - root_device_name, b); - printk("Please append a correct \"root=\" boot option\n"); - +#endif + printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", + root_device_name, b, err); + printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); + + printk_all_partitions(); +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " + "explicit textual name for \"root=\" boot option.\n"); +#endif panic("VFS: Unable to mount root fs on %s", b); } - panic("VFS: Unable to mount root fs on %s", __bdevname(ROOT_DEV, b)); + + printk("List of all partitions:\n"); + printk_all_partitions(); + printk("No filesystem could mount root, tried: "); + for (p = fs_names; *p; p += strlen(p)+1) + printk(" %s", p); + printk("\n"); +#ifdef CONFIG_BLOCK + __bdevname(ROOT_DEV, b); +#endif + panic("VFS: Unable to mount root fs on %s", b); out: - putname(fs_names); + put_page(page); } #ifdef CONFIG_ROOT_NFS + +#define NFSROOT_TIMEOUT_MIN 5 +#define NFSROOT_TIMEOUT_MAX 30 +#define NFSROOT_RETRY_MAX 5 + static int __init mount_nfs_root(void) { - void *data = nfs_root_data(); + char *root_dev, *root_data; + unsigned int timeout; + int try, err; + + err = nfs_root_data(&root_dev, &root_data); + if (err != 0) + return 0; - create_dev("/dev/root", ROOT_DEV, NULL); - if (data && - do_mount_root("/dev/root", "nfs", root_mountflags, data) == 0) - return 1; + /* + * The server or network may not be ready, so try several + * times. Stop after a few tries in case the client wants + * to fall back to other boot methods. + */ + timeout = NFSROOT_TIMEOUT_MIN; + for (try = 1; ; try++) { + err = do_mount_root(root_dev, "nfs", + root_mountflags, root_data); + if (err == 0) + return 1; + if (try > NFSROOT_RETRY_MAX) + break; + + /* Wait, in case the server refused us immediately */ + ssleep(timeout); + timeout <<= 1; + if (timeout > NFSROOT_TIMEOUT_MAX) + timeout = NFSROOT_TIMEOUT_MAX; + } return 0; } #endif @@ -362,7 +506,7 @@ void __init change_floppy(char *fmt, ...) void __init mount_root(void) { #ifdef CONFIG_ROOT_NFS - if (MAJOR(ROOT_DEV) == UNNAMED_MAJOR) { + if (ROOT_DEV == Root_NFS) { if (mount_nfs_root()) return; @@ -382,8 +526,10 @@ void __init mount_root(void) change_floppy("root floppy"); } #endif - create_dev("/dev/root", ROOT_DEV, root_device_name); +#ifdef CONFIG_BLOCK + create_dev("/dev/root", ROOT_DEV); mount_block_root("/dev/root", root_mountflags); +#endif } /* @@ -393,37 +539,99 @@ void __init prepare_namespace(void) { int is_floppy; - mount_devfs(); - if (root_delay) { - printk(KERN_INFO "Waiting %dsec before mounting root device...\n", + printk(KERN_INFO "Waiting %d sec before mounting root device...\n", root_delay); ssleep(root_delay); } + /* + * wait for the known devices to complete their probing + * + * Note: this is a potential source of long boot delays. + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. + */ + wait_for_device_probe(); + md_run_setup(); if (saved_root_name[0]) { root_device_name = saved_root_name; + if (!strncmp(root_device_name, "mtd", 3) || + !strncmp(root_device_name, "ubi", 3)) { + mount_block_root(root_device_name, root_mountflags); + goto out; + } ROOT_DEV = name_to_dev_t(root_device_name); if (strncmp(root_device_name, "/dev/", 5) == 0) root_device_name += 5; } - is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; - if (initrd_load()) goto out; + /* wait for any asynchronous scanning to complete */ + if ((ROOT_DEV == 0) && root_wait) { + printk(KERN_INFO "Waiting for root device %s...\n", + saved_root_name); + while (driver_probe_done() != 0 || + (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) + msleep(100); + async_synchronize_full(); + } + + is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; + if (is_floppy && rd_doload && rd_load_disk(0)) ROOT_DEV = Root_RAM0; mount_root(); out: - umount_devfs("/dev"); + devtmpfs_mount("dev"); sys_mount(".", "/", NULL, MS_MOVE, NULL); sys_chroot("."); - security_sb_post_mountroot(); - mount_devfs_fs (); } +static bool is_tmpfs; +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static unsigned long once; + void *fill = ramfs_fill_super; + + if (test_and_set_bit(0, &once)) + return ERR_PTR(-ENODEV); + + if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) + fill = shmem_fill_super; + + return mount_nodev(fs_type, flags, data, fill); +} + +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + if (err) + return err; + + if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] && + (!root_fs_names || strstr(root_fs_names, "tmpfs"))) { + err = shmem_init(); + is_tmpfs = true; + } else { + err = init_ramfs_fs(); + } + + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} diff --git a/init/do_mounts.h b/init/do_mounts.h index e0a7ac9649e..f5b978a9bb9 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -1,6 +1,5 @@ -#include <linux/config.h> #include <linux/kernel.h> -#include <linux/devfs_fs_kernel.h> +#include <linux/blkdev.h> #include <linux/init.h> #include <linux/syscalls.h> #include <linux/unistd.h> @@ -13,27 +12,13 @@ void change_floppy(char *fmt, ...); void mount_block_root(char *name, int flags); void mount_root(void); extern int root_mountflags; -extern char *root_device_name; -#ifdef CONFIG_DEVFS_FS - -void mount_devfs(void); -void umount_devfs(char *path); -int create_dev(char *name, dev_t dev, char *devfs_name); - -#else - -static inline void mount_devfs(void) {} -static inline void umount_devfs(const char *path) {} - -static inline int create_dev(char *name, dev_t dev, char *devfs_name) +static inline int create_dev(char *name, dev_t dev) { sys_unlink(name); return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); } -#endif - #if BITS_PER_LONG == 32 static inline u32 bstat(char *name) { diff --git a/init/do_mounts_devfs.c b/init/do_mounts_devfs.c deleted file mode 100644 index cc526474690..00000000000 --- a/init/do_mounts_devfs.c +++ /dev/null @@ -1,137 +0,0 @@ - -#include <linux/kernel.h> -#include <linux/dirent.h> -#include <linux/string.h> - -#include "do_mounts.h" - -void __init mount_devfs(void) -{ - sys_mount("devfs", "/dev", "devfs", 0, NULL); -} - -void __init umount_devfs(char *path) -{ - sys_umount(path, 0); -} - -/* - * If the dir will fit in *buf, return its length. If it won't fit, return - * zero. Return -ve on error. - */ -static int __init do_read_dir(int fd, void *buf, int len) -{ - long bytes, n; - char *p = buf; - sys_lseek(fd, 0, 0); - - for (bytes = 0; bytes < len; bytes += n) { - n = sys_getdents64(fd, (struct linux_dirent64 *)(p + bytes), - len - bytes); - if (n < 0) - return n; - if (n == 0) - return bytes; - } - return 0; -} - -/* - * Try to read all of a directory. Returns the contents at *p, which - * is kmalloced memory. Returns the number of bytes read at *len. Returns - * NULL on error. - */ -static void * __init read_dir(char *path, int *len) -{ - int size; - int fd = sys_open(path, 0, 0); - - *len = 0; - if (fd < 0) - return NULL; - - for (size = 1 << 9; size <= (PAGE_SIZE << MAX_ORDER); size <<= 1) { - void *p = kmalloc(size, GFP_KERNEL); - int n; - if (!p) - break; - n = do_read_dir(fd, p, size); - if (n > 0) { - sys_close(fd); - *len = n; - return p; - } - kfree(p); - if (n == -EINVAL) - continue; /* Try a larger buffer */ - if (n < 0) - break; - } - sys_close(fd); - return NULL; -} - -/* - * recursively scan <path>, looking for a device node of type <dev> - */ -static int __init find_in_devfs(char *path, unsigned dev) -{ - char *end = path + strlen(path); - int rest = path + 64 - end; - int size; - char *p = read_dir(path, &size); - char *s; - - if (!p) - return -1; - for (s = p; s < p + size; s += ((struct linux_dirent64 *)s)->d_reclen) { - struct linux_dirent64 *d = (struct linux_dirent64 *)s; - if (strlen(d->d_name) + 2 > rest) - continue; - switch (d->d_type) { - case DT_BLK: - sprintf(end, "/%s", d->d_name); - if (bstat(path) != dev) - break; - kfree(p); - return 0; - case DT_DIR: - if (strcmp(d->d_name, ".") == 0) - break; - if (strcmp(d->d_name, "..") == 0) - break; - sprintf(end, "/%s", d->d_name); - if (find_in_devfs(path, dev) < 0) - break; - kfree(p); - return 0; - } - } - kfree(p); - return -1; -} - -/* - * create a device node called <name> which points to - * <devfs_name> if possible, otherwise find a device node - * which matches <dev> and make <name> a symlink pointing to it. - */ -int __init create_dev(char *name, dev_t dev, char *devfs_name) -{ - char path[64]; - - sys_unlink(name); - if (devfs_name && devfs_name[0]) { - if (strncmp(devfs_name, "/dev/", 5) == 0) - devfs_name += 5; - sprintf(path, "/dev/%s", devfs_name); - if (sys_access(path, 0) == 0) - return sys_symlink(devfs_name, name); - } - if (!dev) - return -1; - strcpy(path, "/dev"); - if (find_in_devfs(path, new_encode_dev(dev)) < 0) - return -1; - return sys_symlink(path + 5, name); -} diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a05cabd0fd1..3e0878e8a80 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -1,19 +1,28 @@ -#define __KERNEL_SYSCALLS__ +/* + * Many of the syscalls used in this file expect some of the arguments + * to be __user pointers not __kernel pointers. To limit the sparse + * noise, turn off sparse checking for this file. + */ +#ifdef __CHECKER__ +#undef __CHECKER__ +#warning "Sparse checking disabled for this file" +#endif + #include <linux/unistd.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/minix_fs.h> -#include <linux/ext2_fs.h> #include <linux/romfs_fs.h> #include <linux/initrd.h> #include <linux/sched.h> +#include <linux/freezer.h> +#include <linux/kmod.h> #include "do_mounts.h" unsigned long initrd_start, initrd_end; int initrd_below_start_ok; unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ -static int __initdata old_fd, root_fd; static int __initdata mount_initrd = 1; static int __init no_initrd(char *str) @@ -24,59 +33,63 @@ static int __init no_initrd(char *str) __setup("noinitrd", no_initrd); -static int __init do_linuxrc(void * shell) +static int init_linuxrc(struct subprocess_info *info, struct cred *new) { - static char *argv[] = { "linuxrc", NULL, }; - extern char * envp_init[]; - - sys_close(old_fd);sys_close(root_fd); - sys_close(0);sys_close(1);sys_close(2); + sys_unshare(CLONE_FS | CLONE_FILES); + /* stdin/stdout/stderr for /linuxrc */ + sys_open("/dev/console", O_RDWR, 0); + sys_dup(0); + sys_dup(0); + /* move initrd over / and chdir/chroot in initrd root */ + sys_chdir("/root"); + sys_mount(".", "/", NULL, MS_MOVE, NULL); + sys_chroot("."); sys_setsid(); - (void) sys_open("/dev/console",O_RDWR,0); - (void) sys_dup(0); - (void) sys_dup(0); - return execve(shell, argv, envp_init); + return 0; } static void __init handle_initrd(void) { + struct subprocess_info *info; + static char *argv[] = { "linuxrc", NULL, }; + extern char *envp_init[]; int error; - int pid; real_root_dev = new_encode_dev(ROOT_DEV); - create_dev("/dev/root.old", Root_RAM0, NULL); + create_dev("/dev/root.old", Root_RAM0); /* mount initrd on rootfs' /root */ mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); sys_mkdir("/old", 0700); - root_fd = sys_open("/", 0, 0); - old_fd = sys_open("/old", 0, 0); - /* move initrd over / and chdir/chroot in initrd root */ - sys_chdir("/root"); - sys_mount(".", "/", NULL, MS_MOVE, NULL); - sys_chroot("."); - mount_devfs_fs (); + sys_chdir("/old"); - pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); - if (pid > 0) { - while (pid != sys_wait4(-1, NULL, 0, NULL)) - yield(); - } + /* try loading default modules from initrd */ + load_default_modules(); + + /* + * In case that a resume from disk is carried out by linuxrc or one of + * its children, we need to tell the freezer not to wait for us. + */ + current->flags |= PF_FREEZER_SKIP; + + info = call_usermodehelper_setup("/linuxrc", argv, envp_init, + GFP_KERNEL, init_linuxrc, NULL, NULL); + if (!info) + return; + call_usermodehelper_exec(info, UMH_WAIT_PROC); + + current->flags &= ~PF_FREEZER_SKIP; /* move initrd to rootfs' /old */ - sys_fchdir(old_fd); - sys_mount("/", ".", NULL, MS_MOVE, NULL); + sys_mount("..", ".", NULL, MS_MOVE, NULL); /* switch root and cwd back to / of rootfs */ - sys_fchdir(root_fd); - sys_chroot("."); - sys_close(old_fd); - sys_close(root_fd); - umount_devfs("/old/dev"); + sys_chroot(".."); if (new_decode_dev(real_root_dev) == Root_RAM0) { sys_chdir("/old"); return; } + sys_chdir("/"); ROOT_DEV = new_decode_dev(real_root_dev); mount_root(); @@ -106,7 +119,7 @@ static void __init handle_initrd(void) int __init initrd_load(void) { if (mount_initrd) { - create_dev("/dev/ram", Root_RAM0, NULL); + create_dev("/dev/ram", Root_RAM0); /* * Load the initrd data into /dev/ram0. Execute it as initrd * unless /dev/ram0 is supposed to be our actual root device, diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 3fbc3555ce9..8cb6db54285 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -1,5 +1,16 @@ +/* + * Many of the syscalls used in this file expect some of the arguments + * to be __user pointers not __kernel pointers. To limit the sparse + * noise, turn off sparse checking for this file. + */ +#ifdef __CHECKER__ +#undef __CHECKER__ +#warning "Sparse checking disabled for this file" +#endif -#include <linux/raid/md.h> +#include <linux/delay.h> +#include <linux/raid/md_u.h> +#include <linux/raid/md_p.h> #include "do_mounts.h" @@ -12,19 +23,23 @@ * The code for that is here. */ -static int __initdata raid_noautodetect, raid_autopart; +#ifdef CONFIG_MD_AUTODETECT +static int __initdata raid_noautodetect; +#else +static int __initdata raid_noautodetect=1; +#endif +static int __initdata raid_autopart; static struct { int minor; int partitioned; - int pers; + int level; int chunk; char *device_names; -} md_setup_args[MAX_MD_DEVS] __initdata; +} md_setup_args[256] __initdata; static int md_setup_ents __initdata; -extern int mdp_major; /* * Parse the command-line parameters given our kernel, but do not * actually try to invoke the MD device now; that is handled by @@ -47,7 +62,7 @@ extern int mdp_major; */ static int __init md_setup(char *str) { - int minor, level, factor, fault, pers, partitioned = 0; + int minor, level, factor, fault, partitioned = 0; char *pername = ""; char *str1; int ent; @@ -61,10 +76,6 @@ static int __init md_setup(char *str) return 0; } str1 = str; - if (minor >= MAX_MD_DEVS) { - printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor); - return 0; - } for (ent=0 ; ent< md_setup_ents ; ent++) if (md_setup_args[ent].minor == minor && md_setup_args[ent].partitioned == partitioned) { @@ -72,13 +83,13 @@ static int __init md_setup(char *str) "Replacing previous definition.\n", partitioned?"d":"", minor); break; } - if (ent >= MAX_MD_DEVS) { + if (ent >= ARRAY_SIZE(md_setup_args)) { printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor); return 0; } if (ent >= md_setup_ents) md_setup_ents++; - switch (get_option(&str, &level)) { /* RAID Personality */ + switch (get_option(&str, &level)) { /* RAID level */ case 2: /* could be 0 or -1.. */ if (level == 0 || level == LEVEL_LINEAR) { if (get_option(&str, &factor) != 2 || /* Chunk Size */ @@ -86,16 +97,12 @@ static int __init md_setup(char *str) printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); return 0; } - md_setup_args[ent].pers = level; + md_setup_args[ent].level = level; md_setup_args[ent].chunk = 1 << (factor+12); - if (level == LEVEL_LINEAR) { - pers = LINEAR; + if (level == LEVEL_LINEAR) pername = "linear"; - } else { - pers = RAID0; + else pername = "raid0"; - } - md_setup_args[ent].pers = pers; break; } /* FALL THROUGH */ @@ -103,7 +110,7 @@ static int __init md_setup(char *str) str = str1; /* FALL THROUGH */ case 0: - md_setup_args[ent].pers = 0; + md_setup_args[ent].level = LEVEL_NONE; pername="super-block"; } @@ -116,8 +123,6 @@ static int __init md_setup(char *str) return 1; } -#define MdpMinorShift 6 - static void __init md_setup_drive(void) { int minor, i, ent, partitioned; @@ -129,20 +134,19 @@ static void __init md_setup_drive(void) int err = 0; char *devname; mdu_disk_info_t dinfo; - char name[16], devfs_name[16]; + char name[16]; minor = md_setup_args[ent].minor; partitioned = md_setup_args[ent].partitioned; devname = md_setup_args[ent].device_names; sprintf(name, "/dev/md%s%d", partitioned?"_d":"", minor); - sprintf(devfs_name, "/dev/md/%s%d", partitioned?"d":"", minor); if (partitioned) dev = MKDEV(mdp_major, minor << MdpMinorShift); else dev = MKDEV(MD_MAJOR, minor); - create_dev(name, dev, devfs_name); - for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + create_dev(name, dev); + for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) { char *p; char comp_name[64]; u32 rdev; @@ -190,10 +194,10 @@ static void __init md_setup_drive(void) continue; } - if (md_setup_args[ent].pers) { + if (md_setup_args[ent].level != LEVEL_NONE) { /* non-persistent */ mdu_array_info_t ainfo; - ainfo.level = pers_to_level(md_setup_args[ent].pers); + ainfo.level = md_setup_args[ent].level; ainfo.size = 0; ainfo.nr_disks =0; ainfo.raid_disks =0; @@ -262,6 +266,8 @@ static int __init raid_setup(char *str) if (!strncmp(str, "noautodetect", wlen)) raid_noautodetect = 1; + if (!strncmp(str, "autodetect", wlen)) + raid_noautodetect = 0; if (strncmp(str, "partitionable", wlen)==0) raid_autopart = 1; if (strncmp(str, "part", wlen)==0) @@ -274,17 +280,33 @@ static int __init raid_setup(char *str) __setup("raid=", raid_setup); __setup("md=", md_setup); +static void __init autodetect_raid(void) +{ + int fd; + + /* + * Since we don't want to detect and use half a raid array, we need to + * wait for the known devices to complete their probing + */ + printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n"); + printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n"); + + wait_for_device_probe(); + + fd = sys_open("/dev/md0", 0, 0); + if (fd >= 0) { + sys_ioctl(fd, RAID_AUTORUN, raid_autopart); + sys_close(fd); + } +} + void __init md_run_setup(void) { - create_dev("/dev/md0", MKDEV(MD_MAJOR, 0), "md/0"); + create_dev("/dev/md0", MKDEV(MD_MAJOR, 0)); + if (raid_noautodetect) - printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n"); - else { - int fd = sys_open("/dev/md0", 0, 0); - if (fd >= 0) { - sys_ioctl(fd, RAID_AUTORUN, raid_autopart); - sys_close(fd); - } - } + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n"); + else + autodetect_raid(); md_setup_drive(); } diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index c10b08a8098..a8227022e3a 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -1,16 +1,28 @@ +/* + * Many of the syscalls used in this file expect some of the arguments + * to be __user pointers not __kernel pointers. To limit the sparse + * noise, turn off sparse checking for this file. + */ +#ifdef __CHECKER__ +#undef __CHECKER__ +#warning "Sparse checking disabled for this file" +#endif #include <linux/kernel.h> #include <linux/fs.h> #include <linux/minix_fs.h> #include <linux/ext2_fs.h> #include <linux/romfs_fs.h> -#include <linux/cramfs_fs.h> +#include <uapi/linux/cramfs_fs.h> #include <linux/initrd.h> #include <linux/string.h> +#include <linux/slab.h> #include "do_mounts.h" +#include "../fs/squashfs/squashfs_fs.h" + +#include <linux/decompress/generic.h> -#define BUILD_CRAMDISK int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ @@ -30,7 +42,7 @@ static int __init ramdisk_start_setup(char *str) } __setup("ramdisk_start=", ramdisk_start_setup); -static int __init crd_load(int in_fd, int out_fd); +static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); /* * This routine tries to find a RAM disk image to load, and returns the @@ -39,46 +51,55 @@ static int __init crd_load(int in_fd, int out_fd); * numbers could not be found. * * We currently check for the following magic numbers: - * minix - * ext2 + * minix + * ext2 * romfs * cramfs - * gzip + * squashfs + * gzip + * bzip2 + * lzma + * xz + * lzo + * lz4 */ -static int __init -identify_ramdisk_image(int fd, int start_block) +static int __init +identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) { const int size = 512; struct minix_super_block *minixsb; - struct ext2_super_block *ext2sb; struct romfs_super_block *romfsb; struct cramfs_super *cramfsb; + struct squashfs_super_block *squashfsb; int nblocks = -1; unsigned char *buf; + const char *compress_name; + unsigned long n; buf = kmalloc(size, GFP_KERNEL); - if (buf == 0) - return -1; + if (!buf) + return -ENOMEM; minixsb = (struct minix_super_block *) buf; - ext2sb = (struct ext2_super_block *) buf; romfsb = (struct romfs_super_block *) buf; cramfsb = (struct cramfs_super *) buf; + squashfsb = (struct squashfs_super_block *) buf; memset(buf, 0xe5, size); /* - * Read block 0 to test for gzipped kernel + * Read block 0 to test for compressed kernel */ sys_lseek(fd, start_block * BLOCK_SIZE, 0); sys_read(fd, buf, size); - /* - * If it matches the gzip magic numbers, return -1 - */ - if (buf[0] == 037 && ((buf[1] == 0213) || (buf[1] == 0236))) { - printk(KERN_NOTICE - "RAMDISK: Compressed image found at block %d\n", - start_block); + *decompressor = decompress_method(buf, size, &compress_name); + if (compress_name) { + printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n", + compress_name, start_block); + if (!*decompressor) + printk(KERN_EMERG + "RAMDISK: %s decompressor not configured!\n", + compress_name); nblocks = 0; goto done; } @@ -101,6 +122,30 @@ identify_ramdisk_image(int fd, int start_block) goto done; } + /* squashfs is at block zero too */ + if (le32_to_cpu(squashfsb->s_magic) == SQUASHFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: squashfs filesystem found at block %d\n", + start_block); + nblocks = (le64_to_cpu(squashfsb->bytes_used) + BLOCK_SIZE - 1) + >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read 512 bytes further to check if cramfs is padded + */ + sys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0); + sys_read(fd, buf, size); + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + /* * Read block 1 to test for minix and ext2 superblock */ @@ -118,19 +163,19 @@ identify_ramdisk_image(int fd, int start_block) } /* Try ext2 */ - if (ext2sb->s_magic == cpu_to_le16(EXT2_SUPER_MAGIC)) { + n = ext2_image_size(buf); + if (n) { printk(KERN_NOTICE "RAMDISK: ext2 filesystem found at block %d\n", start_block); - nblocks = le32_to_cpu(ext2sb->s_blocks_count) << - le32_to_cpu(ext2sb->s_log_block_size); + nblocks = n; goto done; } printk(KERN_NOTICE "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n", start_block); - + done: sys_lseek(fd, start_block * BLOCK_SIZE, 0); kfree(buf); @@ -145,7 +190,8 @@ int __init rd_load_image(char *from) int nblocks, i, disk; char *buf = NULL; unsigned short rotate = 0; -#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES) + decompress_fn decompressor = NULL; +#if !defined(CONFIG_S390) char rotator[4] = { '|' , '/' , '-' , '\\' }; #endif @@ -157,19 +203,13 @@ int __init rd_load_image(char *from) if (in_fd < 0) goto noclose_input; - nblocks = identify_ramdisk_image(in_fd, rd_image_start); + nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor); if (nblocks < 0) goto done; if (nblocks == 0) { -#ifdef BUILD_CRAMDISK - if (crd_load(in_fd, out_fd) == 0) + if (crd_load(in_fd, out_fd, decompressor) == 0) goto successful_load; -#else - printk(KERN_NOTICE - "RAMDISK: Kernel does not support compressed " - "RAM disk images\n"); -#endif goto done; } @@ -194,7 +234,7 @@ int __init rd_load_image(char *from) nblocks, rd_blocks); goto done; } - + /* * OK, time to copy in the data */ @@ -212,7 +252,7 @@ int __init rd_load_image(char *from) } buf = kmalloc(BLOCK_SIZE, GFP_KERNEL); - if (buf == 0) { + if (!buf) { printk(KERN_ERR "RAMDISK: could not allocate buffer\n"); goto done; } @@ -237,7 +277,7 @@ int __init rd_load_image(char *from) } sys_read(in_fd, buf, BLOCK_SIZE); sys_write(out_fd, buf, BLOCK_SIZE); -#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES) +#if !defined(CONFIG_S390) if (!(i % 16)) { printk("%c\b", rotator[rotate & 0x3]); rotate++; @@ -262,168 +302,60 @@ int __init rd_load_disk(int n) { if (rd_prompt) change_floppy("root floppy disk to be loaded into RAM disk"); - create_dev("/dev/root", ROOT_DEV, root_device_name); - create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n), NULL); + create_dev("/dev/root", ROOT_DEV); + create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n)); return rd_load_image("/dev/root"); } -#ifdef BUILD_CRAMDISK - -/* - * gzip declarations - */ - -#define OF(args) args - -#ifndef memzero -#define memzero(s, n) memset ((s), 0, (n)) -#endif - -typedef unsigned char uch; -typedef unsigned short ush; -typedef unsigned long ulg; - -#define INBUFSIZ 4096 -#define WSIZE 0x8000 /* window size--must be a power of two, and */ - /* at least 32K for zip's deflate method */ - -static uch *inbuf; -static uch *window; - -static unsigned insize; /* valid bytes in inbuf */ -static unsigned inptr; /* index of next byte to be processed in inbuf */ -static unsigned outcnt; /* bytes in output buffer */ static int exit_code; -static int unzip_error; -static long bytes_out; +static int decompress_error; static int crd_infd, crd_outfd; -#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) - -/* Diagnostic functions (stubbed out) */ -#define Assert(cond,msg) -#define Trace(x) -#define Tracev(x) -#define Tracevv(x) -#define Tracec(c,x) -#define Tracecv(c,x) - -#define STATIC static -#define INIT __init - -static int __init fill_inbuf(void); -static void __init flush_window(void); -static void __init *malloc(size_t size); -static void __init free(void *where); -static void __init error(char *m); -static void __init gzip_mark(void **); -static void __init gzip_release(void **); - -#include "../lib/inflate.c" - -static void __init *malloc(size_t size) -{ - return kmalloc(size, GFP_KERNEL); -} - -static void __init free(void *where) -{ - kfree(where); -} - -static void __init gzip_mark(void **ptr) -{ -} - -static void __init gzip_release(void **ptr) +static int __init compr_fill(void *buf, unsigned int len) { + int r = sys_read(crd_infd, buf, len); + if (r < 0) + printk(KERN_ERR "RAMDISK: error while reading compressed data"); + else if (r == 0) + printk(KERN_ERR "RAMDISK: EOF while reading compressed data"); + return r; } - -/* =========================================================================== - * Fill the input buffer. This is called only when the buffer is empty - * and at least one byte is really needed. - * Returning -1 does not guarantee that gunzip() will ever return. - */ -static int __init fill_inbuf(void) +static int __init compr_flush(void *window, unsigned int outcnt) { - if (exit_code) return -1; - - insize = sys_read(crd_infd, inbuf, INBUFSIZ); - if (insize == 0) { - error("RAMDISK: ran out of compressed data"); + int written = sys_write(crd_outfd, window, outcnt); + if (written != outcnt) { + if (decompress_error == 0) + printk(KERN_ERR + "RAMDISK: incomplete write (%d != %d)\n", + written, outcnt); + decompress_error = 1; return -1; } - - inptr = 1; - - return inbuf[0]; -} - -/* =========================================================================== - * Write the output window window[0..outcnt-1] and update crc and bytes_out. - * (Used for the decompressed data only.) - */ -static void __init flush_window(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n, written; - uch *in, ch; - - written = sys_write(crd_outfd, window, outcnt); - if (written != outcnt && unzip_error == 0) { - printk(KERN_ERR "RAMDISK: incomplete write (%d != %d) %ld\n", - written, outcnt, bytes_out); - unzip_error = 1; - } - in = window; - for (n = 0; n < outcnt; n++) { - ch = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; + return outcnt; } static void __init error(char *x) { printk(KERN_ERR "%s\n", x); exit_code = 1; - unzip_error = 1; + decompress_error = 1; } -static int __init crd_load(int in_fd, int out_fd) +static int __init crd_load(int in_fd, int out_fd, decompress_fn deco) { int result; - - insize = 0; /* valid bytes in inbuf */ - inptr = 0; /* index of next byte to be processed in inbuf */ - outcnt = 0; /* bytes in output buffer */ - exit_code = 0; - bytes_out = 0; - crc = (ulg)0xffffffffL; /* shift register contents */ - crd_infd = in_fd; crd_outfd = out_fd; - inbuf = kmalloc(INBUFSIZ, GFP_KERNEL); - if (inbuf == 0) { - printk(KERN_ERR "RAMDISK: Couldn't allocate gzip buffer\n"); - return -1; - } - window = kmalloc(WSIZE, GFP_KERNEL); - if (window == 0) { - printk(KERN_ERR "RAMDISK: Couldn't allocate gzip window\n"); - kfree(inbuf); - return -1; + + if (!deco) { + pr_emerg("Invalid ramdisk decompression routine. " + "Select appropriate config option.\n"); + panic("Could not decompress initial ramdisk image."); } - makecrc(); - result = gunzip(); - if (unzip_error) + + result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error); + if (decompress_error) result = 1; - kfree(inbuf); - kfree(window); return result; } - -#endif /* BUILD_CRAMDISK */ diff --git a/init/init_task.c b/init/init_task.c new file mode 100644 index 00000000000..ba0a7f362d9 --- /dev/null +++ b/init/init_task.c @@ -0,0 +1,26 @@ +#include <linux/init_task.h> +#include <linux/export.h> +#include <linux/mqueue.h> +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/pgtable.h> +#include <asm/uaccess.h> + +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); + +/* Initial task structure */ +struct task_struct init_task = INIT_TASK(init_task); +EXPORT_SYMBOL(init_task); + +/* + * Initial thread structure. Alignment of this is handled by a special + * linker map entry. + */ +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/init/initramfs.c b/init/initramfs.c index 0c5d9a3f951..a8497fab1c3 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -1,3 +1,13 @@ +/* + * Many of the syscalls used in this file expect some of the arguments + * to be __user pointers not __kernel pointers. To limit the sparse + * noise, turn off sparse checking for this file. + */ +#ifdef __CHECKER__ +#undef __CHECKER__ +#warning "Sparse checking disabled for this file" +#endif + #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> @@ -5,7 +15,9 @@ #include <linux/fcntl.h> #include <linux/delay.h> #include <linux/string.h> +#include <linux/dirent.h> #include <linux/syscalls.h> +#include <linux/utime.h> static __initdata char *message; static void __init error(char *x) @@ -14,22 +26,15 @@ static void __init error(char *x) message = x; } -static void __init *malloc(size_t size) -{ - return kmalloc(size, GFP_KERNEL); -} - -static void __init free(void *where) -{ - kfree(where); -} - /* link hash */ +#define N_ALIGN(len) ((((len) + 1) & ~3) + 2) + static __initdata struct hash { int ino, minor, major; + umode_t mode; struct hash *next; - char *name; + char name[N_ALIGN(PATH_MAX)]; } *head[32]; static inline int hash(int major, int minor, int ino) @@ -39,7 +44,8 @@ static inline int hash(int major, int minor, int ino) return tmp & 31; } -static char __init *find_link(int major, int minor, int ino, char *name) +static char __init *find_link(int major, int minor, int ino, + umode_t mode, char *name) { struct hash **p, *q; for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { @@ -49,15 +55,18 @@ static char __init *find_link(int major, int minor, int ino, char *name) continue; if ((*p)->major != major) continue; + if (((*p)->mode ^ mode) & S_IFMT) + continue; return (*p)->name; } - q = (struct hash *)malloc(sizeof(struct hash)); + q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) panic("can't allocate link hash entry"); - q->ino = ino; - q->minor = minor; q->major = major; - q->name = name; + q->minor = minor; + q->ino = ino; + q->mode = mode; + strcpy(q->name, name); q->next = NULL; *p = q; return NULL; @@ -70,15 +79,58 @@ static void __init free_hash(void) while (*p) { q = *p; *p = q->next; - free(q); + kfree(q); } } } +static long __init do_utime(char *filename, time_t mtime) +{ + struct timespec t[2]; + + t[0].tv_sec = mtime; + t[0].tv_nsec = 0; + t[1].tv_sec = mtime; + t[1].tv_nsec = 0; + + return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW); +} + +static __initdata LIST_HEAD(dir_list); +struct dir_entry { + struct list_head list; + char *name; + time_t mtime; +}; + +static void __init dir_add(const char *name, time_t mtime) +{ + struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + if (!de) + panic("can't allocate dir_entry buffer"); + INIT_LIST_HEAD(&de->list); + de->name = kstrdup(name, GFP_KERNEL); + de->mtime = mtime; + list_add(&de->list, &dir_list); +} + +static void __init dir_utime(void) +{ + struct dir_entry *de, *tmp; + list_for_each_entry_safe(de, tmp, &dir_list, list) { + list_del(&de->list); + do_utime(de->name, de->mtime); + kfree(de->name); + kfree(de); + } +} + +static __initdata time_t mtime; + /* cpio header parsing */ static __initdata unsigned long ino, major, minor, nlink; -static __initdata mode_t mode; +static __initdata umode_t mode; static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; @@ -100,6 +152,7 @@ static void __init parse_header(char *s) uid = parsed[2]; gid = parsed[3]; nlink = parsed[4]; + mtime = parsed[5]; body_len = parsed[6]; major = parsed[7]; minor = parsed[8]; @@ -124,17 +177,14 @@ static __initdata char *victim; static __initdata unsigned count; static __initdata loff_t this_header, next_header; -static __initdata int dry_run; - -static inline void eat(unsigned n) +static inline void __init eat(unsigned n) { victim += n; this_header += n; count -= n; } -#define N_ALIGN(len) ((((len) + 1) & ~3) + 2) - +static __initdata char *vcollected; static __initdata char *collected; static __initdata int remains; static __initdata char *collect; @@ -177,6 +227,10 @@ static int __init do_collect(void) static int __init do_header(void) { + if (memcmp(collected, "070707", 6)==0) { + error("incorrect cpio method used: use -H newc option"); + return 1; + } if (memcmp(collected, "070701", 6)) { error("no cpio magic"); return 1; @@ -184,10 +238,6 @@ static int __init do_header(void) parse_header(collected); next_header = this_header + N_ALIGN(name_len) + body_len; next_header = (next_header + 3) & ~3; - if (dry_run) { - read_into(name_buf, N_ALIGN(name_len), GotName); - return 0; - } state = SkipIt; if (name_len <= 0 || name_len > PATH_MAX) return 0; @@ -229,13 +279,25 @@ static int __init do_reset(void) static int __init maybe_link(void) { if (nlink >= 2) { - char *old = find_link(major, minor, ino, collected); + char *old = find_link(major, minor, ino, mode, collected); if (old) return (sys_link(old, collected) < 0) ? -1 : 1; } return 0; } +static void __init clean_path(char *path, umode_t mode) +{ + struct stat st; + + if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { + if (S_ISDIR(st.st_mode)) + sys_rmdir(path); + else + sys_unlink(path); + } +} + static __initdata int wfd; static int __init do_name(void) @@ -246,14 +308,21 @@ static int __init do_name(void) free_hash(); return 0; } - if (dry_run) - return 0; + clean_path(collected, mode); if (S_ISREG(mode)) { - if (maybe_link() >= 0) { - wfd = sys_open(collected, O_WRONLY|O_CREAT, mode); + int ml = maybe_link(); + if (ml >= 0) { + int openflags = O_WRONLY|O_CREAT; + if (ml != 1) + openflags |= O_TRUNC; + wfd = sys_open(collected, openflags, mode); + if (wfd >= 0) { sys_fchown(wfd, uid, gid); sys_fchmod(wfd, mode); + if (body_len) + sys_ftruncate(wfd, body_len); + vcollected = kstrdup(collected, GFP_KERNEL); state = CopyFile; } } @@ -261,12 +330,14 @@ static int __init do_name(void) sys_mkdir(collected, mode); sys_chown(collected, uid, gid); sys_chmod(collected, mode); + dir_add(collected, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { sys_mknod(collected, mode, rdev); sys_chown(collected, uid, gid); sys_chmod(collected, mode); + do_utime(collected, mtime); } } return 0; @@ -277,6 +348,8 @@ static int __init do_copy(void) if (count >= body_len) { sys_write(wfd, victim, body_len); sys_close(wfd); + do_utime(vcollected, mtime); + kfree(vcollected); eat(body_len); state = SkipIt; return 0; @@ -291,8 +364,10 @@ static int __init do_copy(void) static int __init do_symlink(void) { collected[N_ALIGN(name_len) + body_len] = '\0'; + clean_path(collected, 0); sys_symlink(collected + N_ALIGN(name_len), collected); sys_lchown(collected, uid, gid); + do_utime(collected, mtime); state = SkipIt; next_state = Reset; return 0; @@ -319,11 +394,13 @@ static int __init write_buffer(char *buf, unsigned len) return len - count; } -static void __init flush_buffer(char *buf, unsigned len) +static int __init flush_buffer(void *bufv, unsigned len) { + char *buf = (char *) bufv; int written; + int origLen = len; if (message) - return; + return -1; while ((written = write_buffer(buf, len)) < len && !message) { char c = buf[written]; if (c == '0') { @@ -337,92 +414,27 @@ static void __init flush_buffer(char *buf, unsigned len) } else error("junk in compressed archive"); } + return origLen; } -/* - * gzip declarations - */ - -#define OF(args) args - -#ifndef memzero -#define memzero(s, n) memset ((s), 0, (n)) -#endif - -typedef unsigned char uch; -typedef unsigned short ush; -typedef unsigned long ulg; - -#define WSIZE 0x8000 /* window size--must be a power of two, and */ - /* at least 32K for zip's deflate method */ +static unsigned my_inptr; /* index of next byte to be processed in inbuf */ -static uch *inbuf; -static uch *window; +#include <linux/decompress/generic.h> -static unsigned insize; /* valid bytes in inbuf */ -static unsigned inptr; /* index of next byte to be processed in inbuf */ -static unsigned outcnt; /* bytes in output buffer */ -static long bytes_out; - -#define get_byte() (inptr < insize ? inbuf[inptr++] : -1) - -/* Diagnostic functions (stubbed out) */ -#define Assert(cond,msg) -#define Trace(x) -#define Tracev(x) -#define Tracevv(x) -#define Tracec(c,x) -#define Tracecv(c,x) - -#define STATIC static -#define INIT __init - -static void __init flush_window(void); -static void __init error(char *m); -static void __init gzip_mark(void **); -static void __init gzip_release(void **); - -#include "../lib/inflate.c" - -static void __init gzip_mark(void **ptr) +static char * __init unpack_to_rootfs(char *buf, unsigned len) { -} + int written, res; + decompress_fn decompress; + const char *compress_name; + static __initdata char msg_buf[64]; -static void __init gzip_release(void **ptr) -{ -} - -/* =========================================================================== - * Write the output window window[0..outcnt-1] and update crc and bytes_out. - * (Used for the decompressed data only.) - */ -static void __init flush_window(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - - flush_buffer(window, outcnt); - in = window; - for (n = 0; n < outcnt; n++) { - ch = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} + header_buf = kmalloc(110, GFP_KERNEL); + symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); + name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); -static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) -{ - int written; - dry_run = check_only; - header_buf = malloc(110); - symlink_buf = malloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1); - name_buf = malloc(N_ALIGN(PATH_MAX)); - window = malloc(WSIZE); - if (!window || !header_buf || !symlink_buf || !name_buf) + if (!header_buf || !symlink_buf || !name_buf) panic("can't allocate buffers"); + state = Start; this_header = 0; message = NULL; @@ -442,67 +454,175 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) continue; } this_header = 0; - insize = len; - inbuf = buf; - inptr = 0; - outcnt = 0; /* bytes in output buffer */ - bytes_out = 0; - crc = (ulg)0xffffffffL; /* shift register contents */ - makecrc(); - gunzip(); + decompress = decompress_method(buf, len, &compress_name); + pr_debug("Detected %s compressed data\n", compress_name); + if (decompress) { + res = decompress(buf, len, NULL, flush_buffer, NULL, + &my_inptr, error); + if (res) + error("decompressor failed"); + } else if (compress_name) { + if (!message) { + snprintf(msg_buf, sizeof msg_buf, + "compression method %s not configured", + compress_name); + message = msg_buf; + } + } else + error("junk in compressed archive"); if (state != Reset) - error("junk in gzipped archive"); - this_header = saved_offset + inptr; - buf += inptr; - len -= inptr; + error("junk in compressed archive"); + this_header = saved_offset + my_inptr; + buf += my_inptr; + len -= my_inptr; } - free(window); - free(name_buf); - free(symlink_buf); - free(header_buf); + dir_utime(); + kfree(name_buf); + kfree(symlink_buf); + kfree(header_buf); return message; } -extern char __initramfs_start[], __initramfs_end[]; -#ifdef CONFIG_BLK_DEV_INITRD +static int __initdata do_retain_initrd; + +static int __init retain_initrd_param(char *str) +{ + if (*str) + return 0; + do_retain_initrd = 1; + return 1; +} +__setup("retain_initrd", retain_initrd_param); + +extern char __initramfs_start[]; +extern unsigned long __initramfs_size; #include <linux/initrd.h> +#include <linux/kexec.h> static void __init free_initrd(void) { - free_initrd_mem(initrd_start, initrd_end); +#ifdef CONFIG_KEXEC + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); +#endif + if (do_retain_initrd) + goto skip; + +#ifdef CONFIG_KEXEC + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (initrd_start < crashk_end && initrd_end > crashk_start) { + /* + * Initialize initrd memory region since the kexec boot does + * not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + } else +#endif + free_initrd_mem(initrd_start, initrd_end); +skip: initrd_start = 0; initrd_end = 0; } +#ifdef CONFIG_BLK_DEV_RAM +#define BUF_SIZE 1024 +static void __init clean_rootfs(void) +{ + int fd; + void *buf; + struct linux_dirent64 *dirp; + int num; + + fd = sys_open("/", O_RDONLY, 0); + WARN_ON(fd < 0); + if (fd < 0) + return; + buf = kzalloc(BUF_SIZE, GFP_KERNEL); + WARN_ON(!buf); + if (!buf) { + sys_close(fd); + return; + } + + dirp = buf; + num = sys_getdents64(fd, dirp, BUF_SIZE); + while (num > 0) { + while (num > 0) { + struct stat st; + int ret; + + ret = sys_newlstat(dirp->d_name, &st); + WARN_ON_ONCE(ret); + if (!ret) { + if (S_ISDIR(st.st_mode)) + sys_rmdir(dirp->d_name); + else + sys_unlink(dirp->d_name); + } + + num -= dirp->d_reclen; + dirp = (void *)dirp + dirp->d_reclen; + } + dirp = buf; + memset(buf, 0, BUF_SIZE); + num = sys_getdents64(fd, dirp, BUF_SIZE); + } + + sys_close(fd); + kfree(buf); +} #endif -void __init populate_rootfs(void) +static int __init populate_rootfs(void) { - char *err = unpack_to_rootfs(__initramfs_start, - __initramfs_end - __initramfs_start, 0); + char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) - panic(err); -#ifdef CONFIG_BLK_DEV_INITRD + panic("%s", err); /* Failed to decompress INTERNAL initramfs */ if (initrd_start) { +#ifdef CONFIG_BLK_DEV_RAM int fd; - printk(KERN_INFO "checking if image is initramfs..."); + printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start, 1); + initrd_end - initrd_start); if (!err) { - printk(" it is\n"); - unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start, 0); free_initrd(); - return; + goto done; + } else { + clean_rootfs(); + unpack_to_rootfs(__initramfs_start, __initramfs_size); } - printk("it isn't (%s); looks like an initrd\n", err); - fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 700); + printk(KERN_INFO "rootfs image is not initramfs (%s)" + "; looks like an initrd\n", err); + fd = sys_open("/initrd.image", + O_WRONLY|O_CREAT, 0700); if (fd >= 0) { sys_write(fd, (char *)initrd_start, initrd_end - initrd_start); sys_close(fd); free_initrd(); } - } + done: +#else + printk(KERN_INFO "Unpacking initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, + initrd_end - initrd_start); + if (err) + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); + free_initrd(); #endif + /* + * Try loading default modules from initramfs. This gives + * us a chance to load before device_initcalls. + */ + load_default_modules(); + } + return 0; } +rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c index f142d403534..e8ae1fef090 100644 --- a/init/main.c +++ b/init/main.c @@ -9,32 +9,30 @@ * Simplified starting of init: Michael A. Griffith <grif@acm.org> */ -#define __KERNEL_SYSCALLS__ +#define DEBUG /* Enable initcall_debug */ -#include <linux/config.h> #include <linux/types.h> #include <linux/module.h> #include <linux/proc_fs.h> -#include <linux/devfs_fs_kernel.h> #include <linux/kernel.h> #include <linux/syscalls.h> +#include <linux/stackprotector.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/delay.h> -#include <linux/utsname.h> #include <linux/ioport.h> #include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/initrd.h> -#include <linux/hdreg.h> #include <linux/bootmem.h> +#include <linux/acpi.h> #include <linux/tty.h> -#include <linux/gfp.h> #include <linux/percpu.h> #include <linux/kmod.h> +#include <linux/vmalloc.h> #include <linux/kernel_stat.h> +#include <linux/start_kernel.h> #include <linux/security.h> -#include <linux/workqueue.h> +#include <linux/smp.h> #include <linux/profile.h> #include <linux/rcupdate.h> #include <linux/moduleparam.h> @@ -42,70 +40,74 @@ #include <linux/writeback.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/cgroup.h> #include <linux/efi.h> +#include <linux/tick.h> +#include <linux/interrupt.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> #include <linux/unistd.h> #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> -#include <net/sock.h> +#include <linux/buffer_head.h> +#include <linux/page_cgroup.h> +#include <linux/debug_locks.h> +#include <linux/debugobjects.h> +#include <linux/lockdep.h> +#include <linux/kmemleak.h> +#include <linux/pid_namespace.h> +#include <linux/device.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/signal.h> +#include <linux/idr.h> +#include <linux/kgdb.h> +#include <linux/ftrace.h> +#include <linux/async.h> +#include <linux/kmemcheck.h> +#include <linux/sfi.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <linux/file.h> +#include <linux/ptrace.h> +#include <linux/blkdev.h> +#include <linux/elevator.h> +#include <linux/sched_clock.h> +#include <linux/context_tracking.h> +#include <linux/random.h> +#include <linux/list.h> #include <asm/io.h> #include <asm/bugs.h> #include <asm/setup.h> #include <asm/sections.h> - -/* - * This is one of the first .c files built. Error out early - * if we have compiler trouble.. - */ -#if __GNUC__ == 2 && __GNUC_MINOR__ == 96 -#ifdef CONFIG_FRAME_POINTER -#error This compiler cannot compile correctly with frame pointers enabled -#endif -#endif +#include <asm/cacheflush.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> #endif -/* - * Versions of gcc older than that listed below may actually compile - * and link okay, but the end product can have subtle run time bugs. - * To avoid associated bogus bug reports, we flatly refuse to compile - * with a gcc that is known to be too old from the very beginning. - */ -#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95) -#error Sorry, your GCC is too old. It builds incorrect kernels. -#endif - -static int init(void *); +static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(unsigned long); -extern void mca_init(void); -extern void sbus_init(void); -extern void sysctl_init(void); -extern void signals_init(void); -extern void buffer_init(void); -extern void pidhash_init(void); -extern void pidmap_init(void); -extern void prio_tree_init(void); extern void radix_tree_init(void); -extern void free_initmem(void); -extern void populate_rootfs(void); -extern void driver_init(void); -extern void prepare_namespace(void); -#ifdef CONFIG_ACPI -extern void acpi_early_init(void); -#else -static inline void acpi_early_init(void) { } +#ifndef CONFIG_DEBUG_RODATA +static inline void mark_rodata_ro(void) { } #endif -#ifdef CONFIG_TC -extern void tc_init(void); -#endif +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; -enum system_states system_state; +enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); /* @@ -116,73 +118,80 @@ EXPORT_SYMBOL(system_state); extern void time_init(void); /* Default late time init is NULL. archs can override this later. */ -void (*late_time_init)(void); -extern void softirq_init(void); +void (*__initdata late_time_init)(void); -/* Untouched command line (eg. for /proc) saved by arch-specific code. */ -char saved_command_line[COMMAND_LINE_SIZE]; +/* Untouched command line saved by arch-specific code. */ +char __initdata boot_command_line[COMMAND_LINE_SIZE]; +/* Untouched saved command line (eg. for /proc) */ +char *saved_command_line; +/* Command line for parameter parsing */ +static char *static_command_line; +/* Command line for per-initcall parameter parsing */ +static char *initcall_command_line; static char *execute_command; static char *ramdisk_execute_command; -/* Setup configured maximum number of CPUs to activate */ -static unsigned int max_cpus = NR_CPUS; +/* + * Used to generate warnings if static_key manipulation functions are used + * before jump_label_init is called. + */ +bool static_key_initialized __read_mostly = false; +EXPORT_SYMBOL_GPL(static_key_initialized); /* - * Setup routine for controlling SMP activation - * - * Command-line option of "nosmp" or "maxcpus=0" will disable SMP - * activation entirely (the MPS table probe still happens, though). + * If set, this is an indication to the drivers that reset the underlying + * device before going ahead with the initialization otherwise driver might + * rely on the BIOS and skip the reset operation. * - * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer - * greater than 0, limits the maximum number of CPUs activated in - * SMP mode to <NUM>. + * This is useful if kernel is booting in an unreliable environment. + * For ex. kdump situaiton where previous kernel has crashed, BIOS has been + * skipped and devices will be in unknown state. */ -static int __init nosmp(char *str) -{ - max_cpus = 0; - return 1; -} - -__setup("nosmp", nosmp); +unsigned int reset_devices; +EXPORT_SYMBOL(reset_devices); -static int __init maxcpus(char *str) +static int __init set_reset_devices(char *str) { - get_option(&str, &max_cpus); + reset_devices = 1; return 1; } -__setup("maxcpus=", maxcpus); +__setup("reset_devices", set_reset_devices); -static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; -char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; -extern struct obs_kernel_param __setup_start[], __setup_end[]; +extern const struct obs_kernel_param __setup_start[], __setup_end[]; static int __init obsolete_checksetup(char *line) { - struct obs_kernel_param *p; + const struct obs_kernel_param *p; + int had_early_param = 0; p = __setup_start; do { int n = strlen(p->str); - if (!strncmp(line, p->str, n)) { + if (parameqn(line, p->str, n)) { if (p->early) { - /* Already done in parse_early_param? (Needs - * exact match on param part) */ + /* Already done in parse_early_param? + * (Needs exact match on param part). + * Keep iterating, as we can have early + * params and __setups of same names 8( */ if (line[n] == '\0' || line[n] == '=') - return 1; + had_early_param = 1; } else if (!p->setup_func) { - printk(KERN_WARNING "Parameter %s is obsolete," - " ignored\n", p->str); + pr_warn("Parameter %s is obsolete, ignored\n", + p->str); return 1; } else if (p->setup_func(line + n)) return 1; } p++; } while (p < __setup_end); - return 0; + + return had_early_param; } /* @@ -195,38 +204,41 @@ EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) { - if (*str) - return 0; - console_loglevel = 10; - return 1; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; + return 0; } static int __init quiet_kernel(char *str) { - if (*str) - return 0; - console_loglevel = 4; - return 1; + console_loglevel = CONSOLE_LOGLEVEL_QUIET; + return 0; } -__setup("debug", debug_kernel); -__setup("quiet", quiet_kernel); +early_param("debug", debug_kernel); +early_param("quiet", quiet_kernel); static int __init loglevel(char *str) { - get_option(&str, &console_loglevel); - return 1; + int newlevel; + + /* + * Only update loglevel value when a correct setting was passed, + * to prevent blind crashes (when loglevel being set to 0) that + * are quite hard to debug + */ + if (get_option(&str, &newlevel)) { + console_loglevel = newlevel; + return 0; + } + + return -EINVAL; } -__setup("loglevel=", loglevel); +early_param("loglevel", loglevel); -/* - * Unknown boot options get handed to init, unless they look like - * failed parameters - */ -static int __init unknown_bootoption(char *param, char *val) +/* Change NUL term back to "=", to make "param" the whole string. */ +static int __init repair_env_string(char *param, char *val, const char *unused) { - /* Change NUL term back to "=", to make "param" the whole string. */ if (val) { /* param=val or param="val"? */ if (val == param+strlen(param)+1) @@ -238,19 +250,45 @@ static int __init unknown_bootoption(char *param, char *val) } else BUG(); } + return 0; +} + +/* Anything after -- gets handed straight to init. */ +static int __init set_init_arg(char *param, char *val, const char *unused) +{ + unsigned int i; + + if (panic_later) + return 0; + + repair_env_string(param, val, unused); + + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + return 0; + } + } + argv_init[i] = param; + return 0; +} + +/* + * Unknown boot options get handed to init, unless they look like + * unused parameters (modprobe will find them in /proc/cmdline). + */ +static int __init unknown_bootoption(char *param, char *val, const char *unused) +{ + repair_env_string(param, val, unused); /* Handle obsolete-style parameters */ if (obsolete_checksetup(param)) return 0; - /* - * Preemptive maintenance for "why didn't my mispelled command - * line work?" - */ - if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { - printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); + /* Unused module parameter. */ + if (strchr(param, '.') && (!val || strchr(param, '.') < val)) return 0; - } if (panic_later) return 0; @@ -260,7 +298,7 @@ static int __init unknown_bootoption(char *param, char *val) unsigned int i; for (i = 0; envp_init[i]; i++) { if (i == MAX_INIT_ENVS) { - panic_later = "Too many boot env vars at `%s'"; + panic_later = "env"; panic_param = param; } if (!strncmp(param, envp_init[i], val - param)) @@ -272,7 +310,7 @@ static int __init unknown_bootoption(char *param, char *val) unsigned int i; for (i = 0; argv_init[i]; i++) { if (i == MAX_INIT_ARGS) { - panic_later = "Too many boot init vars at `%s'"; + panic_later = "init"; panic_param = param; } } @@ -310,10 +348,8 @@ static int __init rdinit_setup(char *str) } __setup("rdinit=", rdinit_setup); -extern void setup_arch(char **); - #ifndef CONFIG_SMP - +static const unsigned int setup_max_cpus = NR_CPUS; #ifdef CONFIG_X86_LOCAL_APIC static void __init smp_init(void) { @@ -323,62 +359,27 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } +static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } - -#else - -#ifdef __GENERIC_PER_CPU -unsigned long __per_cpu_offset[NR_CPUS]; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; #endif - ptr = alloc_bootmem(size * NR_CPUS); - - for (i = 0; i < NR_CPUS; i++, ptr += size) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } -} -#endif /* !__GENERIC_PER_CPU */ - -/* Called by boot processor to activate the rest. */ -static void __init smp_init(void) +/* + * We need to store the untouched command line for future reference. + * We also need to store the touched command line since the parameter + * parsing is performed in place, and we should allow a component to + * store reference of name/value for future reference. + */ +static void __init setup_command_line(char *command_line) { - unsigned int i; - - /* FIXME: This should be done in userspace --RR */ - for_each_present_cpu(i) { - if (num_online_cpus() >= max_cpus) - break; - if (!cpu_online(i)) - cpu_up(i); - } - - /* Any cleanup work */ - printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); - smp_cpus_done(max_cpus); -#if 0 - /* Get other processors into their bootup holding patterns. */ - - smp_commence(); -#endif + saved_command_line = + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + initcall_command_line = + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); + strcpy (saved_command_line, boot_command_line); + strcpy (static_command_line, command_line); } -#endif - /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to @@ -388,39 +389,59 @@ static void __init smp_init(void) * gcc-3.4 accidentally inlines this function, so use noinline. */ -static void noinline rest_init(void) - __releases(kernel_lock) +static __initdata DECLARE_COMPLETION(kthreadd_done); + +static noinline void __init_refok rest_init(void) { - kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); + int pid; + + rcu_scheduler_starting(); + /* + * We need to spawn init first so that it obtains pid 1, however + * the init task will end up wanting to create kthreads, which, if + * we schedule it before we create kthreadd, will OOPS. + */ + kernel_thread(kernel_init, NULL, CLONE_FS); numa_default_policy(); - unlock_kernel(); - preempt_enable_no_resched(); + pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + rcu_read_lock(); + kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); + rcu_read_unlock(); + complete(&kthreadd_done); /* * The boot idle thread must execute schedule() - * at least one to get things moving: + * at least once to get things moving: */ - schedule(); - - cpu_idle(); -} + init_idle_bootup_task(current); + schedule_preempt_disabled(); + /* Call into cpu_idle with preempt disabled */ + cpu_startup_entry(CPUHP_ONLINE); +} /* Check for early params. */ -static int __init do_early_param(char *param, char *val) +static int __init do_early_param(char *param, char *val, const char *unused) { - struct obs_kernel_param *p; + const struct obs_kernel_param *p; for (p = __setup_start; p < __setup_end; p++) { - if (p->early && strcmp(param, p->str) == 0) { + if ((p->early && parameq(param, p->str)) || + (strcmp(param, "console") == 0 && + strcmp(p->str, "earlycon") == 0) + ) { if (p->setup_func(val) != 0) - printk(KERN_WARNING - "Malformed early option '%s'\n", param); + pr_warn("Malformed early option '%s'\n", param); } } /* We accept everything at this stage. */ return 0; } +void __init parse_early_options(char *cmdline) +{ + parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); +} + /* Arch code calls this early on, or if not, just before other parsing. */ void __init parse_early_param(void) { @@ -431,8 +452,8 @@ void __init parse_early_param(void) return; /* All fall through to do_early_param. */ - strlcpy(tmp_cmdline, saved_command_line, COMMAND_LINE_SIZE); - parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_early_options(tmp_cmdline); done = 1; } @@ -440,26 +461,105 @@ void __init parse_early_param(void) * Activate the first processor. */ -asmlinkage void __init start_kernel(void) +static void __init boot_cpu_init(void) +{ + int cpu = smp_processor_id(); + /* Mark the boot cpu "present", "online" etc for SMP and UP case */ + set_cpu_online(cpu, true); + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); +} + +void __init __weak smp_setup_processor_id(void) +{ +} + +# if THREAD_SIZE >= PAGE_SIZE +void __init __weak thread_info_cache_init(void) +{ +} +#endif + +/* + * Set up kernel memory allocators + */ +static void __init mm_init(void) +{ + /* + * page_cgroup requires contiguous pages, + * bigger than MAX_ORDER unless SPARSEMEM. + */ + page_cgroup_init_flatmem(); + mem_init(); + kmem_cache_init(); + percpu_init_late(); + pgtable_init(); + vmalloc_init(); +} + +asmlinkage __visible void __init start_kernel(void) { - char * command_line; - extern struct kernel_param __start___param[], __stop___param[]; + char * command_line, *after_dashes; + extern const struct kernel_param __start___param[], __stop___param[]; + + /* + * Need to run as early as possible, to initialize the + * lockdep hash: + */ + lockdep_init(); + smp_setup_processor_id(); + debug_objects_early_init(); + + /* + * Set up the the initial canary ASAP: + */ + boot_init_stack_canary(); + + cgroup_init_early(); + + local_irq_disable(); + early_boot_irqs_disabled = true; + /* * Interrupts are still disabled. Do necessary setups, then * enable them */ - lock_kernel(); + boot_cpu_init(); page_address_init(); - printk(KERN_NOTICE); - printk(linux_banner); + pr_notice("%s", linux_banner); setup_arch(&command_line); + mm_init_cpumask(&init_mm); + setup_command_line(command_line); + setup_nr_cpu_ids(); setup_per_cpu_areas(); + smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + + build_all_zonelists(NULL, NULL); + page_alloc_init(); + + pr_notice("Kernel command line: %s\n", boot_command_line); + parse_early_param(); + after_dashes = parse_args("Booting kernel", + static_command_line, __start___param, + __stop___param - __start___param, + -1, -1, &unknown_bootoption); + if (after_dashes) + parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, + set_init_arg); + + jump_label_init(); /* - * Mark the boot cpu "online" so that it can call console drivers in - * printk() and can access its per-cpu storage. + * These use large bootmem allocations and must precede + * kmem_cache_init() */ - smp_prepare_boot_cpu(); + setup_log_buf(0); + pidhash_init(); + vfs_caches_init_early(); + sort_main_extable(); + trap_init(); + mm_init(); /* * Set up the scheduler prior starting any interrupts (such as the @@ -472,21 +572,31 @@ asmlinkage void __init start_kernel(void) * fragile until we cpu_idle() for the first time. */ preempt_disable(); - build_all_zonelists(); - page_alloc_init(); - printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line); - parse_early_param(); - parse_args("Booting kernel", command_line, __start___param, - __stop___param - __start___param, - &unknown_bootoption); - sort_main_extable(); - trap_init(); + if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) + local_irq_disable(); + idr_init_cache(); rcu_init(); + tick_nohz_init(); + context_tracking_init(); + radix_tree_init(); + /* init some links before init_ISA_irqs() */ + early_irq_init(); init_IRQ(); - pidhash_init(); + tick_init(); init_timers(); + hrtimers_init(); softirq_init(); + timekeeping_init(); time_init(); + sched_clock_postinit(); + perf_event_init(); + profile_init(); + call_function_init(); + WARN(!irqs_disabled(), "Interrupts were enabled early\n"); + early_boot_irqs_disabled = false; + local_irq_enable(); + + kmem_cache_init_late(); /* * HACK ALERT! This is early. We're enabling the console before @@ -495,103 +605,263 @@ asmlinkage void __init start_kernel(void) */ console_init(); if (panic_later) - panic(panic_later, panic_param); - profile_init(); - local_irq_enable(); + panic("Too many boot %s vars at `%s'", panic_later, + panic_param); + + lockdep_info(); + + /* + * Need to run this when irqs are enabled, because it wants + * to self-test [hard/soft]-irqs on/off lock inversion bugs + * too: + */ + locking_selftest(); + #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && - initrd_start < min_low_pfn << PAGE_SHIFT) { - printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " - "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); + page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { + pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", + page_to_pfn(virt_to_page((void *)initrd_start)), + min_low_pfn); initrd_start = 0; } #endif - vfs_caches_init_early(); - mem_init(); - kmem_cache_init(); + page_cgroup_init(); + debug_objects_mem_init(); + kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) late_time_init(); + sched_clock_init(); calibrate_delay(); pidmap_init(); - pgtable_cache_init(); - prio_tree_init(); anon_vma_init(); + acpi_early_init(); #ifdef CONFIG_X86 - if (efi_enabled) + if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); #endif - fork_init(num_physpages); +#ifdef CONFIG_X86_ESPFIX64 + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); +#endif + thread_info_cache_init(); + cred_init(); + fork_init(totalram_pages); proc_caches_init(); buffer_init(); - unnamed_dev_init(); key_init(); security_init(); - vfs_caches_init(num_physpages); - radix_tree_init(); + dbg_late_init(); + vfs_caches_init(totalram_pages); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); -#ifdef CONFIG_PROC_FS proc_root_init(); -#endif + cgroup_init(); cpuset_init(); + taskstats_init_early(); + delayacct_init(); check_bugs(); - acpi_early_init(); /* before LAPIC and SMP init */ + sfi_init_late(); + + if (efi_enabled(EFI_RUNTIME_SERVICES)) { + efi_late_init(); + efi_free_boot_services(); + } + + ftrace_init(); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } -static int __initdata initcall_debug; +/* Call all constructor functions linked into the kernel. */ +static void __init do_ctors(void) +{ +#ifdef CONFIG_CONSTRUCTORS + ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; + + for (; fn < (ctor_fn_t *) __ctors_end; fn++) + (*fn)(); +#endif +} + +bool initcall_debug; +core_param(initcall_debug, initcall_debug, bool, 0644); + +#ifdef CONFIG_KALLSYMS +struct blacklist_entry { + struct list_head next; + char *buf; +}; + +static __initdata_or_module LIST_HEAD(blacklisted_initcalls); -static int __init initcall_debug_setup(char *str) +static int __init initcall_blacklist(char *str) { - initcall_debug = 1; - return 1; + char *str_entry; + struct blacklist_entry *entry; + + /* str argument is a comma-separated list of functions */ + do { + str_entry = strsep(&str, ","); + if (str_entry) { + pr_debug("blacklisting initcall %s\n", str_entry); + entry = alloc_bootmem(sizeof(*entry)); + entry->buf = alloc_bootmem(strlen(str_entry) + 1); + strcpy(entry->buf, str_entry); + list_add(&entry->next, &blacklisted_initcalls); + } + } while (str_entry); + + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + struct list_head *tmp; + struct blacklist_entry *entry; + char *fn_name; + + fn_name = kasprintf(GFP_KERNEL, "%pf", fn); + if (!fn_name) + return false; + + list_for_each(tmp, &blacklisted_initcalls) { + entry = list_entry(tmp, struct blacklist_entry, next); + if (!strcmp(fn_name, entry->buf)) { + pr_debug("initcall %s blacklisted\n", fn_name); + kfree(fn_name); + return true; + } + } + + kfree(fn_name); + return false; +} +#else +static int __init initcall_blacklist(char *str) +{ + pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); + return 0; } -__setup("initcall_debug", initcall_debug_setup); -struct task_struct *child_reaper = &init_task; +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + return false; +} +#endif +__setup("initcall_blacklist=", initcall_blacklist); -extern initcall_t __initcall_start[], __initcall_end[]; +static int __init_or_module do_one_initcall_debug(initcall_t fn) +{ + ktime_t calltime, delta, rettime; + unsigned long long duration; + int ret; + + printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + calltime = ktime_get(); + ret = fn(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", + fn, ret, duration); + + return ret; +} -static void __init do_initcalls(void) +int __init_or_module do_one_initcall(initcall_t fn) { - initcall_t *call; int count = preempt_count(); + int ret; + char msgbuf[64]; - for (call = __initcall_start; call < __initcall_end; call++) { - char *msg; + if (initcall_blacklisted(fn)) + return -EPERM; - if (initcall_debug) { - printk(KERN_DEBUG "Calling initcall 0x%p", *call); - print_fn_descriptor_symbol(": %s()", (unsigned long) *call); - printk("\n"); - } + if (initcall_debug) + ret = do_one_initcall_debug(fn); + else + ret = fn(); - (*call)(); + msgbuf[0] = 0; - msg = NULL; - if (preempt_count() != count) { - msg = "preemption imbalance"; - preempt_count() = count; - } - if (irqs_disabled()) { - msg = "disabled interrupts"; - local_irq_enable(); - } - if (msg) { - printk(KERN_WARNING "error in initcall at 0x%p: " - "returned with %s\n", *call, msg); - } + if (preempt_count() != count) { + sprintf(msgbuf, "preemption imbalance "); + preempt_count_set(count); + } + if (irqs_disabled()) { + strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); + local_irq_enable(); } + WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + + return ret; +} + + +extern initcall_t __initcall_start[]; +extern initcall_t __initcall0_start[]; +extern initcall_t __initcall1_start[]; +extern initcall_t __initcall2_start[]; +extern initcall_t __initcall3_start[]; +extern initcall_t __initcall4_start[]; +extern initcall_t __initcall5_start[]; +extern initcall_t __initcall6_start[]; +extern initcall_t __initcall7_start[]; +extern initcall_t __initcall_end[]; + +static initcall_t *initcall_levels[] __initdata = { + __initcall0_start, + __initcall1_start, + __initcall2_start, + __initcall3_start, + __initcall4_start, + __initcall5_start, + __initcall6_start, + __initcall7_start, + __initcall_end, +}; + +/* Keep these in sync with initcalls in include/linux/init.h */ +static char *initcall_level_names[] __initdata = { + "early", + "core", + "postcore", + "arch", + "subsys", + "fs", + "device", + "late", +}; + +static void __init do_initcall_level(int level) +{ + extern const struct kernel_param __start___param[], __stop___param[]; + initcall_t *fn; + + strcpy(initcall_command_line, saved_command_line); + parse_args(initcall_level_names[level], + initcall_command_line, __start___param, + __stop___param - __start___param, + level, level, + &repair_env_string); - /* Make sure there is no pending stuff from the initcall sequence */ - flush_scheduled_work(); + for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) + do_one_initcall(*fn); +} + +static void __init do_initcalls(void) +{ + int level; + + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) + do_initcall_level(level); } /* @@ -603,93 +873,142 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { - /* drivers will send hotplug events */ - init_workqueues(); + cpuset_init_smp(); usermodehelper_init(); + shmem_init(); driver_init(); - -#ifdef CONFIG_SYSCTL - sysctl_init(); -#endif - - /* Networking initialization needs a process context */ - sock_init(); - + init_irq_proc(); + do_ctors(); + usermodehelper_enable(); do_initcalls(); + random_int_secret_init(); } -static void do_pre_smp_initcalls(void) +static void __init do_pre_smp_initcalls(void) { - extern int spawn_ksoftirqd(void); -#ifdef CONFIG_SMP - extern int migration_init(void); + initcall_t *fn; - migration_init(); -#endif - spawn_ksoftirqd(); - spawn_softlockup_task(); + for (fn = __initcall_start; fn < __initcall0_start; fn++) + do_one_initcall(*fn); } -static void run_init_process(char *init_filename) +/* + * This function requests modules which should be loaded by default and is + * called twice right after initrd is mounted and right before init is + * exec'd. If such modules are on either initrd or rootfs, they will be + * loaded before control is passed to userland. + */ +void __init load_default_modules(void) +{ + load_default_elevator_module(); +} + +static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; - execve(init_filename, argv_init, envp_init); + return do_execve(getname_kernel(init_filename), + (const char __user *const __user *)argv_init, + (const char __user *const __user *)envp_init); } -static inline void fixup_cpu_present_map(void) +static int try_to_run_init_process(const char *init_filename) { -#ifdef CONFIG_SMP - int i; + int ret; + + ret = run_init_process(init_filename); + + if (ret && ret != -ENOENT) { + pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", + init_filename, ret); + } + + return ret; +} + +static noinline void __init kernel_init_freeable(void); + +static int __ref kernel_init(void *unused) +{ + int ret; + + kernel_init_freeable(); + /* need to finish all async __init code before freeing the memory */ + async_synchronize_full(); + free_initmem(); + mark_rodata_ro(); + system_state = SYSTEM_RUNNING; + numa_default_policy(); + + flush_delayed_fput(); + + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) + return 0; + pr_err("Failed to execute %s (error %d)\n", + ramdisk_execute_command, ret); + } /* - * If arch is not hotplug ready and did not populate - * cpu_present_map, just make cpu_present_map same as cpu_possible_map - * for other cpu bringup code to function as normal. e.g smp_init() etc. + * We try each of these until one succeeds. + * + * The Bourne shell can be used instead of init if we are + * trying to recover a really broken machine. */ - if (cpus_empty(cpu_present_map)) { - for_each_cpu(i) { - cpu_set(i, cpu_present_map); - } + if (execute_command) { + ret = run_init_process(execute_command); + if (!ret) + return 0; + pr_err("Failed to execute %s (error %d). Attempting defaults...\n", + execute_command, ret); } -#endif + if (!try_to_run_init_process("/sbin/init") || + !try_to_run_init_process("/etc/init") || + !try_to_run_init_process("/bin/init") || + !try_to_run_init_process("/bin/sh")) + return 0; + + panic("No working init found. Try passing init= option to kernel. " + "See Linux Documentation/init.txt for guidance."); } -static int init(void * unused) +static noinline void __init kernel_init_freeable(void) { - lock_kernel(); /* - * init can run on any cpu. + * Wait until kthreadd is all set-up. + */ + wait_for_completion(&kthreadd_done); + + /* Now the scheduler is fully set up and can do blocking allocations */ + gfp_allowed_mask = __GFP_BITS_MASK; + + /* + * init can allocate pages on any node */ - set_cpus_allowed(current, CPU_MASK_ALL); + set_mems_allowed(node_states[N_MEMORY]); /* - * Tell the world that we're going to be the grim - * reaper of innocent orphaned children. - * - * We don't want people to have to make incorrect - * assumptions about where in the task array this - * can be found. + * init can run on any cpu. */ - child_reaper = current; + set_cpus_allowed_ptr(current, cpu_all_mask); - /* Sets up cpus_possible() */ - smp_prepare_cpus(max_cpus); + cad_pid = task_pid(current); + + smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); + lockup_detector_init(); - fixup_cpu_present_map(); smp_init(); sched_init_smp(); - cpuset_init_smp(); - - /* - * Do this before initcalls, because some drivers want to access - * firmware files. - */ - populate_rootfs(); - do_basic_setup(); + /* Open the /dev/console on the rootfs, this should never fail */ + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + pr_err("Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); /* * check if there is an early userspace init. If yes, let it do all * the work @@ -708,38 +1027,7 @@ static int init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ - free_initmem(); - unlock_kernel(); - system_state = SYSTEM_RUNNING; - numa_default_policy(); - - if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) - printk(KERN_WARNING "Warning: unable to open an initial console.\n"); - - (void) sys_dup(0); - (void) sys_dup(0); - - if (ramdisk_execute_command) { - run_init_process(ramdisk_execute_command); - printk(KERN_WARNING "Failed to execute %s\n", - ramdisk_execute_command); - } - - /* - * We try each of these until one succeeds. - * - * The Bourne shell can be used instead of init if we are - * trying to recover a really broken machine. - */ - if (execute_command) { - run_init_process(execute_command); - printk(KERN_WARNING "Failed to execute %s. Attempting " - "defaults...\n", execute_command); - } - run_init_process("/sbin/init"); - run_init_process("/etc/init"); - run_init_process("/bin/init"); - run_init_process("/bin/sh"); - panic("No init found. Try passing init= option to kernel."); + /* rootfs is available now, try loading default modules */ + load_default_modules(); } diff --git a/init/noinitramfs.c b/init/noinitramfs.c new file mode 100644 index 00000000000..267739d8517 --- /dev/null +++ b/init/noinitramfs.c @@ -0,0 +1,52 @@ +/* + * init/noinitramfs.c + * + * Copyright (C) 2006, NXP Semiconductors, All Rights Reserved + * Author: Jean-Paul Saman <jean-paul.saman@nxp.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/kdev_t.h> +#include <linux/syscalls.h> + +/* + * Create a simple rootfs that is similar to the default initramfs + */ +static int __init default_rootfs(void) +{ + int err; + + err = sys_mkdir((const char __user __force *) "/dev", 0755); + if (err < 0) + goto out; + + err = sys_mknod((const char __user __force *) "/dev/console", + S_IFCHR | S_IRUSR | S_IWUSR, + new_encode_dev(MKDEV(5, 1))); + if (err < 0) + goto out; + + err = sys_mkdir((const char __user __force *) "/root", 0700); + if (err < 0) + goto out; + + return 0; + +out: + printk(KERN_WARNING "Failed to create a rootfs\n"); + return err; +} +rootfs_initcall(default_rootfs); diff --git a/init/version.c b/init/version.c index 3ddc3ceec2f..1a4718e500f 100644 --- a/init/version.c +++ b/init/version.c @@ -6,28 +6,45 @@ * May be freely distributed as part of Linux. */ -#include <linux/compile.h> +#include <generated/compile.h> #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> +#include <generated/utsrelease.h> #include <linux/version.h> +#include <linux/proc_ns.h> +#ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a #define version_string(a) version(a) +extern int version_string(LINUX_VERSION_CODE); int version_string(LINUX_VERSION_CODE); +#endif -struct new_utsname system_utsname = { - .sysname = UTS_SYSNAME, - .nodename = UTS_NODENAME, - .release = UTS_RELEASE, - .version = UTS_VERSION, - .machine = UTS_MACHINE, - .domainname = UTS_DOMAINNAME, +struct uts_namespace init_uts_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, + .user_ns = &init_user_ns, + .proc_inum = PROC_UTS_INIT_INO, }; +EXPORT_SYMBOL_GPL(init_uts_ns); -EXPORT_SYMBOL(system_utsname); - +/* FIXED STRINGS! Don't touch! */ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; + +const char linux_proc_banner[] = + "%s version %s" + " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")" + " (" LINUX_COMPILER ") %s\n"; |
