diff options
Diffstat (limited to 'arch/x86_64')
64 files changed, 1798 insertions, 5509 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index e9b4f058a49..145bb824b2a 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -415,13 +415,13 @@ config OUT_OF_LINE_PFN_TO_PAGE depends on DISCONTIGMEM config NR_CPUS - int "Maximum number of CPUs (2-256)" + int "Maximum number of CPUs (2-255)" range 2 255 depends on SMP default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. Current maximum is 256 CPUs due to + kernel will support. Current maximum is 255 CPUs due to APIC addressing limits. Less depending on the hardware. This is purely to save memory - each supported CPU requires @@ -565,23 +565,56 @@ config CRASH_DUMP PHYSICAL_START. For more details see Documentation/kdump/kdump.txt +config RELOCATABLE + bool "Build a relocatable kernel(EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Builds a relocatable kernel. This enables loading and running + a kernel binary from a different physical address than it has + been compiled for. + + One use is for the kexec on panic case where the recovery kernel + must live at a different physical address than the primary + kernel. + + Note: If CONFIG_RELOCATABLE=y, then kernel run from the address + it has been loaded at and compile time physical address + (CONFIG_PHYSICAL_START) is ignored. + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if CRASH_DUMP default "0x200000" help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x200000 (2MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as + This gives the physical address where the kernel is loaded. It + should be aligned to 2MB boundary. + + If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then + bzImage will decompress itself to above physical address and + run from there. Otherwise, bzImage will run from the address where + it has been loaded by the boot loader and will ignore above physical + address. + + In normal kdump cases one does not have to set/change this option + as now bzImage can be compiled as a completely relocatable image + (CONFIG_RELOCATABLE=y) and be used to load and run from a different + address. This option is mainly useful for the folks who don't want + to use a bzImage for capturing the crash dump and want to use a + vmlinux instead. + + So if you are using bzImage for capturing the crash dump, leave + the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y. + Otherwise if you plan to use vmlinux for capturing the crash dump + change this value to start of the reserved region (Typically 16MB + 0x1000000). In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed kernel. Typically this parameter is set as crashkernel=64M@16M. Please take a look at Documentation/kdump/kdump.txt for more details about crash dumps. + Usage of bzImage for capturing the crash dump is advantageous as + one does not have to build two kernels. Same kernel can be used + as production kernel and capture kernel. + Don't change this unless you know what you are doing. config SECCOMP @@ -627,14 +660,6 @@ config CC_STACKPROTECTOR_ALL source kernel/Kconfig.hz -config REORDER - bool "Function reordering" - default n - help - This option enables the toolchain to reorder functions for a more - optimal TLB usage. If you have pretty much any version of binutils, - this can increase your kernel build time by roughly one minute. - config K8_NB def_bool y depends on AGP_AMD64 || IOMMU || (PCI && NUMA) diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 2941a915d4e..29617ae3926 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -40,10 +40,6 @@ cflags-y += -m64 cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe -cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections -# this makes reading assembly source easier, but produces worse code -# actually it makes the kernel smaller too. -cflags-y += -fno-reorder-blocks cflags-y += -Wno-sign-compare cflags-y += -fno-asynchronous-unwind-tables ifneq ($(CONFIG_DEBUG_INFO),y) diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index deb063e7762..ee6f6505f95 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -36,7 +36,7 @@ subdir- := compressed/ #Let make clean descend in compressed/ # --------------------------------------------------------------------------- $(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index e70fa6e1da0..705a3e33d7e 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -8,16 +8,14 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional -AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 -CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -LDFLAGS := -m elf_i386 +CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin +LDFLAGS := -m elf_x86_64 -LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 - -$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE +LDFLAGS_vmlinux := -T +$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T +LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 6f55565e4d4..f9d5692a010 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -26,116 +26,279 @@ #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/pgtable.h> #include <asm/page.h> +#include <asm/msr.h> +.section ".text.head" .code32 .globl startup_32 - + startup_32: cld cli - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - - lss stack_start,%esp - xorl %eax,%eax -1: incl %eax # check that A20 really IS enabled - movl %eax,0x000000 # loop forever if it isn't - cmpl %eax,0x100000 - je 1b + movl $(__KERNEL_DS), %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + +/* Calculate the delta between where we were compiled to run + * at and where we were actually loaded at. This can only be done + * with a short local call on x86. Nothing else will tell us what + * address we are running at. The reserved chunk of the real-mode + * data at 0x34-0x3f are used as the stack for this calculation. + * Only 4 bytes are needed. + */ + leal 0x40(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp + +/* setup a stack and make sure cpu supports long mode. */ + movl $user_stack_end, %eax + addl %ebp, %eax + movl %eax, %esp + + call verify_cpu + testl %eax, %eax + jnz no_longmode + +/* Compute the delta between where we were compiled to run at + * and where the code will actually run at. + */ +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. + */ + +#ifdef CONFIG_RELOCATABLE + movl %ebp, %ebx + addl $(LARGE_PAGE_SIZE -1), %ebx + andl $LARGE_PAGE_MASK, %ebx +#else + movl $CONFIG_PHYSICAL_START, %ebx +#endif + + /* Replace the compressed data size with the uncompressed size */ + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx + /* Add 8 bytes for every 32K input block */ + shrl $12, %eax + addl %eax, %ebx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addl $(32768 + 18 + 4095), %ebx + andl $~4095, %ebx /* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. + * Prepare for entering 64 bit mode */ - pushl $0 - popfl + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + leal gdt(%ebp), %eax + movl %eax, gdt+2(%ebp) + lgdt gdt(%ebp) + + /* Enable PAE mode */ + xorl %eax, %eax + orl $(1 << 5), %eax + movl %eax, %cr4 + + /* + * Build early 4G boot pagetable + */ + /* Initialize Page tables to 0*/ + leal pgtable(%ebx), %edi + xorl %eax, %eax + movl $((4096*6)/4), %ecx + rep stosl + + /* Build Level 4 */ + leal pgtable + 0(%ebx), %edi + leal 0x1007 (%edi), %eax + movl %eax, 0(%edi) + + /* Build Level 3 */ + leal pgtable + 0x1000(%ebx), %edi + leal 0x1007(%edi), %eax + movl $4, %ecx +1: movl %eax, 0x00(%edi) + addl $0x00001000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Build Level 2 */ + leal pgtable + 0x2000(%ebx), %edi + movl $0x00000183, %eax + movl $2048, %ecx +1: movl %eax, 0(%edi) + addl $0x00200000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Enable the boot page tables */ + leal pgtable(%ebx), %eax + movl %eax, %cr3 + + /* Enable Long mode in EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Setup for the jump to 64bit mode + * + * When the jump is performend we will be in long mode but + * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 + * (and in turn EFER.LMA = 1). To jump into 64bit mode we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + * We place all of the values on our mini stack so lret can + * used to perform that far jump. + */ + pushl $__KERNEL_CS + leal startup_64(%ebp), %eax + pushl %eax + + /* Enter paged protected Mode, activating Long Mode */ + movl $0x80000001, %eax /* Enable Paging and Protected mode */ + movl %eax, %cr0 + + /* Jump from 32bit compatibility mode into 64bit mode. */ + lret + +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + + /* Be careful here startup_64 needs to be at a predictable + * address so I can export it in an ELF header. Bootloaders + * should look at the ELF header to find this address, as + * it may change in the future. + */ + .code64 + .org 0x200 +ENTRY(startup_64) + /* We come here either from startup_32 or directly from a + * 64bit bootloader. If we come here from a bootloader we depend on + * an identity mapped page table being provied that maps our + * entire text+data+bss and hopefully all of memory. + */ + + /* Setup data segments. */ + xorl %eax, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Compute the decompressed kernel start address. It is where + * we were loaded at aligned to a 2M boundary. %rbp contains the + * decompressed kernel start address. + * + * If it is a relocatable kernel then decompress and run the kernel + * from load address aligned to 2MB addr, otherwise decompress and + * run the kernel from CONFIG_PHYSICAL_START + */ + + /* Start with the delta to where the kernel will run at. */ +#ifdef CONFIG_RELOCATABLE + leaq startup_32(%rip) /* - $startup_32 */, %rbp + addq $(LARGE_PAGE_SIZE - 1), %rbp + andq $LARGE_PAGE_MASK, %rbp + movq %rbp, %rbx +#else + movq $CONFIG_PHYSICAL_START, %rbp + movq %rbp, %rbx +#endif + + /* Replace the compressed data size with the uncompressed size */ + movl input_len(%rip), %eax + subq %rax, %rbx + movl output_len(%rip), %eax + addq %rax, %rbx + /* Add 8 bytes for every 32K input block */ + shrq $12, %rax + addq %rax, %rbx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addq $(32768 + 18 + 4095), %rbx + andq $~4095, %rbx + +/* Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. + */ + leaq _end(%rip), %r8 + leaq _end(%rbx), %r9 + movq $_end /* - $startup_32 */, %rcx +1: subq $8, %r8 + subq $8, %r9 + movq 0(%r8), %rax + movq %rax, 0(%r9) + subq $8, %rcx + jnz 1b + +/* + * Jump to the relocated address. + */ + leaq relocated(%rbx), %rax + jmp *%rax + +.section ".text" +relocated: + /* * Clear BSS */ - xorl %eax,%eax - movl $_edata,%edi - movl $_end,%ecx - subl %edi,%ecx + xorq %rax, %rax + leaq _edata(%rbx), %rdi + leaq _end(%rbx), %rcx + subq %rdi, %rcx cld rep stosb + + /* Setup the stack */ + leaq user_stack_end(%rip), %rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* * Do the decompression, and jump to the new kernel.. */ - subl $16,%esp # place for structure on the stack - movl %esp,%eax - pushl %esi # real mode pointer as second arg - pushl %eax # address of structure as first arg - call decompress_kernel - orl %eax,%eax - jnz 3f - addl $8,%esp - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START + pushq %rsi # Save the real mode argument + movq %rsi, %rdi # real mode address + leaq _heap(%rip), %rsi # _heap + leaq input_data(%rip), %rdx # input_data + movl input_len(%rip), %eax + movq %rax, %rcx # input_len + movq %rbp, %r8 # output + call decompress_kernel + popq %rsi -/* - * We come here, if we were loaded high. - * We need to move the move-in-place routine down to 0x1000 - * and then start it with the buffer addresses in registers, - * which we got from the stack. - */ -3: - movl %esi,%ebx - movl $move_routine_start,%esi - movl $0x1000,%edi - movl $move_routine_end,%ecx - subl %esi,%ecx - addl $3,%ecx - shrl $2,%ecx - cld - rep - movsl - - popl %esi # discard the address - addl $4,%esp # real mode pointer - popl %esi # low_buffer_start - popl %ecx # lcount - popl %edx # high_buffer_start - popl %eax # hcount - movl $__PHYSICAL_START,%edi - cli # make sure we don't get interrupted - ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine /* - * Routine (template) for moving the decompressed kernel in place, - * if we were high loaded. This _must_ PIC-code ! + * Jump to the decompressed kernel. */ -move_routine_start: - movl %ecx,%ebp - shrl $2,%ecx - rep - movsl - movl %ebp,%ecx - andl $3,%ecx - rep - movsb - movl %edx,%esi - movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 - addl $3,%ecx - shrl $2,%ecx - rep - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START -move_routine_end: + jmp *%rbp - -/* Stack for uncompression */ - .align 32 -user_stack: + .data +gdt: + .word gdt_end - gdt + .long gdt + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +gdt_end: + .bss +/* Stack for uncompression */ + .balign 4 +user_stack: .fill 4096,4,0 -stack_start: - .long user_stack+4096 - .word __KERNEL_DS - +user_stack_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index 3755b2e394d..f932b0e8909 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -9,10 +9,95 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +#define _LINUX_STRING_H_ 1 +#define __LINUX_BITMAP_H 1 + +#include <linux/linkage.h> #include <linux/screen_info.h> #include <asm/io.h> #include <asm/page.h> +/* WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically + * at run time, but no relocation processing is performed. + * This means that it is not safe to place pointers in static structures. + */ + +/* + * Getting to provable safe in place decompression is hard. + * Worst case behaviours need to be analized. + * Background information: + * + * The file layout is: + * magic[2] + * method[1] + * flags[1] + * timestamp[4] + * extraflags[1] + * os[1] + * compressed data blocks[N] + * crc[4] orig_len[4] + * + * resulting in 18 bytes of non compressed data overhead. + * + * Files divided into blocks + * 1 bit (last block flag) + * 2 bits (block type) + * + * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. + * The smallest block type encoding is always used. + * + * stored: + * 32 bits length in bytes. + * + * fixed: + * magic fixed tree. + * symbols. + * + * dynamic: + * dynamic tree encoding. + * symbols. + * + * + * The buffer for decompression in place is the length of the + * uncompressed data, plus a small amount extra to keep the algorithm safe. + * The compressed data is placed at the end of the buffer. The output + * pointer is placed at the start of the buffer and the input pointer + * is placed where the compressed data starts. Problems will occur + * when the output pointer overruns the input pointer. + * + * The output pointer can only overrun the input pointer if the input + * pointer is moving faster than the output pointer. A condition only + * triggered by data whose compressed form is larger than the uncompressed + * form. + * + * The worst case at the block level is a growth of the compressed data + * of 5 bytes per 32767 bytes. + * + * The worst case internal to a compressed block is very hard to figure. + * The worst case can at least be boundined by having one bit that represents + * 32764 bytes and then all of the rest of the bytes representing the very + * very last byte. + * + * All of which is enough to compute an amount of extra data that is required + * to be safe. To avoid problems at the block level allocating 5 extra bytes + * per 32767 bytes of data is sufficient. To avoind problems internal to a block + * adding an extra 32767 bytes (the worst case uncompressed block size) is + * sufficient, to ensure that in the worst case the decompressed data for + * block will stop the byte before the compressed data for a block begins. + |