diff options
61 files changed, 1429 insertions, 942 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 363e348bff9..da0e0773ca9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -594,6 +594,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. is selected automatically. Check Documentation/kdump/kdump.txt for further details. + crashkernel_low=size[KMG] + [KNL, x86] parts under 4G. + crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 406d82d5d2b..0e383169839 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -57,6 +57,10 @@ Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment Protocol 2.11: (Kernel 3.6) Added a field for offset of EFI handover protocol entry point. +Protocol 2.12: (Kernel 3.9) Added the xloadflags field and extension fields + to struct boot_params for for loading bzImage and ramdisk + above 4G in 64bit. + **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -182,7 +186,7 @@ Offset Proto Name Meaning 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 0235/1 2.10+ min_alignment Minimum alignment, as a power of two -0236/2 N/A pad3 Unused +0236/2 2.12+ xloadflags Boot protocol option flags 0238/4 2.06+ cmdline_size Maximum size of the kernel command line 023C/4 2.07+ hardware_subarch Hardware subarchitecture 0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data @@ -582,6 +586,27 @@ Protocol: 2.10+ misaligned kernel. Therefore, a loader should typically try each power-of-two alignment from kernel_alignment down to this alignment. +Field name: xloadflags +Type: read +Offset/size: 0x236/2 +Protocol: 2.12+ + + This field is a bitmask. + + Bit 0 (read): XLF_KERNEL_64 + - If 1, this kernel has the legacy 64-bit entry point at 0x200. + + Bit 1 (read): XLF_CAN_BE_LOADED_ABOVE_4G + - If 1, kernel/boot_params/cmdline/ramdisk can be above 4G. + + Bit 2 (read): XLF_EFI_HANDOVER_32 + - If 1, the kernel supports the 32-bit EFI handoff entry point + given at handover_offset. + + Bit 3 (read): XLF_EFI_HANDOVER_64 + - If 1, the kernel supports the 64-bit EFI handoff entry point + given at handover_offset + 0x200. + Field name: cmdline_size Type: read Offset/size: 0x238/4 @@ -1029,6 +1054,44 @@ must have read/write permission; CS must be __BOOT_CS and DS, ES, SS must be __BOOT_DS; interrupt must be disabled; %esi must hold the base address of the struct boot_params; %ebp, %edi and %ebx must be zero. +**** 64-bit BOOT PROTOCOL + +For machine with 64bit cpus and 64bit kernel, we could use 64bit bootloader +and we need a 64-bit boot protocol. + +In 64-bit boot protocol, the first step in loading a Linux kernel +should be to setup the boot parameters (struct boot_params, +traditionally known as "zero page"). The memory for struct boot_params +could be allocated anywhere (even above 4G) and initialized to all zero. +Then, the setup header at offset 0x01f1 of kernel image on should be +loaded into struct boot_params and examined. The end of setup header +can be calculated as follows: + + 0x0202 + byte value at offset 0x0201 + +In addition to read/modify/write the setup header of the struct +boot_params as that of 16-bit boot protocol, the boot loader should +also fill the additional fields of the struct boot_params as described +in zero-page.txt. + +After setting up the struct boot_params, the boot loader can load +64-bit kernel in the same way as that of 16-bit boot protocol, but +kernel could be loaded above 4G. + +In 64-bit boot protocol, the kernel is started by jumping to the +64-bit kernel entry point, which is the start address of loaded +64-bit kernel plus 0x200. + +At entry, the CPU must be in 64-bit mode with paging enabled. +The range with setup_header.init_size from start address of loaded +kernel and zero page and command line buffer get ident mapping; +a GDT must be loaded with the descriptors for selectors +__BOOT_CS(0x10) and __BOOT_DS(0x18); both descriptors must be 4G flat +segment; __BOOT_CS must have execute/read permission, and __BOOT_DS +must have read/write permission; CS must be __BOOT_CS and DS, ES, SS +must be __BOOT_DS; interrupt must be disabled; %rsi must hold the base +address of the struct boot_params. + **** EFI HANDOVER PROTOCOL This protocol allows boot loaders to defer initialisation to the EFI diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index cf5437deda8..199f453cb4d 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -19,6 +19,9 @@ Offset Proto Name Meaning 090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!! 0A0/010 ALL sys_desc_table System description table (struct sys_desc_table) 0B0/010 ALL olpc_ofw_header OLPC's OpenFirmware CIF and friends +0C0/004 ALL ext_ramdisk_image ramdisk_image high 32bits +0C4/004 ALL ext_ramdisk_size ramdisk_size high 32bits +0C8/004 ALL ext_cmd_line_ptr cmd_line_ptr high 32bits 140/080 ALL edid_info Video mode setup (struct edid_info) 1C0/020 ALL efi_info EFI 32 information (struct efi_info) 1E0/004 ALL alk_mem_k Alternative mem check, in KB @@ -27,6 +30,7 @@ Offset Proto Name Meaning 1E9/001 ALL eddbuf_entries Number of entries in eddbuf (below) 1EA/001 ALL edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer (below) +1EF/001 ALL sentinel Used to detect broken bootloaders 290/040 ALL edd_mbr_sig_buffer EDD MBR signatures 2D0/A00 ALL e820_map E820 memory map table (array of struct e820entry) diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index 41dd0088497..02f24447520 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -317,7 +317,8 @@ void __init plat_swiotlb_setup(void) octeon_swiotlb = alloc_bootmem_low_pages(swiotlbsize); - swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1); + if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM) + panic("Cannot allocate SWIOTLB buffer"); mips_dma_map_ops = &octeon_linear_dma_map_ops.dma_map_ops; } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index c3b72423c84..fc5a7c4bd9e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void) flushi(&valid_addr_bitmap_insn[0]); } +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + int i; + + for_each_online_node(i) + if (NODE_DATA(i)->node_spanned_pages) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} void __init mem_init(void) { unsigned long codepages, datapages, initpages; @@ -2038,20 +2048,8 @@ void __init mem_init(void) high_memory = __va(last_valid_pfn << PAGE_SHIFT); -#ifdef CONFIG_NEED_MULTIPLE_NODES - { - int i; - for_each_online_node(i) { - if (NODE_DATA(i)->node_spanned_pages != 0) { - totalram_pages += - free_all_bootmem_node(NODE_DATA(i)); - } - } - totalram_pages += free_low_memory_core_early(MAX_NUMNODES); - } -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif /* We subtract one to account for the mem_map_zero page * allocated below. diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 18997e5a105..5b7531966b8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -285,16 +285,26 @@ struct biosregs { void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); /* cmdline.c */ -int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); -int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); +int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize); +int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option); static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) { - return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + if (cmd_line_ptr >= 0x100000) + return -1; /* inaccessible */ + + return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize); } static inline int cmdline_find_option_bool(const char *option) { - return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + if (cmd_line_ptr >= 0x100000) + return -1; /* inaccessible */ + + return __cmdline_find_option_bool(cmd_line_ptr, option); } diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 6b3b6f708c0..625d21b0cd3 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -27,7 +27,7 @@ static inline int myisspace(u8 c) * Returns the length of the argument (regardless of if it was * truncated to fit in the buffer), or -1 on not found. */ -int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) +int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize) { addr_t cptr; char c; @@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int st_bufcpy /* Copying this to buffer */ } state = st_wordstart; - if (!cmdline_ptr || cmdline_ptr >= 0x100000) - return -1; /* No command line, or inaccessible */ + if (!cmdline_ptr) + return -1; /* No command line */ cptr = cmdline_ptr & 0xf; set_fs(cmdline_ptr >> 4); @@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int * Returns the position of that option (starts counting with 1) * or 0 on not found */ -int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) +int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option) { addr_t cptr; char c; @@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) st_wordskip, /* Miscompare, skip */ } state = st_wordstart; - if (!cmdline_ptr || cmdline_ptr >= 0x100000) - return -1; /* No command line, or inaccessible */ + if (!cmdline_ptr) + return -1; /* No command line */ cptr = cmdline_ptr & 0xf; set_fs(cmdline_ptr >> 4); diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 10f6b1178c6..bffd73b45b1 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} int cmdline_find_option(const char *option, char *buffer, int bufsize) { - return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); + return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize); } int cmdline_find_option_bool(const char *option) { - return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); + return __cmdline_find_option_bool(get_cmd_line_ptr(), option); } #endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2c4b171eec3..d9ae9a4ffcb 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -37,6 +37,12 @@ __HEAD .code32 ENTRY(startup_32) + /* + * 32bit entry is 0 and it is ABI so immutable! + * If we come here directly from a bootloader, + * kernel(text+data+bss+brk) ramdisk, zero_page, command line + * all need to be under the 4G limit. + */ cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -154,6 +160,12 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr + /* After gdt is loaded */ + xorl %eax, %eax + lldt %ax + movl $0x20, %eax + ltr %ax + /* * Setup for the jump to 64bit mode * @@ -176,28 +188,18 @@ ENTRY(startup_32) lret ENDPROC(startup_32) -no_longmode: - /* This isn't an x86-64 CPU so hang */ -1: - hlt - jmp 1b - -#include "../../kernel/verify_cpu.S" - - /* - * Be careful here startup_64 needs to be at a predictable - * address so I can export it in an ELF header. Bootloaders - * should look at the ELF header to find this address, as - * it may change in the future. - */ .code64 .org 0x200 ENTRY(startup_64) /* + * 64bit entry is 0x200 and it is ABI so immutable! * We come here either from startup_32 or directly from a - * 64bit bootloader. If we come here from a bootloader we depend on - * an identity mapped page table being provied that maps our - * entire text+data+bss and hopefully all of memory. + * 64bit bootloader. + * If we come here from a bootloader, kernel(text+data+bss+brk), + * ramdisk, zero_page, command line could be above 4G. + * We depend on an identity mapped page table being provided + * that maps our entire kernel(text+data+bss+brk), zero page + * and command line. */ #ifdef CONFIG_EFI_STUB /* @@ -247,9 +249,6 @@ preferred_addr: movl %eax, %ss movl %eax, %fs movl %eax, %gs - lldt %ax - movl $0x20, %eax - ltr %ax /* * Compute the decompressed kernel start address. It is where @@ -349,6 +348,15 @@ relocated: */ jmp *%rbp + .code32 +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + .data gdt: .word gdt_end - gdt diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 88f7ff6da40..7cb56c6ca35 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,6 +325,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; + sanitize_boot_params(real_mode); + if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0e6dc0ee0ee..674019d8e23 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -18,6 +18,7 @@ #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> +#include <asm/bootparam_utils.h> #define BOOT_BOOT_H #include "../ctype.h" diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8c132a625b9..9ec06a1f6d6 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -21,6 +21,7 @@ #include <asm/e820.h> #include <asm/page_types.h> #include <asm/setup.h> +#include <asm/bootparam.h> #include "boot.h" #include "voffset.h" #include "zoffset.h" @@ -255,6 +256,9 @@ section_table: # header, from the old boot sector. .section ".header", "a" + .globl sentinel +sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */ + .globl hdr hdr: setup_sects: .byte 0 /* Filled in by build.c */ @@ -279,7 +283,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020b # header version number (>= 0x0105) + .word 0x020c # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -297,13 +301,7 @@ type_of_loader: .byte 0 # 0 means ancient bootloader, newer # flags, unused bits must be zero (RFU) bit within loadflags loadflags: -LOADED_HIGH = 1 # If set, the kernel is loaded high -CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free - .byte LOADED_HIGH + .byte LOADED_HIGH # The kernel is to be loaded high setup_move_size: .word 0x8000 # size to move, when setup is not # loaded at 0x90000. We will move setup @@ -369,7 +367,31 @@ relocatable_kernel: .byte 1 relocatable_kernel: .byte 0 #endif min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment -pad3: .word 0 + +xloadflags: +#ifdef CONFIG_X86_64 +# define XLF0 XLF_KERNEL_64 /* 64-bit kernel */ +#else +# define XLF0 0 +#endif + +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) + /* kernel/boot_param/ramdisk could be loaded above 4g */ +# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G +#else +# define XLF1 0 +#endif + +#ifdef CONFIG_EFI_STUB +# ifdef CONFIG_X86_64 +# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */ +# else +# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */ +# endif +#else +# define XLF23 0 +#endif + .word XLF0 | XLF1 | XLF23 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol @@ -397,8 +419,13 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr #define INIT_SIZE VO_INIT_SIZE #endif init_size: .long INIT_SIZE # kernel initialization size -handover_offset: .long 0x30 # offset to the handover +handover_offset: +#ifdef CONFIG_EFI_STUB + .long 0x30 # offset to the handover # protocol entry point +#else + .long 0 +#endif # End of setup header ##################################################### diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 03c0683636b..96a6c756353 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -13,7 +13,7 @@ SECTIONS .bstext : { *(.bstext) } .bsdata : { *(.bsdata) } - . = 497; + . = 495; .header : { *(.header) } .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h new file mode 100644 index 00000000000..5b5e9cb774b --- /dev/null +++ b/arch/x86/include/asm/bootparam_utils.h @@ -0,0 +1,38 @@ +#ifndef _ASM_X86_BOOTPARAM_UTILS_H +#define _ASM_X86_BOOTPARAM_UTILS_H + +#include <asm/bootparam.h> + +/* + * This file is included from multiple environments. Do not + * add completing #includes to make it standalone. + */ + +/* + * Deal with bootloaders which fail to initialize unknown fields in + * boot_params to zero. The list fields in this list are taken from + * analysis of kexec-tools; if other broken bootloaders initialize a + * different set of fields we will need to figure out how to disambiguate. + * + */ +static void sanitize_boot_params(struct boot_params *boot_params) +{ + if (boot_params->sentinel) { + /*fields in boot_params are not valid, clear them */ + memset(&boot_params->olpc_ofw_header, 0, + (char *)&boot_params->alt_mem_k - + (char *)&boot_params->olpc_ofw_header); + memset(&boot_params->kbd_status, 0, + (char *)&boot_params->hdr - + (char *)&boot_params->kbd_status); + memset(&boot_params->_pad7[0], 0, + (char *)&boot_params->edd_mbr_sig_buffer[0] - + (char *)&boot_params->_pad7[0]); + memset(&boot_params->_pad8[0], 0, + (char *)&boot_params->eddbuf[0] - + (char *)&boot_params->_pad8[0]); + memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9)); + } +} + +#endif /* _ASM_X86_BOOTPARAM_UTILS_H */ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index adcc0ae73d0..223042086f4 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,20 +1,14 @@ -#ifndef _ASM_X86_INIT_32_H -#define _ASM_X86_INIT_32_H +#ifndef _ASM_X86_INIT_H +#define _ASM_X86_INIT_H -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif +struct x86_mapping_info { + void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ + void *context; /* context for alloc_pgt_page */ + unsigned long pmd_flag; /* page flag for PMD entry */ + bool kernel_mapping; /* kernel mapping or ident mapping */ +}; -extern void __init zone_sizes_init(void); +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end); -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - - -extern unsigned long __initdata pgt_buf_start; -extern unsigned long __meminitdata pgt_buf_end; -extern unsigned long __meminitdata pgt_buf_top; - -#endif /* _ASM_X86_INIT_32_H */ +#endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6080d2694ba..17483a492f1 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -48,11 +48,11 @@ # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) #else /* Maximum physical address we can use pages from */ -# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can reach in physical address mode */ -# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can use for the control pages */ -# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) /* Allocate one page for the pdp and the second for the code */ # define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 49119fcea2d..52560a2038e 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_X86_32 # include <asm/numa_32.h> -#else -# include <asm/numa_64.h> #endif #ifdef CONFIG_NUMA diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h deleted file mode 100644 index 0c05f7ae46e..00000000000 --- a/arch/x86/include/asm/numa_64.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_X86_NUMA_64_H -#define _ASM_X86_NUMA_64_H - -extern unsigned long numa_free_all_bootmem(void); - -#endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 3698a6a0a94..c87892442e5 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -17,6 +17,10 @@ struct page; +#include <linux/range.h> +extern struct range pfn_mapped[]; +extern int nr_pfn_mapped; + static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index e21fdd10479..54c97879195 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index bc28e6fe705..b6e41b8cd65 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -616,6 +616,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; +void init_mem_mapping(void); +void early_alloc_pgt_buf(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 766ea16fbbb..2d883440cb9 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H +#include <asm/sparsemem.h> + #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define EARLY_DYNAMIC_PAGE_TABLES 64 + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 9f82690f81e..e6423002c10 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 888184b2fc8..bdee8bd318e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -731,6 +731,7 @@ extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void early_trap_init(void); +void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fe1ec5bcd84..9c6b890d5e7 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,6 +58,7 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif -extern void __init setup_real_mode(void); +void reserve_real_mode(void); +void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 57693498519..3b2ce8fc995 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -69,17 +69,6 @@ struct x86_init_oem { }; /** - * struct x86_init_mapping - platform specific initial kernel pagetable setup - * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage - * - * For more details on the purpose of this hook, look in - * init_memory_mapping and the commit that added it. - */ -struct x86_init_mapping { - void (*pagetable_reserve)(u64 start, u64 end); -}; - -/** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call to setup * the kernel pagetables and prepare accessors functions. @@ -136,7 +125,6 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; - struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 92862cd9020..c15ddaf9071 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -1,6 +1,31 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H +/* setup_data types */ +#define SETUP_NONE 0 +#define SETUP_E820_EXT 1 +#define SETUP_DTB 2 +#define SETUP_PCI 3 + +/* ram_size flags */ +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +/* loadflags */ +#define LOADED_HIGH (1<<0) +#define QUIET_FLAG (1<<5) +#define KEEP_SEGMENTS (1<<6) +#define CAN_USE_HEAP (1<<7) + +/* xloadflags */ +#define XLF_KERNEL_64 (1<<0) +#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) +#define XLF_EFI_HANDOVER_32 (1<<2) +#define XLF_EFI_HANDOVER_64 (1<<3) + +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <linux/screen_info.h> #include <linux/apm_bios.h> @@ -9,12 +34,6 @@ #include <asm/ist.h> #include <video/edid.h> -/* setup data types */ -#define SETUP_NONE 0 -#define SETUP_E820_EXT 1 -#define SETUP_DTB 2 -#define SETUP_PCI 3 - /* extensible setup data list node */ struct setup_data { __u64 next; @@ -28,9 +47,6 @@ struct setup_header { __u16 root_flags; __u32 syssize; __u16 ram_size; -#define RAMDISK_IMAGE_START_MASK 0x07FF -#define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 __u16 vid_mode; __u16 root_dev; __u16 boot_flag; @@ -42,10 +58,6 @@ struct setup_header { __u16 kernel_version; __u8 type_of_loader; __u8 loadflags; -#define LOADED_HIGH (1<<0) -#define QUIET_FLAG (1<<5) -#define KEEP_SEGMENTS (1<<6) -#define CAN_USE_HEAP (1<<7) __u16 setup_move_size; __u32 code32_start; __u32 ramdisk_image; @@ -58,7 +70,8 @@ struct setup_header { __u32 initrd_addr_max; __u32 kernel_alignment; __u8 relocatable_kernel; - __u8 _pad2[3]; + __u8 min_alignment; + __u16 xloadflags; __u32 cmdline_size; __u32 hardware_subarch; __u64 hardware_subarch_data; @@ -106,7 +119,10 @@ struct boot_params { __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* 0x0a0 */ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ - __u8 _pad4[128]; /* 0x0c0 */ + __u32 ext_ramdisk_image; /* 0x0c0 */ + __u32 ext_ramdisk_size; /* 0x0c4 */ + __u32 ext_cmd_line_ptr; /* 0x0c8 */ + __u8 _pad4[116]; /* 0x0cc */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ @@ -115,7 +131,20 @@ struct boot_params { __u8 eddbuf_entries; /* 0x1e9 */ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ __u8 kbd_status; /* 0x1eb */ - __u8 _pad6[5]; /* 0x1ec */ + __u8 _pad5[3]; /* 0x1ec */ + /* + * The sentinel is set to a nonzero value (0xff) in header.S. + * + * A bootloader is supposed to only take setup_header and put + * it into a clean boot_params buffer. If it turns out that + * it is clumsy or too generous with the buffer, it most + * probably will pick up the sentinel variable too. The fact + * that this variable then is still 0xff will let kernel + * know that some variables in boot_params are invalid and + * kernel should zero out certain portions of boot_params. + */ + __u8 sentinel; /* 0x1ef */ + __u8 _pad6[1]; /* 0x1f0 */ struct setup_header hdr; /* setup header */ /* 0x1f1 */ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ @@ -134,6 +163,6 @@ enum { X86_NR_SUBARCHS, }; - +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index bacf4b0d91f..cfc755dc160 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include <asm/proto.h> -# include <asm/numa_64.h> #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e66311200cb..b574b295a2f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -768,10 +768,9 @@ int __init gart_iommu_init(void) aper_base = info.aper_base; end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - } pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 15239fffd6f..eafb084e80f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,6 @@ #include <asm/pci-direct.h> #ifdef CONFIG_X86_64 -# include <asm/numa_64.h> # include <asm/mmconfig.h> # include <asm/cacheflush.h> #endif @@ -685,12 +684,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fdfefa27b94..1905ce98bee 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -17,7 +17,6 @@ #ifdef CONFIG_X86_64 #include <linux/topology.h> -#include <asm/numa_64.h> #endif #include "cpu.h" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df06ade26be..d32abeabbda 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -835,7 +835,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_opt(char *p) +static int __init parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p) return *p == '\0' ? 0 : -EINVAL; } +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index e1755483299..138463a2487 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,6 +18,7 @@ #include <asm/io_apic.h> #include <asm/bios_ebda.h> #include <asm/tlbflush.h> +#include <asm/bootparam_utils.h> static void __init i386_default_early_setup(void) { @@ -30,19 +31,7 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_reserve(__pa_symbol(_text), - (unsigned long)__bss_stop - (unsigned long)_text); - -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif + sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { @@ -57,11 +46,5 @@ void __init i386_start_kernel(void) break; } - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7b215a50ec1..57334f4cd3a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,12 +25,83 @@ #include <asm/kdebug.h> #include <asm/e820.h> #include <asm/bios_ebda.h> +#include <asm/bootparam_utils.h> -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. + */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) +{ + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t pud, *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + +again: + pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd) + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pud_p += pud_index(address); + pud = *pud_p; + + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_p[i] = 0; + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + pmd_p[pmd_index(address)] = pmd; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -41,13 +112,25 @@ static void __init clear_bss(void) (unsigned long) __bss_stop - (unsigned long) __bss_start); } +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} + static void __init copy_bootdata(char *real_mode_data) { char * command_line; + unsigned long cmd_line_ptr; memcpy(&boot_params, real_mode_data, sizeof boot_params); - if (boot_params.hdr.cmd_line_ptr) { - command_line = __va(boot_params.hdr.cmd_line_ptr); + sanitize_boot_params(&boot_params); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } } @@ -70,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { #ifdef CONFIG_EARLY_PRINTK set_intr_gate(i, &early_idt_handlers[i]); @@ -87,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data) } load_idt((const struct desc_ptr *)&idt_descr); + copy_bootdata(__va(real_mode_data)); + if (console_loglevel == 10) early_printk("Kernel alive\n"); + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + x86_64_start_reservations(real_mode_data); } void __init x86_64_start_reservations(char *real_mode_data) { - copy_bootdata(__va(real_mode_data)); - - memblock_reserve(__pa_symbol(_text), - (unsigned long)__bss_stop - (unsigned long)_text); - -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); reserve_ebda_region(); - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 980053c4b9c..d94f6d68be2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,45 +78,62 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete + shrq $PGDIR_SHIFT, %rax - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + addq $4096, %rdx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + + addq $8192, %rbx + movq %rdi, %rax + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b /* * Fixup the kernel text+data virtual addresses. Note that @@ -124,7 +141,6 @@ ident_complete: * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -139,17 +155,14 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f ENTRY(secondary_startup_64) /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). @@ -159,12 +172,14 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax - movq %rax, %cr4 + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax addq phys_base(%rip), %rax movq %rax, %cr3 @@ -196,7 +211,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,15 +251,33 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. */ movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder @@ -270,13 +303,13 @@ ENDPROC(start_cpu0) /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -284,7 +317,7 @@ ENDPROC(start_cpu0) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -321,14 +354,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + + cmpl $14,72(%rsp) # Page fault? + jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -350,7 +391,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -364,6 +405,8 @@ ENTRY(early_idt_handler) decl early_recursion_flag(%rip) INTERRUPT_RETURN + __INITDATA + .balign 4 early_recursion_flag: .long 0 @@ -374,11 +417,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -388,24 +430,37 @@ ENTRY(name) i = i + 1 ; \ .endr + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ + +#ifndef CONFIG_XEN NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable @@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -472,6 +517,5 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b3ea9db39db..4eabc160696 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -16,125 +16,12 @@ #include <linux/io.h> #include <linux/suspend.h> +#include <asm/init.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> -static int init_one_level2_page(struct kimage *image, pgd_t *pgd, - unsigned long addr) -{ - pud_t *pud; - pmd_t *pmd; - struct page *page; - int result = -ENOMEM; - - addr &= PMD_MASK; - pgd += pgd_index(addr); - if (!pgd_present(*pgd)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pud = (pud_t *)page_address(page); - clear_page(pud); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pmd = (pmd_t *)page_address(page); - clear_page(pmd); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - result = 0; -out: - return result; -} - -static void init_level2_page(pmd_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - - addr &= PAGE_MASK; - end_addr = addr + PUD_SIZE; - while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - addr += PMD_SIZE; - } -} - -static int init_level3_page(struct kimage *image, pud_t *level3p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + PGDIR_SIZE; - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pmd_t *level2p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); - addr += PUD_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pud_clear(level3p++); - addr += PUD_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, pgd_t *level4p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pud_t *level3p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (pud_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) - goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pgd_clear(level4p++); - addr += PGDIR_SIZE; - } -out: - return result; -} - static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -184,22 +71,62 @@ err: return result; } +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long mstart, mend; pgd_t *level4p; int result; + int i; + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); - if (result) - return result; + clear_page(level4p); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } + /* - * image->start may be outside 0 ~ max_pfn, for example when - * jump back to original kernel from kexeced kernel + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. */ - result = init_one_level2_page(image, level4p, image->start); - if (result) - return result; + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + + if (result) + return result; + } + return init_transition_pgtable(image, level4p); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8354399b3aa..be6e435cfc0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,17 +108,16 @@ #include <asm/topology.h> #include <asm/apicdef.h> #include <asm/amd_nb.h> -#ifdef CONFIG_X86_64 -#include <asm/numa_64.h> -#endif #include <asm/mce.h> #include <asm/alternative.h> #include <asm/prom.h> /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align) return ret; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} +#ifdef CONFIG_X86_32 static void __init cleanup_highmap(void) { } @@ -306,27 +294,43 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + + return ramdisk_size; +} + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; - /* We need to move the initrd down into lowmem */ - ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, - PAGE_SIZE); + /* We need to move the initrd down into directly mapped mem */ + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + area_size, PAGE_SIZE); if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the lowmem currently occupied by + /* Note: this includes all the mem currently occupied by the initrd, we rely on that fact to keep the data intact. */ memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; @@ -336,17 +340,7 @@ static void __init relocate_initrd(void) q = (char *)initrd_start; - /* Copy any lowmem portion of the initrd */ - if (ramdisk_image < end_of_lowmem) { - clen = end_of_lowmem - ramdisk_image; - p = (char *)__va(ramdisk_image); - memcpy(q, p, clen); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - - /* Copy the highmem portion of the initrd */ + /* Copy the initrd */ while (ramdisk_size) { slop = ramdisk_image & ~PAGE_MASK; clen = ramdisk_size; @@ -360,22 +354,35 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } - /* high pages is not converted by early_res_to_bootmem */ - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_image = get_ramdisk_image(); + ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -383,22 +390,18 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= (end_of_lowmem>>1)) { + mapped_size = memblock_mem_size(max_pfn_mapped); + if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, end_of_lowmem>>1); - } + ramdisk_size, mapped_size>>1); printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ - /* - * don't need to reserve again, already reserved early - * in i386_start_kernel - */ + if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), + PFN_DOWN(ramdisk_end))) { + /* All are mapped, easy case */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; return; @@ -409,6 +412,9 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else +static void __init early_reserve_initrd(void) +{ +} static void __init reserve_initrd(void) { } @@ -419,8 +425,6 @@ static void __init parse_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { u32 data_len, map_len; @@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void) u64 pa_data; int found = 0; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this - * limit once kexec-tools are fixed. */ #ifdef CONFIG_X86_32 # define CRASH_KERNEL_ADDR_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX (896 << 20) +# define CRASH_KERNEL_ADDR_MAX MAXMEM #endif +static void __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long low_base = 0, low_size = 0; + unsigned long total_low_mem; + unsigned long long base; + int ret; + + total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + ret = parse_crashkernel_low(boot_command_line, total_low_mem, + &low_size, &base); + if (ret != 0 || low_size <= 0) + return; + + low_base = memblock_find_in_range(low_size, (1ULL<<32), + low_size, alignment); + + if (!low_base) { + pr_info("crashkernel low reservation failed - No suitable area found.\n"); + + return; + } + + memblock_reserve(low_base, low_size); + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif +} + static void __init reserve_crashkernel(void) { + const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; @@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - const unsigned long long alignment = 16<<20; /* 16M */ - /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ @@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void) pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } + } else { unsigned long long start; @@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); + + if (crash_base >= (1ULL<<32)) + reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) @@ -711,6 +747,27 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude kernel range. If we really are running on top non-RAM, + * we will crash later anyways. + */ + if (e820_all_mapped(start, start + size, E820_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + e820_remove_range(start, size, E820_RAM, 0); + e820_add_region(start, size, E820_RAM); +} + static int __init parse_reservelow(char *p) { unsigned long long size; @@ -748,6 +805,17 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { + memblock_reserve(__pa_symbol(_text), + (unsigned long)__bss_stop - (unsigned long)_text); + + early_reserve_initrd(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -906,6 +974,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + e820_add_kernel_range(); trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { @@ -955,6 +1024,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with @@ -964,7 +1035,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = get_max_mapped(); + memblock.current_limit = ISA_END_ADDRESS; memblock_x86_fill(); /* @@ -981,41 +1052,21 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif +#ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT) - 1); +#endif - setup_real_mode(); + reserve_real_mode(); trim_platform_memory_ranges(); - init_gbpages(); - - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); - max_pfn_mapped = max_low_pfn_mapped; - -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { + init_mem_mapping(); - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; + early_trap_pf_init(); - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } + setup_real_mode(); - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca11f4e..68bda7a8415 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -688,10 +688,19 @@ void __init early_trap_init(void) set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, &page_fault); +#endif load_idt(&idt_descr); } +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, &page_fault); +#endif +} + void __init trap_init(void) { int i; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7a3d075a814..50cf83ecd32 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d7aea41563b..d41815265a0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,86 +17,132 @@ #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +#include "mm_internal.h" -int after_bootmem; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; +static unsigned long min_pfn_mapped; -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; +static bool __initdata can_use_brk_pgt = true; /* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) +__ref void *alloc_low_pages(unsigned int num) { + unsigned long pfn; int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; - phys_addr_t base; - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; + if (after_bootmem) { + unsigned int order; - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | + __GFP_ZERO, order); + } - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE * num , PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE * num); + pfn = ret >> PAGE_SHIFT; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", + pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); + } - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); } - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + return __va(pfn << PAGE_SHIFT); +} -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - good_end = max_pfn_mapped << PAGE_SHIFT; +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); + base = __pa(extend_brk(tables, PAGE_SIZE)); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); +static void __init init_gbpages(void) +{ +#ifdef CONFIG_X86_64 + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif } -void __init native_pagetable_reserve(u64 start, u64 end) +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +static int page_size_mask; + +static void __init probe_page_size_mask(void) { - memblock_reserve(start, end - start); + init_gbpages(); + +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } } #ifdef CONFIG_X86_32 @@ -122,58 +168,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, } /* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, + int nr_range) { - unsigned long page_size_mask = 0; - unsigned long start_pfn, end_pfn; - unsigned long ret = 0; - unsigned long pos; - - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - int use_pse, use_gbpages; + int i; - printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", - start, end - 1); + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<<PG_LEVEL_2M)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { + unsigned long start = round_down(mr[i].start, PMD_SIZE); + unsigned long end = round_up(mr[i].end, PMD_SIZE); -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. - */ - use_pse = use_gbpages = 0; -#else - use_pse = cpu_has_pse; - use_gbpages = direct_gbpages; +#ifdef CONFIG_X86_32 + if ((end >> PAGE_SHIFT) > max_low_pfn) + continue; #endif - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_2M; + } + if ((page_size_mask & (1<<PG_LEVEL_1G)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { + unsigned long start = round_down(mr[i].start, PUD_SIZE); + unsigned long end = round_up(mr[i].end, PUD_SIZE); - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_1G; + } } +} - if (use_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; - if (use_pse) - page_size_mask |= 1 << PG_LEVEL_2M; +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, limit_pfn; + unsigned long pfn; + int i; - memset(mr, 0, sizeof(mr)); - nr_range = 0; + limit_pfn = PFN_DOWN(end); /* head if not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; + pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -181,66 +220,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * and overlapping MTRRs into large pages can cause * slowdowns. */ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + if (pfn == 0) + end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } #ifdef CONFIG_X86_64 /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } /* tail is not big page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } #endif /* tail is not big page (2M) alignment */ - start_pfn = pos>>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; + start_pfn = pfn; + end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ @@ -257,59 +290,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, nr_range--; } + if (!after_bootmem) + adjust_range_page_size_mask(mr, nr_range); + for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - find_early_table_space(mr, nr_range); + return nr_range; +} + +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + struct map_range mr[NR_RANGE_MR]; + unsigned long ret = 0; + int nr_range, i; + + pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); + + memset(mr, 0, sizeof(mr)); + nr_range = split_mem_range(mr, 0, start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#ifdef CONFIG_X86_32 - early_ioremap_page_table_range_init(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); - load_cr3(swapper_pg_dir); -#endif + return ret >> PAGE_SHIFT; +} - __flush_tlb_all(); +/* + * would have hole in the middle or ends, and only ram parts will be mapped. + */ +static unsigned long __init init_range_memory_mapping( + unsigned long r_start, + unsigned long r_end) +{ + unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; + int i; - /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. - * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. - * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. - */ - if (!after_bootmem && pgt_buf_end > pgt_buf_start) - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) + continue; - if (!after_bootmem) - early_memtest(start, end); + /* + * if it is overlapping with brk pgt, we need to + * alloc pgt buf from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= + min(end, (u64)pgt_buf_top<<PAGE_SHIFT); + init_memory_mapping(start, end); + mapped_ram_size += end - start; + can_use_brk_pgt = true; + } - return ret >> PAGE_SHIFT; + return mapped_ram_size; } +/* (PUD_SHIFT-PMD_SHIFT)/2 */ +#define STEP_SIZE_SHIFT 5 +void __init init_mem_mapping(void) +{ + unsigned long end, real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; + + probe_page_size_mask(); + +#ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +#else + end = max_low_pfn << PAGE_SHIFT; +#endif + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + /* xen has big range in reserved near end of ram, skip it at first */ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, + PAGE_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + while (last_start > ISA_END_ADDRESS) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; + } else + start = ISA_END_ADDRESS; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size <<= STEP_SIZE_SHIFT; + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < end) + init_range_memory_mapping(real_end, end); + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#else + early_ioremap_page_table_range_init(); +#endif + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); +} /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 745d66b843c..b299724f6e3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -53,25 +53,14 @@ #include <asm/page_types.h> #include <asm/init.h> +#include "mm_internal.h" + unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ - unsigned long pfn = pgt_buf_end++; - void *adr; - - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. This only returns the gd entry @@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); - else - pmd_table = (pmd_t *)alloc_low_page(); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = NULL; - - if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif - if (!page_table) - page_table = - (pte_t *)alloc_bootmem_pages(PAGE_SIZE); - } else - page_table = (pte_t *)alloc_low_page(); + pte_t *page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) return one_page_table_init(pmd) + pte_idx; } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, - unsigned long vaddr, pte_t *lastpte) + unsigned long vaddr, pte_t *lastpte, + void **adr) { #ifdef CONFIG_HIGHMEM /* @@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin - && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start - || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { pte_t *newpte; int i; BUG_ON(after_bootmem); - newpte = alloc_low_page(); + newpte = *adr; for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); vaddr = start; pgd_idx = pgd_index(vaddr); @@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { pte = page_table_kmap_check(one_page_table_init(pmd), - pmd, vaddr, pte); + pmd, vaddr, pte, &adr); vaddr += PMD_SIZE; } @@ -310,6 +321,7 @@ repeat: __pgprot(PTE_IDENT_ATTR | _PAGE_PSE); + pfn &= PMD_MASK >> PAGE_SHIFT; addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; @@ -455,9 +467,14 @@ void __init native_pagetable_init(void) /* * Remove any mappings which extend past the end of physical - * memory from the boot time page table: + * memory from the boot time page table. + * In virtual address space, we should have at least two pages + * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END + * definition. And max_low_pfn is set to VMALLOC_END physical + * address. If initial memory mapping is doing right job, we + * should have pte used near max_low_pfn or one pmd is not present. */ - for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); pgd = base + pgd_index(va); if (!pgd_present(*pgd)) @@ -468,10 +485,19 @@ void __init native_pagetable_init(void) if (!pmd_present(*pmd)) break; + /* should not be large page here */ + if (pmd_large(*pmd)) { + pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", + pfn, pmd, __pa(pmd)); + BUG_ON(1); + } + pte = pte_offset_kernel(pmd, va); if (!pte_present(*pte)) break; + printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n", + pfn, pmd, __pa(pmd), pte, __pa(pte)); pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); @@ -550,7 +576,7 @@ early_param("highmem", parse_highmem); * artificially via the highmem=x boot parameter then create * it: */ -void __init lowmem_pfn_init(void) +static void __init lowmem_pfn_init(void) { /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; @@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void) * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ -void __init highmem_pfn_init(void) +static void __init highmem_pfn_init(void) { max_low_pfn = MAXMEM_PFN; @@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void) printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); - - after_bootmem = 1; } /* @@ -753,6 +777,8 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; + after_bootmem = 1; + codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 287c6d6a9ef..edaa2daf4b3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,82 @@ #include <asm/uv/uv.h> #include <asm/setup.h> +#include "mm_internal.h" + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; @@ -314,69 +398,24 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = pgt_buf_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - *phys = __pa(adr); - - return adr; - } - - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); - - adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - clear_page(adr); - *phys = pfn * PAGE_SIZE; - return adr; -} - -static __ref void *map_low_page(void *virt) -{ - void *adr; - unsigned long phys, left; - - if (after_bootmem) - return virt; - - phys = __pa(virt); - left = phys & (PAGE_SIZE - 1); - adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); - adr = (void *)(((unsigned long)adr) | left); - - return adr; -} - -static __ref void unmap_low_page(void *adr) -{ - if (after_bootmem) - return; - - early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i; pte_t *pte = pte_page + pte_index(addr); - for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; if (addr >= end) { - if (!after_bootmem) { - for(; i < PTRS_PER_PTE; i++, pte++) - set_pte(pte, __pte(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; } /* @@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address = next) { - unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; pgprot_t new_prot = prot; + next = (address & PMD_MASK) + PMD_SIZE; if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; } - next = (address & PMD_MASK) + PMD_SIZE; - if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + pte = (pte_t *)pmd_page_vaddr(*pmd); last_map_addr = phys_pte_init(pte, address, end, prot); - unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; continue; } - pte = alloc_low_page(&pte_phys); + pte = alloc_low_page(); last_map_addr = phys_pte_init(pte, address, end, new_prot); - unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = next) { - unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - if (addr >= end) - break; - next = (addr & PUD_MASK) + PUD_SIZE; - - if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { - set_pud(pud, __pud(0)); + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } if (pud_val(*pud)) { if (!pud_large(*pud)) { - pmd = map_low_page(pmd_offset(pud, 0)); + pmd = pmd_offset(pud, 0); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); __flush_tlb_all(); continue; } @@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; continue; } - pmd = alloc_low_page(&pmd_phys); + pmd = alloc_low_page(); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); @@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start, for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; pud_t *pud; - next = (start + PGDIR_SIZE) & PGDIR_MASK; - if (next > end) - next = end; + next = (start & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { - pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + pud = (pud_t *)pgd_page_vaddr(*pgd); last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); continue; } - pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + pud = alloc_low_page(); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, __va(pud_phys)); + pgd_populate(&init_mm, pgd, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); @@ -686,6 +711,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static struct kcore_list kcore_vsyscall; +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; @@ -698,11 +733,8 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif absent_pages = absent_pages_in_range(0, max_pfn); reservedpages = max_pfn - totalram_pages - absent_pages; @@ -776,6 +808,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long all_end = PFN_ALIGN(&_end); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -784,10 +817,10 @@ void mark_rodata_ro(void) kernel_set_to_readonly = 1; /* - * The rodata section (but not the kernel text!) should also be - * not-executable. + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 00000000000..6b563a11889 --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,19 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} + +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + +extern int after_bootmem; + +#endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 92e27119ee1..9405ffc9150 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -10,16 +10,3 @@ void __init initmem_init(void) { x86_numa_init(); } - -unsigned long __init numa_free_all_bootmem(void) -{ - unsigned long pages = 0; - int i; - - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); - - pages += free_low_memory_core_early(MAX_NUMNODES); - - return pages; -} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6d13d2a3f82..a1b1c88f9ca 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -579,16 +579,10 @@ static int split_large_page(pte_t *kpte, unsigned long address) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); - if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), + PFN_DOWN(__pa(address)) + 1)) split_page_count(level); -#ifdef CONFIG_X86_64 - if (address >= (unsigned long)__va(1UL<<32) && - address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) - split_page_count(level); -#endif - /* * Install the new, split up pagetable. * @@ -757,13 +751,9 @@ static int cpa_process_alias(struct cpa_data *cpa) unsigned long vaddr; int ret; - if (cpa->pfn >= max_pfn_mapped) + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; -#ifdef CONFIG_X86_64 - if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) - return 0; -#endif /* * No need to redo, when the primary call touched the direct * mapping already: diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1b600266265..1743c1c9241 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, end_pfn; + u64 end, systab, start_pfn, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; + start_pfn = PFN_DOWN(md->phys_addr); end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) { + if (pfn_range_is_mapped(start_pfn, end_pfn)) { va = __va(md->phys_addr); if (!(md->attribute & EFI_MEMORY_WB)) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 460f314d13e..a0fde91c16c 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,8 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> + +#include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt; void *relocated_restore_code; -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void *alloc_pgt_page(void *context) { - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; + return (void *)get_safe_page(GFP_ATOMIC); } static int set_up_temporary_mappings(void) { - unsigned long start, end, next; - int error; + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernel_mapping = true, + }; + unsigned long mstart, mend; + int result; + int i; temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); if (!temp_level4_pgt) @@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void) init_level4_pgt[pgd_index(__START_KERNEL_map)]); /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(max_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, temp_level4_pgt, + mstart, mend); + + if (result) + return result; } + return 0; } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 80450261215..a44f457e70a 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -8,9 +8,26 @@ struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; -void __init setup_real_mode(void) +void __init reserve_real_mode(void) { phys_addr_t mem; + unsigned char *base; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + + /* Has to be under 1M so we can execute real-mode AP code. */ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + if (!mem) + panic("Cannot allocate trampoline\n"); + + base = __va(mem); + memblock_reserve(mem, size); + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); +} + +void __init setup_real_mode(void) +{ u16 real_mode_seg; u32 *rel; u32 count; @@ -25,16 +42,7 @@ void __init setup_real_mode(void) u64 efer; #endif - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - base = __va(mem); - memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); + base = (unsigned char *)real_mode_header; memcpy(base, real_mode_blob, size); @@ -78,16 +86,18 @@ void __init setup_real_mode(void) *trampoline_cr4_features = read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa_symbol(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa_symbol(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; + trampoline_pgd[511] = init_level4_pgt[511].pgd; #endif } /* - * set_real_mode_permissions() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page + * reserve_real_mode() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. + * function. Also trampoline code will be executed by APs so we + * need to mark it executable at do_pre_smp_initcalls() at least, + * thus run it as a early_initcall(). */ static int __init set_real_mode_permissions(void) { @@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void) return 0; } - -arch_initcall(set_real_mode_permissions); +early_initcall(set_real_mode_permissions); diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5a1847d6193..79d67bd507f 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -814,12 +814,14 @@ int main(int argc, char **argv) read_relocs(fp); if (show_absolute_syms) { print_absolute_symbols(); - return 0; + goto out; } if (show_absolute_relocs) { print_absolute_relocs(); - return 0; + goto out; } emit_relocs(as_text, use_real_mode); +out: + fclose(fp); return 0; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 01de35c7722..f5e86eee4e0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -{ - /* reserve the range used */ - native_pagetable_reserve(start, end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, - PFN_PHYS(pgt_buf_top)); - while (end < PFN_PHYS(pgt_buf_top)) { - make_lowmem_page_readwrite(__va(end)); - end += PAGE_SIZE; - } -} - #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - /* - * If the new pfn is within the range of the newly allocated - * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot as a freshly allocated page, make sure - * it is RO. - */ - if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_top)) || - (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) - pte = pte_wrprotect(pte); - return pte; } #endif /* CONFIG_X86_64 */ @@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index af47e759446..1d94316f0ea 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -231,7 +231,9 @@ retry: } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); if (early) { - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); rc = 0; } else rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 3f778c27f82..3cd16ba82f1 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -99,6 +99,9 @@ void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); +void *__alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -132,6 +135,8 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define alloc_bootmem_low(x) \ __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) +#define alloc_bootmem_low_pages_nopanic(x) \ + __alloc_bootmem_low_nopanic(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages(x) \ __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d0b8458a703..d2e6927bbaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -191,6 +191,7 @@ extern struct kimage *kexec_crash_image; /* Location of a reserved region to hold the crash kernel. */ extern struct resource crashk_res; +extern struct resource crashk_low_res; typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; @@ -199,6 +200,8 @@ extern size_t vmcoreinfo_max_size; int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d452ee19106..f388203db7e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -155,6 +155,7 @@ phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t memblock_phys_mem_size(void); +phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); diff --git a/include/linux/mm.h b/include/linux/mm.h index 66e2f7c61e5..9d9dcc35d6a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1386,7 +1386,6 @@ extern void __init mmap_init(void); extern void show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -extern int after_bootmem; extern __printf(3, 4) void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 071d62c214a..2de42f9401d 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -23,7 +23,7 @@ extern int swiotlb_force; #define IO_TLB_SHIFT 11 extern void swiotlb_init(int verbose); -extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); +int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5..2436ffcec91 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = { .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource crashk_low_res = { + .name = "Crash kernel low", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; int kexec_should_crash(struct task_struct *p) { @@ -1369,10 +1375,11 @@ static int __init parse_crashkernel_simple(char *cmdline, * That function is the entry point for command line parsing and should be * called from the arch-specific code. */ -int __init parse_crashkernel(char *cmdline, +static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + const char *name) { char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; @@ -1382,16 +1389,16 @@ int __init parse_crashkernel(char *cmdline, *crash_base = 0; /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); + p = strstr(p, name); while (p) { ck_cmdline = p; - p = strstr(p+1, "crashkernel="); + p = strstr(p+1, name); } if (!ck_cmdline) return -EINVAL; - ck_cmdline += 12; /* strlen("crashkernel=") */ + ck_cmdline += strlen(name); /* * if the commandline contains a ':', then that's the extended @@ -1409,6 +1416,23 @@ int __init parse_crashkernel(char *cmdline, return 0; } +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel="); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_low="); +} static void update_vmcoreinfo_note(void) { diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 196b06984de..bfe02b8fc55 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -122,11 +122,18 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return phys_to_dma(hwdev, virt_to_phys(address)); } +static bool no_iotlb_memory; + void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; unsigned char *vstart, *vend; + if (no_iotlb_memory) { + pr_warn("software IO TLB: No low mem\n"); + return; + } + vstart = phys_to_virt(io_tlb_start); vend = phys_to_virt(io_tlb_end); @@ -136,7 +143,7 @@ void swiotlb_print_info(void) bytes >> 20, vstart, vend - 1); } -void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { void *v_overflow_buffer; unsigned long i, bytes; @@ -150,9 +157,10 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) /* * Get the overflow emergency buffer */ - v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); + v_overflow_buffer = alloc_bootmem_low_pages_nopanic( + PAGE_ALIGN(io_tlb_overflow)); if (!v_overflow_buffer) - panic("Cannot allocate SWIOTLB overflow buffer!\n"); + return -ENOMEM; io_tlb_overflow_buffer = __pa(v_overflow_buffer); @@ -169,15 +177,19 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (verbose) swiotlb_print_info(); + + return 0; } /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ -static void __init -swiotlb_init_with_default_size(size_t default_size, int verbose) +void __init +swiotlb_init(int verbose) { + /* default to 64MB */ + size_t default_size = 64UL<<20; unsigned char *vstart; unsigned long bytes; @@ -188,20 +200,16 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; - /* - * Get IO TLB memory from the low pages - */ - vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); - if (!vstart) - panic("Cannot allocate SWIOTLB buffer"); - - swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose); -} + /* Get IO TLB memory from the low pages */ + vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); + if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + return; -void __init -swiotlb_init(int verbose) -{ - swiotlb_init_with_default_size(64 * (1<<20), verbose); /* default to 64MB */ + if (io_tlb_start) + free_bootmem(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + pr_warn("Cannot allocate SWIOTLB buffer"); + no_iotlb_memory = true; } /* @@ -405,6 +413,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, unsigned long offset_slots; unsigned long max_slots; + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + mask = dma_get_seg_boundary(hwdev); tbl_dma_addr &= mask; diff --git a/mm/bootmem.c b/mm/bootmem.c index b93376c39b6..2b0bcb019ec 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -833,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from diff --git a/mm/memblock.c b/mm/memblock.c index 88adc8afb61..b8d9147e5c0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -828,6 +828,23 @@ phys_addr_t __init memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) +{ + unsigned long pages = 0; + struct memblock_region *r; + unsigned long start_pfn, end_pfn; + + for_each_memblock(memory, r) { + start_pfn = memblock_region_memory_base_pfn(r); + end_pfn = memblock_region_memory_end_pfn(r); + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + pages += end_pfn - start_pfn; + } + + return (phys_addr_t)pages << PAGE_SHIFT; +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index b8294fc03df..5e07d36e381 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -154,21 +154,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) } /** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. - */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) -{ - register_page_bootmem_info_node(pgdat); - reset_node_lowmem_managed_pages(pgdat); - - /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -} - -/** * free_all_bootmem - release free pages to the buddy allocator * * Returns the number of pages actually released. @@ -406,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from |