diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-20 12:05:30 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-20 12:05:30 -0800 |
commit | 972d5e7e5b66f5a143026fcdd4b2be2f519c0f12 (patch) | |
tree | 6c1c5bb79fe163b3b48254605b54532099b74cff | |
parent | 5d4863e4cc4dc12d1d5e42da3cb5d38c535e4ad6 (diff) | |
parent | ef0b8b9a521c65201bfca9747ee1bf374296133c (diff) |
Merge branch 'x86-efi-kexec-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 EFI changes from Ingo Molnar:
"This consists of two main parts:
- New static EFI runtime services virtual mapping layout which is
groundwork for kexec support on EFI (Borislav Petkov)
- EFI kexec support itself (Dave Young)"
* 'x86-efi-kexec-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
x86/efi: parse_efi_setup() build fix
x86: ksysfs.c build fix
x86/efi: Delete superfluous global variables
x86: Reserve setup_data ranges late after parsing memmap cmdline
x86: Export x86 boot_params to sysfs
x86: Add xloadflags bit for EFI runtime support on kexec
x86/efi: Pass necessary EFI data for kexec via setup_data
efi: Export EFI runtime memory mapping to sysfs
efi: Export more EFI table variables to sysfs
x86/efi: Cleanup efi_enter_virtual_mode() function
x86/efi: Fix off-by-one bug in EFI Boot Services reservation
x86/efi: Add a wrapper function efi_map_region_fixed()
x86/efi: Remove unused variables in __map_region()
x86/efi: Check krealloc return value
x86/efi: Runtime services virtual mapping
x86/mm/cpa: Map in an arbitrary pgd
x86/mm/pageattr: Add last levels of error path
x86/mm/pageattr: Add a PUD error unwinding path
x86/mm/pageattr: Add a PTE pagetable populating function
x86/mm/pageattr: Add a PMD pagetable populating function
...
23 files changed, 1689 insertions, 116 deletions
diff --git a/Documentation/ABI/testing/sysfs-firmware-efi b/Documentation/ABI/testing/sysfs-firmware-efi new file mode 100644 index 00000000000..05874da7ce8 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-efi @@ -0,0 +1,20 @@ +What: /sys/firmware/efi/fw_vendor +Date: December 2013 +Contact: Dave Young <dyoung@redhat.com> +Description: It shows the physical address of firmware vendor field in the + EFI system table. +Users: Kexec + +What: /sys/firmware/efi/runtime +Date: December 2013 +Contact: Dave Young <dyoung@redhat.com> +Description: It shows the physical address of runtime service table entry in + the EFI system table. +Users: Kexec + +What: /sys/firmware/efi/config_table +Date: December 2013 +Contact: Dave Young <dyoung@redhat.com> +Description: It shows the physical address of config table entry in the EFI + system table. +Users: Kexec diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-runtime-map b/Documentation/ABI/testing/sysfs-firmware-efi-runtime-map new file mode 100644 index 00000000000..c61b9b348e9 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-efi-runtime-map @@ -0,0 +1,34 @@ +What: /sys/firmware/efi/runtime-map/ +Date: December 2013 +Contact: Dave Young <dyoung@redhat.com> +Description: Switching efi runtime services to virtual mode requires + that all efi memory ranges which have the runtime attribute + bit set to be mapped to virtual addresses. + + The efi runtime services can only be switched to virtual + mode once without rebooting. The kexec kernel must maintain + the same physical to virtual address mappings as the first + kernel. The mappings are exported to sysfs so userspace tools + can reassemble them and pass them into the kexec kernel. + + /sys/firmware/efi/runtime-map/ is the directory the kernel + exports that information in. + + subdirectories are named with the number of the memory range: + + /sys/firmware/efi/runtime-map/0 + /sys/firmware/efi/runtime-map/1 + /sys/firmware/efi/runtime-map/2 + /sys/firmware/efi/runtime-map/3 + ... + + Each subdirectory contains five files: + + attribute : The attributes of the memory range. + num_pages : The size of the memory range in pages. + phys_addr : The physical address of the memory range. + type : The type of the memory range. + virt_addr : The virtual address of the memory range. + + Above values are all hexadecimal numbers with the '0x' prefix. +Users: Kexec diff --git a/Documentation/ABI/testing/sysfs-kernel-boot_params b/Documentation/ABI/testing/sysfs-kernel-boot_params new file mode 100644 index 00000000000..eca38ce2852 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-boot_params @@ -0,0 +1,38 @@ +What: /sys/kernel/boot_params +Date: December 2013 +Contact: Dave Young <dyoung@redhat.com> +Description: The /sys/kernel/boot_params directory contains two + files: "data" and "version" and one subdirectory "setup_data". + It is used to export the kernel boot parameters of an x86 + platform to userspace for kexec and debugging purpose. + + If there's no setup_data in boot_params the subdirectory will + not be created. + + "data" file is the binary representation of struct boot_params. + + "version" file is the string representation of boot + protocol version. + + "setup_data" subdirectory contains the setup_data data + structure in boot_params. setup_data is maintained in kernel + as a link list. In "setup_data" subdirectory there's one + subdirectory for each link list node named with the number + of the list nodes. The list node subdirectory contains two + files "type" and "data". "type" file is the string + representation of setup_data type. "data" file is the binary + representation of setup_data payload. + + The whole boot_params directory structure is like below: + /sys/kernel/boot_params + |__ data + |__ setup_data + | |__ 0 + | | |__ data + | | |__ type + | |__ 1 + | |__ data + | |__ type + |__ version + +Users: Kexec diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 3b8e262c365..4eb5fff022b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -899,6 +899,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. edd= [EDD] Format: {"off" | "on" | "skip[mbr]"} + efi= [EFI] + Format: { "old_map" } + old_map [X86-64]: switch to the old ioremap-based EFI + runtime services mapping. 32-bit still uses this one by + default. + efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of your efi variable storage. Use this parameter only if diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index f4f268c2b82..cb81741d3b0 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -608,6 +608,9 @@ Protocol: 2.12+ - If 1, the kernel supports the 64-bit EFI handoff entry point given at handover_offset + 0x200. + Bit 4 (read): XLF_EFI_KEXEC + - If 1, the kernel supports kexec EFI boot with EFI runtime support. + Field name: cmdline_size Type: read Offset/size: 0x238/4 diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 881582f75c9..c584a51add1 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -28,4 +28,11 @@ reference. Current X86-64 implementations only support 40 bits of address space, but we support up to 46 bits. This expands into MBZ space in the page tables. +->trampoline_pgd: + +We map EFI runtime services in the aforementioned PGD in the virtual +range of 64Gb (arbitrarily set, can be raised if needed) + +0xffffffef00000000 - 0xffffffff00000000 + -Andi Kleen, Jul 2004 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9ec06a1f6d6..ec3b8ba6809 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -391,7 +391,14 @@ xloadflags: #else # define XLF23 0 #endif - .word XLF0 | XLF1 | XLF23 + +#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) +# define XLF4 XLF_EFI_KEXEC +#else +# define XLF4 0 +#endif + + .word XLF0 | XLF1 | XLF23 | XLF4 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 65c6e6e3a55..3b978c472d0 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,6 +1,24 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +/* + * We map the EFI regions needed for runtime services non-contiguously, + * with preserved alignment on virtual addresses starting from -4G down + * for a total max space of 64G. This way, we provide for stable runtime + * services addresses across kernels so that a kexec'd kernel can still + * use them. + * + * This is the main reason why we're doing stable VA mappings for RT + * services. + * + * This flag is used in conjuction with a chicken bit called + * "efi=old_map" which can be used as a fallback to the old runtime + * services mapping method in case there's some b0rkage with a + * particular EFI implementation (haha, it is hard to hold up the + * sarcasm here...). + */ +#define EFI_OLD_MEMMAP EFI_ARCH_1 + #ifdef CONFIG_X86_32 #define EFI_LOADER_SIGNATURE "EL32" @@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4), (u64)(a5), (u64)(a6)) +#define _efi_call_virtX(x, f, ...) \ +({ \ + efi_status_t __s; \ + \ + efi_sync_low_kernel_mappings(); \ + preempt_disable(); \ + __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \ + preempt_enable(); \ + __s; \ +}) + #define efi_call_virt0(f) \ - efi_call0((efi.systab->runtime->f)) -#define efi_call_virt1(f, a1) \ - efi_call1((efi.systab->runtime->f), (u64)(a1)) -#define efi_call_virt2(f, a1, a2) \ - efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) -#define efi_call_virt3(f, a1, a2, a3) \ - efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3)) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4)) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5)) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) + _efi_call_virtX(0, f) +#define efi_call_virt1(f, a1) \ + _efi_call_virtX(1, f, (u64)(a1)) +#define efi_call_virt2(f, a1, a2) \ + _efi_call_virtX(2, f, (u64)(a1), (u64)(a2)) +#define efi_call_virt3(f, a1, a2, a3) \ + _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3)) +#define efi_call_virt4(f, a1, a2, a3, a4) \ + _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4)) +#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ + _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5)) +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ + _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -95,12 +120,28 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern unsigned long x86_efi_facility; +extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_map_region(efi_memory_desc_t *md); +extern void __init efi_map_region_fixed(efi_memory_desc_t *md); +extern void efi_sync_low_kernel_mappings(void); +extern void efi_setup_page_tables(void); +extern void __init old_map_region(efi_memory_desc_t *md); + +struct efi_setup_data { + u64 fw_vendor; + u64 runtime; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +extern u64 efi_setup; #ifdef CONFIG_EFI @@ -110,7 +151,7 @@ static inline bool efi_is_native(void) } extern struct console early_efi_console; - +extern void parse_efi_setup(u64 phys_addr, u32 data_len); #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. @@ -122,6 +163,7 @@ extern struct console early_efi_console; #define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 0ecac257fb2..a83aa44bb1f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { } */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern phys_addr_t slow_virt_to_phys(void *__address); - +extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 9c3733c5f8f..225b0988043 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -6,6 +6,7 @@ #define SETUP_E820_EXT 1 #define SETUP_DTB 2 #define SETUP_PCI 3 +#define SETUP_EFI 4 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -23,6 +24,7 @@ #define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) #define XLF_EFI_HANDOVER_32 (1<<2) #define XLF_EFI_HANDOVER_64 (1<<3) +#define XLF_EFI_KEXEC (1<<4) #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b0a34e2cd7..510cca5c539 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 00000000000..c2bedaea11f --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,340 @@ +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying <ying.huang@intel.com> + * Copyright (C) 2013, 2013 Red Hat, Inc. + * Dave Young <dyoung@redhat.com> + * + * This file is released under the GPLv2 + */ + +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/io.h> +#include <asm/setup.h> + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, (void *)&boot_params + off, count); + return count; +} + +static struct bin_attribute boot_params_data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = boot_params_data_read, + .size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { + &boot_params_version_attr.attr, + NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { + &boot_params_data_attr, + NULL, +}; + +static struct attribute_group boot_params_attr_group = { + .attrs = boot_params_version_attrs, + .bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ + const char *name; + + name = kobject_name(kobj); + return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + if (nr == i) { + *paddr = pa_data; + return 0; + } + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + if (nr == i) { + *size = data->len; + iounmap(data); + return 0; + } + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int nr, ret; + u64 paddr; + struct setup_data *data; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + ret = sprintf(buf, "0x%x\n", data->type); + iounmap(data); + return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, + struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, + loff_t off, size_t count) +{ + int nr, ret = 0; + u64 paddr; + struct setup_data *data; + void *p; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + if (off > data->len) { + ret = -EINVAL; + goto out; + } + + if (count > data->len - off) + count = data->len - off; + + if (!count) + goto out; + + ret = count; + p = ioremap_cache(paddr + sizeof(*data), data->len); + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(buf, p + off, count); + iounmap(p); +out: + iounmap(data); + return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct bin_attribute data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { + &type_attr.attr, + NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { + &data_attr, + NULL, +}; + +static struct attribute_group setup_data_attr_group = { + .attrs = setup_data_type_attrs, + .bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, + struct kobject **kobjp, int nr) +{ + int ret = 0; + size_t size; + struct kobject *kobj; + char name[16]; /* should be enough for setup_data nodes numbers */ + snprintf(name, 16, "%d", nr); + + kobj = kobject_create_and_add(name, parent); + if (!kobj) + return -ENOMEM; + + ret = get_setup_data_size(nr, &size); + if (ret) + goto out_kobj; + + data_attr.size = size; + ret = sysfs_create_group(kobj, &setup_data_attr_group); + if (ret) + goto out_kobj; + *kobjp = kobj; + + return 0; +out_kobj: + kobject_put(kobj); + return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ + sysfs_remove_group(kobj, &setup_data_attr_group); + kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ + int ret = 0; + struct setup_data *data; + + *nr = 0; + while (pa_data) { + *nr += 1; + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + pa_data = data->next; + iounmap(data); + } + +out: + return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ + struct kobject *setup_data_kobj, **kobjp; + u64 pa_data; + int i, j, nr, ret = 0; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return 0; + + setup_data_kobj = kobject_create_and_add("setup_data", parent); + if (!setup_data_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = get_setup_data_total_num(pa_data, &nr); + if (ret) + goto out_setup_data_kobj; + + kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); + if (!kobjp) { + ret = -ENOMEM; + goto out_setup_data_kobj; + } + + for (i = 0; i < nr; i++) { + ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); + if (ret) + goto out_clean_nodes; + } + + kfree(kobjp); + return 0; + +out_clean_nodes: + for (j = i - 1; j > 0; j--) + cleanup_setup_data_node(*(kobjp + j)); + kfree(kobjp); +out_setup_data_kobj: + kobject_put(setup_data_kobj); +out: + return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ + int ret; + struct kobject *boot_params_kobj; + + boot_params_kobj = kobject_create_and_add("boot_params", + kernel_kobj); + if (!boot_params_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); + if (ret) + goto out_boot_params_kobj; + + ret = create_setup_data_nodes(boot_params_kobj); + if (ret) + goto out_create_group; + + return 0; +out_create_group: + sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: + kobject_put(boot_params_kobj); +out: + return ret; +} + +arch_initcall(boot_params_ksysfs_init); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cb233bc9dee..be4b456e444 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -447,6 +447,9 @@ static void __init parse_setup_data(void) case SETUP_DTB: add_dtb(pa_data); break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; default: break; } @@ -924,8 +927,6 @@ void __init setup_arch(char **cmdline_p) iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); - /* update the e820_saved too */ - e820_reserve_setup_data(); copy_edd(); @@ -987,6 +988,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif + /* update the e820_saved too */ + e820_reserve_setup_data(); finish_e820_parsing(); if (efi_enabled(EFI_BOOT)) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bb32480c2d7..b3b19f46c01 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -30,6 +30,7 @@ */ struct cpa_data { unsigned long *vaddr; + pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; int numpages; @@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, return prot; } -/* - * Lookup the page table entry for a virtual address. Return a pointer - * to the entry and the level of the mapping. - * - * Note: We return pud and pmd either when the entry is marked large - * or when the present bit is not set. Otherwise we would return a - * pointer to a nonexisting mapping. - */ -pte_t *lookup_address(unsigned long address, unsigned int *level) +static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) { - pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; @@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + return __lookup_address_in_pgd(pgd_offset_k(address), address, level); +} EXPORT_SYMBOL_GPL(lookup_address); +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, + unsigned int *level) +{ + if (cpa->pgd) + return __lookup_address_in_pgd(cpa->pgd + pgd_index(address), + address, level); + + return lookup_address(address, level); +} + /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() @@ -437,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Check for races, another CPU might have split this page * up already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) goto out_unlock; @@ -543,7 +559,8 @@ out_unlock: } static int -__split_large_page(pte_t *kpte, unsigned long address, struct page *base) +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, + struct page *base) { pte_t *pbase = (pte_t *)page_address(base); unsigned long pfn, pfninc = 1; @@ -556,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base) * Check for races, another CPU might have split this page * up for us already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; @@ -632,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base) return 0; } -static int split_large_page(pte_t *kpte, unsigned long address) +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + unsigned long address) { struct page *base; @@ -644,15 +662,390 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!base) return -ENOMEM; - if (__split_large_page(kpte, address, base)) + if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; } +static bool try_to_free_pte_page(pte_t *pte) +{ + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; + + free_page((unsigned long)pte); + return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; + + free_page((unsigned long)pmd); + return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, start); + + while (start < end) { + set_pte(pte, __pte(0)); + + start += PAGE_SIZE; + pte++; + } + + if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) +{ + if (unmap_pte_range(pmd, start, end)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, start); + + /* + * Not on a 2MB page boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + __unmap_pmd_range(pud, pmd, start, pre_end); + + start = pre_end; + pmd++; + } + + /* + * Try to unmap in 2M chunks. + */ + while (end - start >= PMD_SIZE) { + if (pmd_large(*pmd)) + pmd_clear(pmd); + else + __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; + } + + /* + * 4K leftovers? |