diff options
Diffstat (limited to 'arch/x86/mm/pat.c')
| -rw-r--r-- | arch/x86/mm/pat.c | 813 |
1 files changed, 548 insertions, 265 deletions
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2a50e0fa64a..657438858e8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -7,30 +7,35 @@ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. */ -#include <linux/mm.h> -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/fs.h> +#include <linux/seq_file.h> #include <linux/bootmem.h> #include <linux/debugfs.h> -#include <linux/seq_file.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/rbtree.h> -#include <asm/msr.h> -#include <asm/tlbflush.h> +#include <asm/cacheflush.h> #include <asm/processor.h> -#include <asm/page.h> +#include <asm/tlbflush.h> +#include <asm/x86_init.h> #include <asm/pgtable.h> -#include <asm/pat.h> -#include <asm/e820.h> -#include <asm/cacheflush.h> #include <asm/fcntl.h> +#include <asm/e820.h> #include <asm/mtrr.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/pat.h> #include <asm/io.h> +#include "pat_internal.h" + #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; -void __cpuinit pat_disable(char *reason) +static inline void pat_disable(const char *reason) { pat_enabled = 0; printk(KERN_INFO "%s\n", reason); @@ -42,21 +47,23 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} #endif -static int debug_enable; +int pat_debug_enable; + static int __init pat_debug_setup(char *str) { - debug_enable = 1; + pat_debug_enable = 1; return 0; } __setup("debugpat", pat_debug_setup); -#define dprintk(fmt, arg...) \ - do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) - - static u64 __read_mostly boot_pat_state; enum { @@ -73,20 +80,25 @@ enum { void pat_init(void) { u64 pat; + bool boot_cpu = !boot_pat_state; if (!pat_enabled) return; - /* Paranoia check. */ - if (!cpu_has_pat && boot_pat_state) { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } } /* Set PWT to Write-Combining. All other bits stay the same */ @@ -110,49 +122,15 @@ void pat_init(void) rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); wrmsrl(MSR_IA32_CR_PAT, pat); - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); -} - -#undef PAT -static char *cattr_name(unsigned long flags) -{ - switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; - } + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); } -/* - * The global memtype list keeps track of memory type for specific - * physical memory areas. Conflicting memory types in different - * mappings can cause CPU cache corruption. To avoid this we keep track. - * - * The list is sorted based on starting address and can contain multiple - * entries for each address (this allows reference counting for overlapping - * areas). All the aliases have the same cache attributes of course. - * Zero attributes are represented as holes. - * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. - * - * memtype_lock protects the whole list. - */ - -struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; -}; +#undef PAT -static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -171,44 +149,112 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) u8 mtrr_type; mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_UNCACHABLE) - return _PAGE_CACHE_UC; - if (mtrr_type == MTRR_TYPE_WRCOMB) - return _PAGE_CACHE_WC; + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_UC_MINUS; + + return _PAGE_CACHE_WB; } return req_type; } -static int chk_conflict(struct memtype *new, struct memtype *entry, - unsigned long *type) +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + +static int +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ + struct pagerange_state *state = arg; + + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; + + return state->ram && state->not_ram; +} + +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { - if (new->type != entry->type) { - if (type) { - new->type = entry->type; - *type = entry->type; - } else - goto conflict; + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; + + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); } - /* check overlaps with more than one entry in the list */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (new->end <= entry->start) - break; - else if (new->type != entry->type) - goto conflict; + return (ret > 0) ? -1 : (state.ram ? 1 : 0); +} + +/* + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range + */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; } - return 0; - conflict: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, - new->end, cattr_name(new->type), cattr_name(entry->type)); - return -EBUSY; + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + unsigned long type; + + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + start, end - 1, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } + } + + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, req_type); + } + return 0; } -static struct memtype *cached_entry; -static u64 cached_start; +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, -1); + } + return 0; +} /* * req_type typically has one of the: @@ -217,29 +263,26 @@ static u64 cached_start; * - _PAGE_CACHE_UC_MINUS * - _PAGE_CACHE_UC * - * req_type will have a special case value '-1', when requester want to inherit - * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. - * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { - struct memtype *new, *entry; + struct memtype *new; unsigned long actual_type; - struct list_head *where; + int is_range_ram; int err = 0; - BUG_ON(start >= end); /* end is exclusive */ + BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == -1) - *new_type = _PAGE_CACHE_WB; + if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -247,108 +290,58 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; } - if (req_type == -1) { - /* - * Call mtrr_lookup to get the type hint. This is an - * optimization for /dev/mem mmap'ers into WB memory (BIOS - * tools and ACPI tools). Use WB request for WB memory and use - * UC_MINUS otherwise. - */ - u8 mtrr_type = mtrr_type_lookup(start, end); + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + + if (new_type) + *new_type = actual_type; - if (mtrr_type == MTRR_TYPE_WRBACK) - actual_type = _PAGE_CACHE_WB; - else - actual_type = _PAGE_CACHE_UC_MINUS; - } else - actual_type = pat_x_mtrr_type(start, end, - req_type & _PAGE_CACHE_MASK); + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = reserve_ram_pages_type(start, end, req_type, new_type); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } - new = kmalloc(sizeof(struct memtype), GFP_KERNEL); + new = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; - new->start = start; - new->end = end; - new->type = actual_type; - - if (new_type) - *new_type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else - entry = list_entry(&memtype_list, struct memtype, nd); - - /* Search for existing mapping that overlaps the current range */ - where = NULL; - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (end <= entry->start) { - where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); - break; - } else if (start <= entry->start) { /* end > entry->start */ - err = chk_conflict(new, entry, new_type); - if (!err) { - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); - } - break; - } else if (start < entry->end) { /* start > entry->start */ - err = chk_conflict(new, entry, new_type); - if (!err) { - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); - - /* - * Move to right position in the linked - * list to add this new entry - */ - list_for_each_entry_continue(entry, - &memtype_list, nd) { - if (start <= entry->start) { - where = entry->nd.prev; - break; - } - } - } - break; - } - } - + err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " - "track %s, req %s\n", - start, end, cattr_name(new->type), cattr_name(req_type)); + printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); + return err; } - cached_start = start; - - if (where) - list_add(&new->nd, where); - else - list_add_tail(&new->nd, &memtype_list); - spin_unlock(&memtype_lock); - dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(new->type), cattr_name(req_type), + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), new_type ? cattr_name(*new_type) : "-"); return err; @@ -356,39 +349,137 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *entry; int err = -EINVAL; + int is_range_ram; + struct memtype *entry; if (!pat_enabled) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; - spin_lock(&memtype_lock); - list_for_each_entry(entry, &memtype_list, nd) { - if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; - - list_del(&entry->nd); - kfree(entry); - err = 0; - break; - } + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = free_ram_pages_type(start, end); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; } + + spin_lock(&memtype_lock); + entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (err) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", - current->comm, current->pid, start, end); + if (!entry) { + printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); + return -EINVAL; } - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); - return err; + kfree(entry); + + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); + + return 0; +} + + +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = rbt_memtype_lookup(paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; } +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + resource_size_t size = end - start; + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) @@ -403,17 +494,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) return 1; } #else +/* This check is needed to avoid cache aliasing when PAT is enabled */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { u64 from = ((u64)pfn) << PAGE_SHIFT; u64 to = from + size; u64 cursor = from; + if (!pat_enabled) + return 1; + while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -426,16 +520,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { - u64 offset = ((u64) pfn) << PAGE_SHIFT; - unsigned long flags = -1; - int retval; + unsigned long flags = _PAGE_CACHE_WB; if (!range_is_allowed(pfn, size)) return 0; - if (file->f_flags & O_SYNC) { + if (file->f_flags & O_DSYNC) flags = _PAGE_CACHE_UC_MINUS; - } #ifdef CONFIG_X86_32 /* @@ -456,88 +547,277 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, } #endif + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +{ + unsigned long id_sz; + + if (base > __pa(high_memory-1)) + return 0; + /* - * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. - * - * Without O_SYNC, we want to get - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from confliting mappings otherwise + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. */ - if (flags != -1) { - retval = reserve_memtype(offset, offset + size, flags, NULL); - } else { - retval = reserve_memtype(offset, offset + size, -1, &flags); - } - - if (retval < 0) + if (!page_is_ram(base >> PAGE_SHIFT)) return 0; - if (((pfn < max_low_pfn_mapped) || - (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && - ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { - free_memtype(offset, offset + size); - printk(KERN_INFO - "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + id_sz = (__pa(high_memory-1) <= base + size) ? + __pa(high_memory) - base : + size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " + "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(flags), - offset, (unsigned long long)(offset + size)); + base, (unsigned long long)(base + size-1)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int ret; + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + unsigned long flags = want_flags; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. + */ + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; } - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | - flags); - return 1; + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR "%s:%d map pfn expected mapping type %s" + " for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(flags)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { + free_memtype(paddr, paddr + size); + return -EINVAL; + } + return 0; } -void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + */ +int track_pfn_copy(struct vm_area_struct *vma) +{ + resource_size_t paddr; + unsigned long prot; + unsigned long vma_size = vma->vm_end - vma->vm_start; + pgprot_t pgprot; + + if (vma->vm_flags & VM_PAT) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + return 0; +} + +/* + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call. + */ +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) { - u64 addr = (u64)pfn << PAGE_SHIFT; + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); - reserve_memtype(addr, addr + size, want_flags, &flags); - if (flags != want_flags) { - printk(KERN_INFO - "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", - current->comm, current->pid, - cattr_name(want_flags), - addr, (unsigned long long)(addr + size), - cattr_name(flags)); + /* reserve the whole chunk starting from paddr */ + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; } + + if (!pat_enabled) + return 0; + + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ + flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; } -void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +/* + * untrack_pfn is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case pfn, size are zero). + */ +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { - u64 addr = (u64)pfn << PAGE_SHIFT; + resource_size_t paddr; + unsigned long prot; + + if (!(vma->vm_flags & VM_PAT)) + return; - free_memtype(addr, addr + size); + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; + } + free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; +} + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); } +EXPORT_SYMBOL_GPL(pgprot_writecombine); -#if defined(CONFIG_DEBUG_FS) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) -/* get Nth element of the linked list */ static struct memtype *memtype_get_idx(loff_t pos) { - struct memtype *list_node, *print_entry; - int i = 1; + struct memtype *print_entry; + int ret; - print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!print_entry) return NULL; spin_lock(&memtype_lock); - list_for_each_entry(list_node, &memtype_list, nd) { - if (pos == i) { - *print_entry = *list_node; - spin_unlock(&memtype_lock); - return print_entry; - } - ++i; - } + ret = rbt_memtype_copy_nth_element(print_entry, pos); spin_unlock(&memtype_lock); - kfree(print_entry); - return NULL; + + if (!ret) { + return print_entry; + } else { + kfree(print_entry); + return NULL; + } } static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) @@ -567,10 +847,11 @@ static int memtype_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), print_entry->start, print_entry->end); kfree(print_entry); + return 0; } -static struct seq_operations memtype_seq_ops = { +static const struct seq_operations memtype_seq_ops = { .start = memtype_seq_start, .next = memtype_seq_next, .stop = memtype_seq_stop, @@ -591,11 +872,13 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, - NULL, &memtype_fops); + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } return 0; } late_initcall(pat_memtype_list_init); -#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ |
