diff options
Diffstat (limited to 'arch/x86/mm/pat.c')
| -rw-r--r-- | arch/x86/mm/pat.c | 764 |
1 files changed, 333 insertions, 431 deletions
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c009a241d56..657438858e8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,13 +12,15 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/gfp.h> +#include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/rbtree.h> #include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/tlbflush.h> +#include <asm/x86_init.h> #include <asm/pgtable.h> #include <asm/fcntl.h> #include <asm/e820.h> @@ -28,6 +30,8 @@ #include <asm/pat.h> #include <asm/io.h> +#include "pat_internal.h" + #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; @@ -51,19 +55,15 @@ static inline void pat_disable(const char *reason) #endif -static int debug_enable; +int pat_debug_enable; static int __init pat_debug_setup(char *str) { - debug_enable = 1; + pat_debug_enable = 1; return 0; } __setup("debugpat", pat_debug_setup); -#define dprintk(fmt, arg...) \ - do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) - - static u64 __read_mostly boot_pat_state; enum { @@ -80,6 +80,7 @@ enum { void pat_init(void) { u64 pat; + bool boot_cpu = !boot_pat_state; if (!pat_enabled) return; @@ -121,49 +122,15 @@ void pat_init(void) rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); wrmsrl(MSR_IA32_CR_PAT, pat); - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); -} -#undef PAT - -static char *cattr_name(unsigned long flags) -{ - switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; - } + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); } -/* - * The global memtype list keeps track of memory type for specific - * physical memory areas. Conflicting memory types in different - * mappings can cause CPU cache corruption. To avoid this we keep track. - * - * The list is sorted based on starting address and can contain multiple - * entries for each address (this allows reference counting for overlapping - * areas). All the aliases have the same cache attributes of course. - * Zero attributes are represented as holes. - * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. - * - * memtype_lock protects the whole list. - */ - -struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; -}; +#undef PAT -static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -182,130 +149,111 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) u8 mtrr_type; mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_UNCACHABLE) - return _PAGE_CACHE_UC; - if (mtrr_type == MTRR_TYPE_WRCOMB) - return _PAGE_CACHE_WC; + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_UC_MINUS; + + return _PAGE_CACHE_WB; } return req_type; } +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + static int -chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) -{ - if (new->type != entry->type) { - if (type) { - new->type = entry->type; - *type = entry->type; - } else - goto conflict; - } +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ + struct pagerange_state *state = arg; - /* check overlaps with more than one entry in the list */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (new->end <= entry->start) - break; - else if (new->type != entry->type) - goto conflict; - } - return 0; + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; - conflict: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, - new->end, cattr_name(new->type), cattr_name(entry->type)); - return -EBUSY; + return state->ram && state->not_ram; } -static struct memtype *cached_entry; -static u64 cached_start; - -static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - /* - * For legacy reasons, physical address range in the legacy ISA - * region is tracked as non-RAM. This will allow users of - * /dev/mem to map portions of legacy ISA region, even when - * some of those portions are listed(or not even listed) with - * different e820 types(RAM/reserved/..) - */ - if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && - page_is_ram(page_nr)) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); } - return ram_page; + return (ret > 0) ? -1 : (state.ram ? 1 : 0); } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. - * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. - * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; + unsigned long type; - SetPageNonWB(page); + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + start, end - 1, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } } - return 0; -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - ClearPageNonWB(page); + set_page_memtype(page, req_type); } - - return -EINVAL; + return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -315,9 +263,6 @@ out: * - _PAGE_CACHE_UC_MINUS * - _PAGE_CACHE_UC * - * req_type will have a special case value '-1', when requester want to inherit - * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. - * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error @@ -326,9 +271,8 @@ out: int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { - struct memtype *new, *entry; + struct memtype *new; unsigned long actual_type; - struct list_head *where; int is_range_ram; int err = 0; @@ -337,8 +281,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == -1) - *new_type = _PAGE_CACHE_WB; + if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -346,41 +290,34 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; } - if (req_type == -1) { - /* - * Call mtrr_lookup to get the type hint. This is an - * optimization for /dev/mem mmap'ers into WB memory (BIOS - * tools and ACPI tools). Use WB request for WB memory and use - * UC_MINUS otherwise. - */ - u8 mtrr_type = mtrr_type_lookup(start, end); - - if (mtrr_type == MTRR_TYPE_WRBACK) - actual_type = _PAGE_CACHE_WB; - else - actual_type = _PAGE_CACHE_UC_MINUS; - } else { - actual_type = pat_x_mtrr_type(start, end, - req_type & _PAGE_CACHE_MASK); - } + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); if (new_type) *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + err = reserve_ram_pages_type(start, end, req_type, new_type); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } - new = kmalloc(sizeof(struct memtype), GFP_KERNEL); + new = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; @@ -390,73 +327,21 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else - entry = list_entry(&memtype_list, struct memtype, nd); - - /* Search for existing mapping that overlaps the current range */ - where = NULL; - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (end <= entry->start) { - where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); - break; - } else if (start <= entry->start) { /* end > entry->start */ - err = chk_conflict(new, entry, new_type); - if (!err) { - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); - } - break; - } else if (start < entry->end) { /* start > entry->start */ - err = chk_conflict(new, entry, new_type); - if (!err) { - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); - - /* - * Move to right position in the linked - * list to add this new entry - */ - list_for_each_entry_continue(entry, - &memtype_list, nd) { - if (start <= entry->start) { - where = entry->nd.prev; - break; - } - } - } - break; - } - } - + err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " - "track %s, req %s\n", - start, end, cattr_name(new->type), cattr_name(req_type)); + printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); return err; } - cached_start = start; - - if (where) - list_add(&new->nd, where); - else - list_add_tail(&new->nd, &memtype_list); - spin_unlock(&memtype_lock); - dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(new->type), cattr_name(req_type), + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), new_type ? cattr_name(*new_type) : "-"); return err; @@ -464,48 +349,138 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *entry; int err = -EINVAL; int is_range_ram; + struct memtype *entry; if (!pat_enabled) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + err = free_ram_pages_type(start, end); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); - list_for_each_entry(entry, &memtype_list, nd) { - if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; - - list_del(&entry->nd); - kfree(entry); - err = 0; - break; - } - } + entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (err) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", - current->comm, current->pid, start, end); + if (!entry) { + printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); + return -EINVAL; } - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + kfree(entry); - return err; + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); + + return 0; } +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = rbt_memtype_lookup(paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + resource_size_t size = end - start; + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -531,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -546,16 +520,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { - u64 offset = ((u64) pfn) << PAGE_SHIFT; - unsigned long flags = -1; - int retval; + unsigned long flags = _PAGE_CACHE_WB; if (!range_is_allowed(pfn, size)) return 0; - if (file->f_flags & O_SYNC) { + if (file->f_flags & O_DSYNC) flags = _PAGE_CACHE_UC_MINUS; - } #ifdef CONFIG_X86_32 /* @@ -576,64 +547,11 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, } #endif - /* - * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. - * - * Without O_SYNC, we want to get - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from confliting mappings otherwise - */ - if (flags != -1) { - retval = reserve_memtype(offset, offset + size, flags, NULL); - } else { - retval = reserve_memtype(offset, offset + size, -1, &flags); - } - - if (retval < 0) - return 0; - - if (((pfn < max_low_pfn_mapped) || - (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && - ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { - free_memtype(offset, offset + size); - printk(KERN_INFO - "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - offset, (unsigned long long)(offset + size)); - return 0; - } - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | flags); return 1; } -void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) -{ - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); - u64 addr = (u64)pfn << PAGE_SHIFT; - unsigned long flags; - - reserve_memtype(addr, addr + size, want_flags, &flags); - if (flags != want_flags) { - printk(KERN_INFO - "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", - current->comm, current->pid, - cattr_name(want_flags), - addr, (unsigned long long)(addr + size), - cattr_name(flags)); - } -} - -void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) -{ - u64 addr = (u64)pfn << PAGE_SHIFT; - - free_memtype(addr, addr + size); -} - /* * Change the memory type for the physial address range in kernel identity * mapping space if that range is a part of identity map. @@ -642,20 +560,26 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (!pat_enabled || base >= __pa(high_memory)) + if (base > __pa(high_memory-1)) return 0; - id_sz = (__pa(high_memory) < base + size) ? + /* + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) + return 0; + + id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { - printk(KERN_INFO - "%s:%d ioremap_change_attr failed %s " - "for %Lx-%Lx\n", + printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " + "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(flags), - base, (unsigned long long)(base + size)); + base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; @@ -671,31 +595,49 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, { int is_ram = 0; int ret; - unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + unsigned long flags = want_flags; is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. */ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) return ret; if (flags != want_flags) { - if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for %Lx-%Lx, got %s\n", + " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); return -EINVAL; } @@ -729,34 +671,25 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. - * Otherwise, we reserve the entire vma range, my ging through the PTEs page - * by page to get physical address and protection. */ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { - int retval = 0; - unsigned long i, j; resource_size_t paddr; unsigned long prot; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. */ - if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); return -EINVAL; } @@ -764,124 +697,95 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } - /* reserve entire vma page by page, using pfn and prot from pte */ - for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) - continue; - - pgprot = __pgprot(prot); - retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1); - if (retval) - goto cleanup_ret; - } return 0; - -cleanup_ret: - /* Reserve error: Cleanup partial reservation and return error */ - for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) - continue; - - free_pfn_range(paddr, PAGE_SIZE); - } - - return retval; } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. - * Otherwise, we look t the pfn and size and reserve only the specified range - * page by page. - * - * Note that this function can be called with caller trying to map only a - * subrange/page inside the vma. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) { - int retval = 0; - unsigned long i, j; - resource_size_t base_paddr; - resource_size_t paddr; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; + unsigned long flags; + + /* reserve the whole chunk starting from paddr */ + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; + } if (!pat_enabled) return 0; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); - } + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ + flags = lookup_memtype(paddr); - /* reserve page by page using pfn and size */ - base_paddr = (resource_size_t)pfn << PAGE_SHIFT; - for (i = 0; i < size; i += PAGE_SIZE) { - paddr = base_paddr + i; - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0); - if (retval) - goto cleanup_ret; + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; +} -cleanup_ret: - /* Reserve error: Cleanup partial reservation and return error */ - for (j = 0; j < i; j += PAGE_SIZE) { - paddr = base_paddr + j; - free_pfn_range(paddr, PAGE_SIZE); - } +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; - return retval; + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { - unsigned long i; resource_size_t paddr; unsigned long prot; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; - - if (!pat_enabled) - return; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!(vma->vm_flags & VM_PAT)) return; - } - if (size != 0 && size != vma_size) { - /* free page by page, using pfn and size */ - paddr = (resource_size_t)pfn << PAGE_SHIFT; - for (i = 0; i < size; i += PAGE_SIZE) { - paddr = paddr + i; - free_pfn_range(paddr, PAGE_SIZE); + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; } - } else { - /* free entire vma, page by page, using the pfn from pte */ - for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) - continue; - free_pfn_range(paddr, PAGE_SIZE); - } + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) @@ -895,29 +799,25 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine); #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) -/* get Nth element of the linked list */ static struct memtype *memtype_get_idx(loff_t pos) { - struct memtype *list_node, *print_entry; - int i = 1; + struct memtype *print_entry; + int ret; - print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!print_entry) return NULL; spin_lock(&memtype_lock); - list_for_each_entry(list_node, &memtype_list, nd) { - if (pos == i) { - *print_entry = *list_node; - spin_unlock(&memtype_lock); - return print_entry; - } - ++i; - } + ret = rbt_memtype_copy_nth_element(print_entry, pos); spin_unlock(&memtype_lock); - kfree(print_entry); - return NULL; + if (!ret) { + return print_entry; + } else { + kfree(print_entry); + return NULL; + } } static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) @@ -951,7 +851,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations memtype_seq_ops = { +static const struct seq_operations memtype_seq_ops = { .start = memtype_seq_start, .next = memtype_seq_next, .stop = memtype_seq_stop, @@ -972,8 +872,10 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, - NULL, &memtype_fops); + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } return 0; } |
