diff options
Diffstat (limited to 'arch/x86/mm/pat.c')
| -rw-r--r-- | arch/x86/mm/pat.c | 437 |
1 files changed, 148 insertions, 289 deletions
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ae9648eb1c7..657438858e8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,7 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/gfp.h> +#include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/rbtree.h> @@ -30,6 +30,8 @@ #include <asm/pat.h> #include <asm/io.h> +#include "pat_internal.h" + #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; @@ -53,19 +55,15 @@ static inline void pat_disable(const char *reason) #endif -static int debug_enable; +int pat_debug_enable; static int __init pat_debug_setup(char *str) { - debug_enable = 1; + pat_debug_enable = 1; return 0; } __setup("debugpat", pat_debug_setup); -#define dprintk(fmt, arg...) \ - do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) - - static u64 __read_mostly boot_pat_state; enum { @@ -132,84 +130,7 @@ void pat_init(void) #undef PAT -static char *cattr_name(unsigned long flags) -{ - switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; - } -} - -/* - * The global memtype list keeps track of memory type for specific - * physical memory areas. Conflicting memory types in different - * mappings can cause CPU cache corruption. To avoid this we keep track. - * - * The list is sorted based on starting address and can contain multiple - * entries for each address (this allows reference counting for overlapping - * areas). All the aliases have the same cache attributes of course. - * Zero attributes are represented as holes. - * - * The data structure is a list that is also organized as an rbtree - * sorted on the start address of memtype range. - * - * memtype_lock protects both the linear list and rbtree. - */ - -struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; - struct rb_node rb; -}; - -static struct rb_root memtype_rbroot = RB_ROOT; -static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ - -static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) -{ - struct rb_node *node = root->rb_node; - struct memtype *last_lower = NULL; - - while (node) { - struct memtype *data = container_of(node, struct memtype, rb); - - if (data->start < start) { - last_lower = data; - node = node->rb_right; - } else if (data->start > start) { - node = node->rb_left; - } else - return data; - } - - /* Will return NULL if there is no entry with its start <= start */ - return last_lower; -} - -static void memtype_rb_insert(struct rb_root *root, struct memtype *data) -{ - struct rb_node **new = &(root->rb_node); - struct rb_node *parent = NULL; - - while (*new) { - struct memtype *this = container_of(*new, struct memtype, rb); - - parent = *new; - if (data->start <= this->start) - new = &((*new)->rb_left); - else if (data->start > this->start) - new = &((*new)->rb_right); - } - - rb_link_node(&data->rb, parent, new); - rb_insert_color(&data->rb, root); -} +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -237,58 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + static int -chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) { - if (new->type != entry->type) { - if (type) { - new->type = entry->type; - *type = entry->type; - } else - goto conflict; - } + struct pagerange_state *state = arg; - /* check overlaps with more than one entry in the list */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (new->end <= entry->start) - break; - else if (new->type != entry->type) - goto conflict; - } - return 0; + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; - conflict: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, - new->end, cattr_name(new->type), cattr_name(entry->type)); - return -EBUSY; + return state->ram && state->not_ram; } -static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - /* - * For legacy reasons, physical address range in the legacy ISA - * region is tracked as non-RAM. This will allow users of - * /dev/mem to map portions of legacy ISA region, even when - * some of those portions are listed(or not even listed) with - * different e820 types(RAM/reserved/..) - */ - if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && - page_is_ram(page_nr)) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); } - return ram_page; + return (ret > 0) ? -1 : (state.ram ? 1 : 0); } /* @@ -296,8 +206,6 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) * Here we do two pass: * - Find the memtype of all the pages in the range, look for any conflicts * - In case of no conflicts, set the new memtype for pages in the range - * - * Caller must hold memtype_lock for atomicity. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) @@ -317,9 +225,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - printk(KERN_INFO "reserve_ram_pages_type failed " - "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", - start, end, type, req_type); + printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -364,9 +271,8 @@ static int free_ram_pages_type(u64 start, u64 end) int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { - struct memtype *new, *entry; + struct memtype *new; unsigned long actual_type; - struct list_head *where; int is_range_ram; int err = 0; @@ -404,16 +310,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) { - spin_lock(&memtype_lock); err = reserve_ram_pages_type(start, end, req_type, new_type); - spin_unlock(&memtype_lock); return err; } else if (is_range_ram < 0) { return -EINVAL; } - new = kmalloc(sizeof(struct memtype), GFP_KERNEL); + new = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; @@ -423,63 +327,21 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - /* Search for existing mapping that overlaps the current range */ - where = NULL; - list_for_each_entry(entry, &memtype_list, nd) { - if (end <= entry->start) { - where = entry->nd.prev; - break; - } else if (start <= entry->start) { /* end > entry->start */ - err = chk_conflict(new, entry, new_type); - if (!err) { - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = entry->nd.prev; - } - break; - } else if (start < entry->end) { /* start > entry->start */ - err = chk_conflict(new, entry, new_type); - if (!err) { - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - - /* - * Move to right position in the linked - * list to add this new entry - */ - list_for_each_entry_continue(entry, - &memtype_list, nd) { - if (start <= entry->start) { - where = entry->nd.prev; - break; - } - } - } - break; - } - } - + err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " - "track %s, req %s\n", - start, end, cattr_name(new->type), cattr_name(req_type)); + printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); return err; } - if (where) - list_add(&new->nd, where); - else - list_add_tail(&new->nd, &memtype_list); - - memtype_rb_insert(&memtype_rbroot, new); - spin_unlock(&memtype_lock); - dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(new->type), cattr_name(req_type), + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), new_type ? cattr_name(*new_type) : "-"); return err; @@ -487,9 +349,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; + struct memtype *entry; if (!pat_enabled) return 0; @@ -501,9 +363,7 @@ int free_memtype(u64 start, u64 end) is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) { - spin_lock(&memtype_lock); err = free_ram_pages_type(start, end); - spin_unlock(&memtype_lock); return err; } else if (is_range_ram < 0) { @@ -511,56 +371,20 @@ int free_memtype(u64 start, u64 end) } spin_lock(&memtype_lock); - - entry = memtype_rb_search(&memtype_rbroot, start); - if (unlikely(entry == NULL)) - goto unlock_ret; - - /* - * Saved entry points to an entry with start same or less than what - * we searched for. Now go through the list in both directions to look - * for the entry that matches with both start and end, with list stored - * in sorted start address - */ - saved_entry = entry; - list_for_each_entry_from(entry, &memtype_list, nd) { - if (entry->start == start && entry->end == end) { - rb_erase(&entry->rb, &memtype_rbroot); - list_del(&entry->nd); - kfree(entry); - err = 0; - break; - } else if (entry->start > start) { - break; - } - } - - if (!err) - goto unlock_ret; - - entry = saved_entry; - list_for_each_entry_reverse(entry, &memtype_list, nd) { - if (entry->start == start && entry->end == end) { - rb_erase(&entry->rb, &memtype_rbroot); - list_del(&entry->nd); - kfree(entry); - err = 0; - break; - } else if (entry->start < start) { - break; - } - } -unlock_ret: + entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (err) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", - current->comm, current->pid, start, end); + if (!entry) { + printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); + return -EINVAL; } - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + kfree(entry); - return err; + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); + + return 0; } @@ -583,10 +407,8 @@ static unsigned long lookup_memtype(u64 paddr) if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; - spin_lock(&memtype_lock); page = pfn_to_page(paddr >> PAGE_SHIFT); rettype = get_page_memtype(page); - spin_unlock(&memtype_lock); /* * -1 from get_page_memtype() implies RAM page is in its * default state and not reserved, and hence of type WB @@ -599,7 +421,7 @@ static unsigned long lookup_memtype(u64 paddr) spin_lock(&memtype_lock); - entry = memtype_rb_search(&memtype_rbroot, paddr); + entry = rbt_memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else @@ -684,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -739,20 +560,26 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (base >= __pa(high_memory)) + if (base > __pa(high_memory-1)) + return 0; + + /* + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) return 0; - id_sz = (__pa(high_memory) < base + size) ? + id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { - printk(KERN_INFO - "%s:%d ioremap_change_attr failed %s " - "for %Lx-%Lx\n", + printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " + "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(flags), - base, (unsigned long long)(base + size)); + base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; @@ -784,12 +611,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags = lookup_memtype(paddr); if (want_flags != flags) { - printk(KERN_WARNING - "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | @@ -807,11 +633,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for %Lx-%Lx, got %s\n", + " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); return -EINVAL; } @@ -845,20 +671,20 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. */ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long prot; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. @@ -875,31 +701,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) { + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); + /* reserve the whole chunk starting from paddr */ + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; } if (!pat_enabled) return 0; - /* for vm_insert_pfn and friends, we set prot based on lookup */ - flags = lookup_memtype(pfn << PAGE_SHIFT); + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ + flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -907,22 +761,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long prot; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!(vma->vm_flags & VM_PAT)) return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) @@ -936,29 +799,25 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine); #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) -/* get Nth element of the linked list */ static struct memtype *memtype_get_idx(loff_t pos) { - struct memtype *list_node, *print_entry; - int i = 1; + struct memtype *print_entry; + int ret; - print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!print_entry) return NULL; spin_lock(&memtype_lock); - list_for_each_entry(list_node, &memtype_list, nd) { - if (pos == i) { - *print_entry = *list_node; - spin_unlock(&memtype_lock); - return print_entry; - } - ++i; - } + ret = rbt_memtype_copy_nth_element(print_entry, pos); spin_unlock(&memtype_lock); - kfree(print_entry); - return NULL; + if (!ret) { + return print_entry; + } else { + kfree(print_entry); + return NULL; + } } static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) |
