diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 1636 |
1 files changed, 1245 insertions, 391 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bbf953eeb58..254ce2b9015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -9,43 +9,357 @@ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/highmem.h> +#include <linux/mmu_notifier.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/cpuset.h> #include <linux/mutex.h> +#include <linux/bootmem.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/io.h> #include <linux/hugetlb.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; -static unsigned long surplus_huge_pages; -static unsigned long nr_overcommit_huge_pages; -unsigned long max_huge_pages; -unsigned long sysctl_overcommit_huge_pages; -static struct list_head hugepage_freelists[MAX_NUMNODES]; -static unsigned int nr_huge_pages_node[MAX_NUMNODES]; -static unsigned int free_huge_pages_node[MAX_NUMNODES]; -static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_next_nid; + +static int max_hstate; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +__initdata LIST_HEAD(huge_boot_pages); + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; + +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ static DEFINE_SPINLOCK(hugetlb_lock); -static void clear_huge_page(struct page *page, unsigned long addr) +/* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. + * + * The region data structures are protected by a combination of the mmap_sem + * and the hugetlb_instantion_mutex. To access or modify a region the caller + * must either hold the mmap_sem for write, or the mmap_sem for read and + * the hugetlb_instantiation mutex: + * + * down_write(&mm->mmap_sem); + * or + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); + */ +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarantee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + +static long region_count(struct list_head *head, long f, long t) +{ + struct file_region *rg; + long chg = 0; + + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + int seg_from; + int seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + + return chg; +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. + */ +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. + */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. + * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. + */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + +struct resv_map { + struct kref refs; + struct list_head regions; +}; + +struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(&resv_map->regions, 0); + kfree(resv_map); +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); + return 0; +} + +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + + return (get_vma_private_data(vma) & flag) != 0; +} + +/* Decrement the reserved pages in the hugepage pool by one */ +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_NORESERVE) + return; + + if (vma->vm_flags & VM_SHARED) { + /* Shared mappings always use reserves */ + h->resv_huge_pages--; + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + h->resv_huge_pages--; + } +} + +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_reserves(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return 1; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + return 0; +} + +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) { int i; might_sleep(); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } @@ -55,42 +369,44 @@ static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; + struct hstate *h = hstate_vma(vma); might_sleep(); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } -static void enqueue_huge_page(struct page *page) +static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &hugepage_freelists[nid]); - free_huge_pages++; - free_huge_pages_node[nid]++; + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct hstate *h) { int nid; struct page *page = NULL; for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; break; } } return page; } -static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, - unsigned long address) +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address, int avoid_reserve) { int nid; struct page *page = NULL; @@ -101,18 +417,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct zone *zone; struct zoneref *z; + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_reserves(vma) && + h->free_huge_pages - h->resv_huge_pages == 0) + return NULL; + + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; - if (vma && vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; } } @@ -120,12 +451,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(struct hstate *h, struct page *page) { int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); @@ -133,11 +465,27 @@ static void update_and_free_page(struct page *page) set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); +} + +struct hstate *size_to_hstate(unsigned long size) +{ + struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; } static void free_huge_page(struct page *page) { + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = page_hstate(page); int nid = page_to_nid(page); struct address_space *mapping; @@ -147,12 +495,12 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; } else { - enqueue_huge_page(page); + enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); if (mapping) @@ -164,7 +512,7 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ -static int adjust_pool_surplus(int delta) +static int adjust_pool_surplus(struct hstate *h, int delta) { static int prev_nid; int nid = prev_nid; @@ -177,15 +525,15 @@ static int adjust_pool_surplus(int delta) nid = first_node(node_online_map); /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) + if (delta < 0 && !h->surplus_huge_pages_node[nid]) continue; /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) continue; - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; ret = 1; break; } while (nid != prev_nid); @@ -194,59 +542,74 @@ static int adjust_pool_surplus(int delta) return ret; } -static struct page *alloc_fresh_huge_page_node(int nid) +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ +} + +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; + if (h->order >= MAX_ORDER) + return NULL; + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - set_compound_page_dtor(page, free_huge_page); - spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; - spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ + prep_new_huge_page(h, page, nid); } return page; } -static int alloc_fresh_huge_page(void) +/* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. + */ +static int hstate_next_node(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->hugetlb_next_nid = next_nid; + return next_nid; +} + +static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hugetlb_next_nid; + start_nid = h->hugetlb_next_nid; do { - page = alloc_fresh_huge_page_node(hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; - /* - * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. - */ - next_nid = next_node(hugetlb_next_nid, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - hugetlb_next_nid = next_nid; - } while (!page && hugetlb_next_nid != start_nid); + next_nid = hstate_next_node(h); + } while (!page && h->hugetlb_next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -256,12 +619,15 @@ static int alloc_fresh_huge_page(void) return ret; } -static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct page *page; unsigned int nid; + if (h->order >= MAX_ORDER) + return NULL; + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -286,18 +652,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * per-node value is checked there. */ spin_lock(&hugetlb_lock); - if (surplus_huge_pages >= nr_overcommit_huge_pages) { + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { - nr_huge_pages++; - surplus_huge_pages++; + h->nr_huge_pages++; + h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); spin_lock(&hugetlb_lock); if (page) { @@ -312,12 +678,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, /* * We incremented the global counters already */ - nr_huge_pages_node[nid]++; - surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { - nr_huge_pages--; - surplus_huge_pages--; + h->nr_huge_pages--; + h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -329,16 +695,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(int delta) +static int gather_surplus_pages(struct hstate *h, int delta) { struct list_head surplus_list; struct page *page, *tmp; int ret, i; int needed, allocated; - needed = (resv_huge_pages + delta) - free_huge_pages; + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { - resv_huge_pages += delta; + h->resv_huge_pages += delta; return 0; } @@ -349,7 +715,7 @@ static int gather_surplus_pages(int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(NULL, 0); + page = alloc_buddy_huge_page(h, NULL, 0); if (!page) { /* * We were not able to allocate enough pages to @@ -370,7 +736,8 @@ retry: * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock(&hugetlb_lock); - needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); if (needed > 0) goto retry; @@ -383,7 +750,7 @@ retry: * before they are reserved. */ needed += allocated; - resv_huge_pages += delta; + h->resv_huge_pages += delta; ret = 0; free: /* Free the needed pages to the hugetlb pool */ @@ -391,7 +758,7 @@ free: if ((--needed) < 0) break; list_del(&page->lru); - enqueue_huge_page(page); + enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ @@ -419,7 +786,8 @@ free: * allocated to satisfy the reservation must be explicitly freed if they were * never used. */ -static void return_unused_surplus_pages(unsigned long unused_resv_pages) +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) { static int nid = -1; struct page *page; @@ -434,157 +802,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) unsigned long remaining_iterations = num_online_nodes(); /* Uncommit the reservation */ - resv_huge_pages -= unused_resv_pages; + h->resv_huge_pages -= unused_resv_pages; + + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; - nr_pages = min(unused_resv_pages, surplus_huge_pages); + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - if (!surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) continue; - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[nid]--; - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; nr_pages--; remaining_iterations = num_online_nodes(); } } } +/* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase + * reservation and actually increase quota before an allocation can occur. + * Where any new reservation would be required the reservation change is + * prepared, but not committed. Once the page has been quota'd allocated + * an instantiated the change should be committed via vma_commit_reservation. + * No action is required on failure. + */ +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + return region_chg(&inode->i_mapping->private_list, + idx, idx + 1); -static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, - unsigned long addr) + } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + return 1; + + } else { + int err; + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + err = region_chg(&reservations->regions, idx, idx + 1); + if (err < 0) + return err; + return 0; + } +} +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; - spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr); - spin_unlock(&hugetlb_lock); - return page ? page : ERR_PTR(-VM_FAULT_OOM); + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + region_add(&inode->i_mapping->private_list, idx, idx + 1); + + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + /* Mark this page used in the map. */ + region_add(&reservations->regions, idx, idx + 1); + } } -static struct page *alloc_huge_page_private(struct vm_area_struct *vma, - unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) { - struct page *page = NULL; + struct hstate *h = hstate_vma(vma); + struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned int chg; - if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) - return ERR_PTR(-VM_FAULT_SIGBUS); + /* + * Processes that did not create the mapping will have no reserves and + * will not have accounted against quota. Check that the quota can be + * made before satisfying the allocation + * MAP_NORESERVE mappings may also need pages and quota allocated + * if no reserve mapping overlaps. + */ + chg = vma_needs_reservation(h, vma, addr); + if (chg < 0) + return ERR_PTR(chg); + if (chg) + if (hugetlb_get_quota(inode->i_mapping, chg)) + return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); - if (free_huge_pages > resv_huge_pages) - page = dequeue_huge_page_vma(vma, addr); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); + if (!page) { - page = alloc_buddy_huge_page(vma, addr); + page = alloc_buddy_huge_page(h, vma, addr); if (!page) { - hugetlb_put_quota(vma->vm_file->f_mapping, 1); + hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); } } + + set_page_refcounted(page); + set_page_private(page, (unsigned long) mapping); + + vma_commit_reservation(h, vma, addr); + return page; } -static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) +__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) { - struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); - if (vma->vm_flags & VM_MAYSHARE) - page = alloc_huge_page_shared(vma, addr); - else - page = alloc_huge_page_private(vma, addr); + while (nr_nodes) { + void *addr; + + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + huge_page_size(h), huge_page_size(h), 0); - if (!IS_ERR(page)) { - set_page_refcounted(page); - set_page_private(page, (unsigned long) mapping); + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + m = addr; + if (m) + goto found; + } + hstate_next_node(h); + nr_nodes--; } - return page; + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; } -static int __init hugetlb_init(void) +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) { - unsigned long i; - - if (HPAGE_SHIFT == 0) - return 0; - - for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&hugepage_freelists[i]); + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); + } +} - hugetlb_next_nid = first_node(node_online_map); +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +{ + unsigned long i; - for (i = 0; i < max_huge_pages; ++i) { - if (!alloc_fresh_huge_page()) + for (i = 0; i < h->max_huge_pages; ++i) { + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = free_huge_pages = nr_huge_pages = i; - printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); - return 0; + h->max_huge_pages = i; } -module_init(hugetlb_init); -static int __init hugetlb_setup(char *s) +static void __init hugetlb_init_hstates(void) { - if (sscanf(s, "%lu", &max_huge_pages) <= 0) - max_huge_pages = 0; - return 1; + struct hstate *h; + + for_each_hstate(h) { + /* oversize hugepages were init'ed in early boot */ + if (h->order < MAX_ORDER) + hugetlb_hstate_alloc_pages(h); + } } -__setup("hugepages=", hugetlb_setup); -static unsigned int cpuset_mems_nr(unsigned int *array) +static char * __init memfmt(char *buf, unsigned long n) { - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} - return nr; +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + char buf[32]; + printk(KERN_INFO "HugeTLB registered %s page size, " + "pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); + } } -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM -static void try_to_free_low(unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count) { int i; + if (h->order >= MAX_ORDER) + return; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) return; if (PageHighMem(page)) continue; list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; } } } #else -static inline void try_to_free_low(unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count) { } #endif -#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) -static unsigned long set_max_huge_pages(unsigned long count) +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + /* * Increase the pool size * First take pages out of surplus state. Then make up the @@ -597,20 +1077,19 @@ static unsigned long set_max_huge_pages(unsigned long count) * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); - while (surplus_huge_pages && count > persistent_huge_pages) { - if (!adjust_pool_surplus(-1)) + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) break; } |