Diffstat (limited to 'mm/memory_hotplug.c')

 -rw-r--r--  mm/memory_hotplug.c | 224
 1 file changed, 163 insertions(+), 61 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e..469bbf505f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
-#include <linux/bootmem.h>
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <linux/pagevec.h>
@@ -31,6 +30,7 @@
 #include <linux/firmware-map.h>
 #include <linux/stop_machine.h>
 #include <linux/hugetlb.h>
+#include <linux/memblock.h>
 
 #include <asm/tlbflush.h>
 
@@ -46,19 +46,84 @@
 static void generic_online_page(struct page *page);
 
 static online_page_callback_t online_page_callback = generic_online_page;
+static DEFINE_MUTEX(online_page_callback_lock);
 
-DEFINE_MUTEX(mem_hotplug_mutex);
+/* The same as the cpu_hotplug lock, but for memory hotplug. */
+static struct {
+	struct task_struct *active_writer;
+	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/*
+	 * Also blocks the new readers during
+	 * an ongoing mem hotplug operation.
+	 */
+	int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+} mem_hotplug = {
+	.active_writer = NULL,
+	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
+	.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	.dep_map = {.name = "mem_hotplug.lock" },
+#endif
+};
 
-void lock_memory_hotplug(void)
+/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
+#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
+#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
+#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)
+
+void get_online_mems(void)
 {
-	mutex_lock(&mem_hotplug_mutex);
+	might_sleep();
+	if (mem_hotplug.active_writer == current)
+		return;
+	memhp_lock_acquire_read();
+	mutex_lock(&mem_hotplug.lock);
+	mem_hotplug.refcount++;
+	mutex_unlock(&mem_hotplug.lock);
+
 }
 
-void unlock_memory_hotplug(void)
+void put_online_mems(void)
 {
-	mutex_unlock(&mem_hotplug_mutex);
+	if (mem_hotplug.active_writer == current)
+		return;
+	mutex_lock(&mem_hotplug.lock);
+
+	if (WARN_ON(!mem_hotplug.refcount))
+		mem_hotplug.refcount++; /* try to fix things up */
+
+	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
+		wake_up_process(mem_hotplug.active_writer);
+	mutex_unlock(&mem_hotplug.lock);
+	memhp_lock_release();
+
 }
 
+static void mem_hotplug_begin(void)
+{
+	mem_hotplug.active_writer = current;
+
+	memhp_lock_acquire();
+	for (;;) {
+		mutex_lock(&mem_hotplug.lock);
+		if (likely(!mem_hotplug.refcount))
+			break;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&mem_hotplug.lock);
+		schedule();
+	}
+}
+
+static void mem_hotplug_done(void)
+{
+	mem_hotplug.active_writer = NULL;
+	mutex_unlock(&mem_hotplug.lock);
+	memhp_lock_release();
+}
 
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size)
@@ -268,7 +333,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
 }
 
 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic() */
+ * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
 static int __ref ensure_zone_is_initialized(struct zone *zone,
 			unsigned long start_pfn, unsigned long num_pages)
 {
@@ -365,8 +430,7 @@ out_fail:
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 			    unsigned long end_pfn)
 {
-	unsigned long old_pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
 
 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
 		pgdat->node_start_pfn = start_pfn;
@@ -402,13 +466,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 static int __meminit __add_section(int nid, struct zone *zone,
 					unsigned long phys_start_pfn)
 {
-	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
 	if (pfn_valid(phys_start_pfn))
 		return -EEXIST;
 
-	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+	ret = sparse_add_one_section(zone, phys_start_pfn);
 	if (ret < 0)
 		return ret;
 
@@ -579,9 +642,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 static void shrink_pgdat_span(struct pglist_data *pgdat,
 			      unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
-	unsigned long pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+	unsigned long pgdat_end_pfn = p;
 	unsigned long pfn;
 	struct mem_section *ms;
 	int nid = pgdat->node_id;
@@ -729,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback)
 {
 	int rc = -EINVAL;
 
-	lock_memory_hotplug();
+	get_online_mems();
+	mutex_lock(&online_page_callback_lock);
 
 	if (online_page_callback == generic_online_page) {
 		online_page_callback = callback;
 		rc = 0;
 	}
 
-	unlock_memory_hotplug();
+	mutex_unlock(&online_page_callback_lock);
+	put_online_mems();
 
 	return rc;
 }
@@ -746,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback)
 {
 	int rc = -EINVAL;
 
-	lock_memory_hotplug();
+	get_online_mems();
+	mutex_lock(&online_page_callback_lock);
 
 	if (online_page_callback == callback) {
 		online_page_callback = generic_online_page;
 		rc = 0;
 	}
 
-	unlock_memory_hotplug();
+	mutex_unlock(&online_page_callback_lock);
+	put_online_mems();
 
 	return rc;
 }
@@ -901,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	int ret;
 	struct memory_notify arg;
 
-	lock_memory_hotplug();
+	mem_hotplug_begin();
 	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
@@ -909,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
	 */
 	zone = page_zone(pfn_to_page(pfn));
 
+	ret = -EINVAL;
 	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
-	    !can_online_high_movable(zone)) {
-		unlock_memory_hotplug();
-		return -EINVAL;
-	}
+	    !can_online_high_movable(zone))
+		goto out;
 
 	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
-		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
-			unlock_memory_hotplug();
-			return -EINVAL;
-		}
+		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
+			goto out;
 	}
 
 	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
-		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
-			unlock_memory_hotplug();
-			return -EINVAL;
-		}
+		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
+			goto out;
 	}
 
 	/* Previous code may changed the zone of the pfn range */
@@ -935,14 +997,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	arg.nr_pages = nr_pages;
 	node_states_check_changes_online(nr_pages, zone, &arg);
 
-	nid = page_to_nid(pfn_to_page(pfn));
+	nid = pfn_to_nid(pfn);
 
 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
 	ret = notifier_to_errno(ret);
 	if (ret) {
 		memory_notify(MEM_CANCEL_ONLINE, &arg);
-		unlock_memory_hotplug();
-		return ret;
+		goto out;
 	}
 	/*
	 * If this zone is not populated, then it is not in zonelist.
@@ -966,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 		       (((unsigned long long) pfn + nr_pages)
 			    << PAGE_SHIFT) - 1);
 		memory_notify(MEM_CANCEL_ONLINE, &arg);
-		unlock_memory_hotplug();
-		return ret;
+		goto out;
 	}
 
 	zone->present_pages += onlined_pages;
@@ -997,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if (onlined_pages)
 		memory_notify(MEM_ONLINE, &arg);
 
-	unlock_memory_hotplug();
-
-	return 0;
+out:
+	mem_hotplug_done();
+	return ret;
 }
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
@@ -1009,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	struct pglist_data *pgdat;
 	unsigned long zones_size[MAX_NR_ZONES] = {0};
 	unsigned long zholes_size[MAX_NR_ZONES] = {0};
-	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
 
 	pgdat = NODE_DATA(nid);
 	if (!pgdat) {
@@ -1044,17 +1104,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 
-/*
+/**
+ * try_online_node - online a node if offlined
+ *
  * called by cpu_up() to online a node without onlined memory.
  */
-int mem_online_node(int nid)
+int try_online_node(int nid)
 {
 	pg_data_t	*pgdat;
 	int	ret;
 
-	lock_memory_hotplug();
+	if (node_online(nid))
+		return 0;
+
+	mem_hotplug_begin();
 	pgdat = hotadd_new_pgdat(nid, 0);
 	if (!pgdat) {
+		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1062,14 +1128,20 @@ int mem_online_node(int nid)
 	ret = register_one_node(nid);
 	BUG_ON(ret);
 
+	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+		mutex_lock(&zonelists_mutex);
+		build_all_zonelists(NULL, NULL);
+		mutex_unlock(&zonelists_mutex);
+	}
+
 out:
-	unlock_memory_hotplug();
+	mem_hotplug_done();
 	return ret;
 }
 
 static int check_hotplug_memory_range(u64 start, u64 size)
 {
-	u64 start_pfn = start >> PAGE_SHIFT;
+	u64 start_pfn = PFN_DOWN(start);
 	u64 nr_pages = size >> PAGE_SHIFT;
 
 	/* Memory range must be aligned with section */
@@ -1097,17 +1169,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	if (ret)
 		return ret;
 
-	lock_memory_hotplug();
-
 	res = register_memory_resource(start, size);
 	ret = -EEXIST;
 	if (!res)
-		goto out;
+		return ret;
 
 	{	/* Stupid hack to suppress address-never-null warning */
 		void *p = NODE_DATA(nid);
 		new_pgdat = !p;
 	}
+
+	mem_hotplug_begin();
+
 	new_node = !node_online(nid);
 	if (new_node) {
 		pgdat = hotadd_new_pgdat(nid, start);
@@ -1147,7 +1220,7 @@ error:
 	release_memory_resource(res);
 
 out:
-	unlock_memory_hotplug();
+	mem_hotplug_done();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
@@ -1299,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 #ifdef CONFIG_DEBUG_VM
 			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
 			       pfn);
-			dump_page(page);
+			dump_page(page, "failed to remove from LRU");
 #endif
 			put_page(page);
 			/* Because we don't have big zone->lock. we should
@@ -1321,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
@@ -1412,6 +1485,37 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
 }
 #endif /* CONFIG_MOVABLE_NODE */
 
+static int __init cmdline_parse_movable_node(char *p)
+{
+#ifdef CONFIG_MOVABLE_NODE
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	memblock_set_bottom_up(true);
+	movable_node_enabled = true;
+#else
+	pr_warn("movable_node option not supported\n");
+#endif
+	return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
 /* check which state of node_states will be changed when offline memory */
 static void node_states_check_changes_offline(unsigned long nr_pages,
 		struct zone *zone, struct memory_notify *arg)
@@ -1523,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	if (!test_pages_in_a_zone(start_pfn, end_pfn))
 		return -EINVAL;
 
-	lock_memory_hotplug();
+	mem_hotplug_begin();
 
 	zone = page_zone(pfn_to_page(start_pfn));
 	node = zone_to_nid(zone);
@@ -1630,7 +1734,7 @@ repeat:
 	writeback_set_ratelimit();
 
 	memory_notify(MEM_OFFLINE, &arg);
-	unlock_memory_hotplug();
+	mem_hotplug_done();
 	return 0;
 
 failed_removal:
@@ -1642,7 +1746,7 @@ failed_removal:
 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 
 out:
-	unlock_memory_hotplug();
+	mem_hotplug_done();
 	return ret;
 }
 
@@ -1702,7 +1806,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
 	int ret = !is_memblock_offlined(mem);
 
@@ -1846,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
-	lock_memory_hotplug();
+	mem_hotplug_begin();
 
 	/*
	 * All memory blocks must be offlined before removing memory.  Check
@@ -1854,11 +1958,9 @@ void __ref remove_memory(int nid, u64 start, u64 size)
	 * if this is not the case.
	 */
 	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-				is_memblock_offlined_cb);
-	if (ret) {
-		unlock_memory_hotplug();
+				check_memblock_offlined_cb);
+	if (ret)
 		BUG();
-	}
 
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
@@ -1867,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
 
 	try_offline_node(nid);
 
-	unlock_memory_hotplug();
+	mem_hotplug_done();
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
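Note: the core of this diff is the switch from the single mem_hotplug_mutex to a refcounted reader/writer scheme modeled on the cpu_hotplug lock. Readers pin the current hotplug state with get_online_mems()/put_online_mems(), while online_pages(), __offline_pages(), add_memory() and remove_memory() take the writer side via mem_hotplug_begin()/mem_hotplug_done(), which sleeps until the last reader drops its reference. A minimal reader-side sketch follows; walk_valid_pfns() is a hypothetical caller, not part of this diff:

static unsigned long walk_valid_pfns(unsigned long start_pfn,
				     unsigned long nr_pages)
{
	unsigned long pfn, seen = 0;

	get_online_mems();	/* may sleep; the active writer may re-enter */
	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		/* safe: the section cannot be hot-removed while we hold a ref */
		seen += !!page_count(pfn_to_page(pfn));
	}
	put_online_mems();	/* the last reader wakes a waiting writer */

	return seen;
}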
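The online-page callback also stops piggybacking on the global lock: set_online_page_callback() and restore_online_page_callback() now take a reader reference plus a dedicated online_page_callback_lock, so two drivers racing to install a callback are serialized by the new mutex rather than by the heavyweight hotplug lock. A hedged sketch of a driver using this hook (modeled loosely on the Xen balloon driver; balloon_append() is an assumed helper, not a real API):

static void balloon_online_page(struct page *page)
{
	__online_page_set_limits(page);	/* zone/node accounting only */
	balloon_append(page);		/* claim the page instead of freeing it */
}

static int __init balloon_hotplug_init(void)
{
	/* returns -EINVAL if another callback is already installed */
	return set_online_page_callback(&balloon_online_page);
}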
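Finally, mem_online_node() becomes try_online_node(), gaining a node_online() fast path, an error message on pgdat allocation failure, and zonelist construction for a freshly allocated pgdat. Per its own comment the intended caller is cpu_up(); a simplified, hypothetical sketch of such a call site (the real one lives in kernel/cpu.c):

int bring_up_cpu_sketch(unsigned int cpu)
{
	int err;

	/* a cheap no-op returning 0 when the node is already online */
	err = try_online_node(cpu_to_node(cpu));
	if (err)
		return err;

	/* ... the actual CPU bringup continues here ... */
	return 0;
}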
