Diffstat (limited to 'mm/swapfile.c')
| -rw-r--r-- | mm/swapfile.c | 1696 |
1 file changed, 1039 insertions(+), 657 deletions(-)
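The diff below retires the old singly-linked swap_list in favour of two priority lists: swap_active_head for swapoff and swap_avail_head for get_swap_page(), which round-robins among same-priority devices via plist_requeue(). A minimal userspace sketch of that requeue behaviour (a plain-C stand-in for the kernel plist API, illustrative names only):

/*
 * Userspace mimic of the plist round-robin get_swap_page() relies on:
 * nodes stay sorted by ascending key (the kernel stores the *negated*
 * swap priority, so the head is the highest-priority device), and a
 * requeue moves a node behind its equal-key siblings.  This shows the
 * semantics only, not the kernel plist implementation.
 */
#include <stdio.h>

struct node {
	int key;                /* negated swap priority */
	const char *name;
	struct node *next;
};

static void list_add_sorted(struct node **head, struct node *n)
{
	struct node **pp = head;

	while (*pp && (*pp)->key <= n->key)     /* land after equal keys */
		pp = &(*pp)->next;
	n->next = *pp;
	*pp = n;
}

static void list_requeue(struct node **head, struct node *n)
{
	struct node **pp = head;

	while (*pp != n)
		pp = &(*pp)->next;
	*pp = n->next;                          /* unlink */
	list_add_sorted(head, n);               /* tail of its prio run */
}

int main(void)
{
	struct node a = { 0, "swapA", NULL };
	struct node b = { 0, "swapB", NULL };
	struct node c = { 5, "swapC", NULL };
	struct node *head = NULL;

	list_add_sorted(&head, &a);
	list_add_sorted(&head, &b);
	list_add_sorted(&head, &c);

	for (int i = 0; i < 4; i++) {           /* A B A B: round-robin */
		printf("allocate from %s\n", head->name);
		list_requeue(&head, head);
	}
	return 0;
}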
diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c7..4c524f7bd0b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -14,14 +14,13 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> -#include <linux/shm.h> +#include <linux/shmem_fs.h> #include <linux/blkdev.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> @@ -31,6 +30,10 @@ #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> +#include <linux/oom.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> +#include <linux/export.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -42,9 +45,10 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; -long nr_swap_pages; +atomic_long_t nr_swap_pages; +/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; @@ -53,9 +57,28 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. + */ +PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by get_swap_page() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but get_swap_page() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. + */ +static PLIST_HEAD(swap_avail_head); +static DEFINE_SPINLOCK(swap_avail_lock); -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -76,7 +99,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -95,39 +118,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) } /* - * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a mutex. 
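nr_swap_pages becomes an atomic_long_t in the hunk above so that get_swap_page() can reserve a page without taking the global swap_lock, giving the count back if the allocation fails afterwards. A C11 sketch of that reserve/give-back pattern (stand-in types, not the kernel atomics):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long nr_swap_pages;

static int reserve_swap_page(void)
{
	if (atomic_load(&nr_swap_pages) <= 0)
		return 0;                       /* nothing left for sure */
	atomic_fetch_sub(&nr_swap_pages, 1);    /* optimistic reservation */
	return 1;
}

static void unreserve_swap_page(void)
{
	atomic_fetch_add(&nr_swap_pages, 1);    /* scan failed: give back */
}

int main(void)
{
	atomic_store(&nr_swap_pages, 1);
	printf("first reserve:  %d\n", reserve_swap_page());  /* 1 */
	printf("second reserve: %d\n", reserve_swap_page());  /* 0 */
	unreserve_swap_page();          /* pretend scan_swap_map failed */
	return 0;
}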
- */ -static DECLARE_RWSEM(swap_unplug_sem); - -void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) -{ - swp_entry_t entry; - - down_read(&swap_unplug_sem); - entry.val = page_private(page); - if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)]->bdev; - struct backing_dev_info *bdi; - - /* - * If the page is removed from swapcache from under us (with a - * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page_private(page) above. - * If the WARN_ON triggers during a swapoff it maybe the race - * condition and it's harmless. However if it triggers without - * swapoff it signals a problem. - */ - WARN_ON(page_count(page) <= 1); - - bdi = bdev->bd_inode->i_mapping->backing_dev_info; - blk_run_backing_dev(bdi, page); - } - up_read(&swap_unplug_sem); -} - -/* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ @@ -203,23 +193,304 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -static int wait_for_discard(void *word) +#define SWAPFILE_CLUSTER 256 +#define LATENCY_LIMIT 256 + +static inline void cluster_set_flag(struct swap_cluster_info *info, + unsigned int flag) { - schedule(); - return 0; + info->flags = flag; } -#define SWAPFILE_CLUSTER 256 -#define LATENCY_LIMIT 256 +static inline unsigned int cluster_count(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_count(struct swap_cluster_info *info, + unsigned int c) +{ + info->data = c; +} + +static inline void cluster_set_count_flag(struct swap_cluster_info *info, + unsigned int c, unsigned int f) +{ + info->flags = f; + info->data = c; +} + +static inline unsigned int cluster_next(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_next(struct swap_cluster_info *info, + unsigned int n) +{ + info->data = n; +} + +static inline void cluster_set_next_flag(struct swap_cluster_info *info, + unsigned int n, unsigned int f) +{ + info->flags = f; + info->data = n; +} + +static inline bool cluster_is_free(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_FREE; +} -static inline unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static inline bool cluster_is_null(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +/* Add a cluster to discard list and schedule it to do discard */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). It + * will be cleared after discard + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Doing discard actually. 
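The accessor helpers above hide the fact that one swap_cluster_info word is reused for two purposes: a usage count while the cluster is allocated, and a next-cluster index once it sits on the free or discard list. A self-contained sketch of that encoding, assuming the 24/8 bitfield split this series adds to <linux/swap.h>:

#include <stdio.h>

struct swap_cluster_info {
	unsigned int data  : 24;        /* count, or next cluster index */
	unsigned int flags : 8;
};

#define CLUSTER_FLAG_FREE      1     /* cluster is on the free list */
#define CLUSTER_FLAG_NEXT_NULL 2     /* 'data' is not a valid index */

int main(void)
{
	struct swap_cluster_info ci = { 0, 0 };

	ci.data = 255;                  /* in use: 255 allocated slots */
	printf("count=%u\n", (unsigned)ci.data);

	ci.flags = CLUSTER_FLAG_FREE;   /* freed: 'data' becomes a link */
	ci.data = 42;
	printf("free=%d next=%u\n",
	       !!(ci.flags & CLUSTER_FLAG_FREE), (unsigned)ci.data);
	return 0;
}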
After a cluster discard is finished, the cluster + * will be added to free cluster list. caller should hold si->lock. +*/ +static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased. + */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + if (cluster_is_free(&cluster_info[idx])) { + VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); + cluster_set_next_flag(&p->free_cluster_head, + cluster_next(&cluster_info[idx]), 0); + if (cluster_next(&p->free_cluster_tail) == idx) { + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->free_cluster_head); + } + cluster_set_count_flag(&cluster_info[idx], 0, 0); + } + + VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) + 1); +} + +/* + * The cluster corresponding to page_nr decreases one usage. If the usage + * counter becomes 0, which means no page in the cluster is in using, we can + * optionally discard the cluster and add it to free cluster list. + */ +static void dec_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + + VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) - 1); + + if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. 
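The free and discard cluster lists above are singly-linked lists threaded through the cluster_info array by index, with explicit head and tail so clusters are popped from the front and appended at the back. A toy version, using a sentinel index in place of the kernel's CLUSTER_FLAG_NEXT_NULL encoding:

#include <stdio.h>

#define NR_CLUSTERS 8
#define NULL_IDX 0xffffffffu            /* sentinel: no next cluster */

static unsigned int next_of[NR_CLUSTERS];   /* cluster_info[].data */
static unsigned int head = NULL_IDX, tail = NULL_IDX;

static void push_tail(unsigned int idx)
{
	next_of[idx] = NULL_IDX;
	if (head == NULL_IDX) {
		head = tail = idx;              /* first free cluster */
	} else {
		next_of[tail] = idx;            /* link through the array */
		tail = idx;
	}
}

static unsigned int pop_head(void)
{
	unsigned int idx = head;

	if (idx == NULL_IDX)
		return NULL_IDX;
	head = next_of[idx];
	if (head == NULL_IDX)
		tail = NULL_IDX;                /* list is now empty */
	return idx;
}

int main(void)
{
	unsigned int i;

	push_tail(3);
	push_tail(5);
	push_tail(1);
	while ((i = pop_head()) != NULL_IDX)
		printf("got cluster %u\n", i);
	return 0;
}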
+ */ + if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(p, idx); + return; + } + + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } else { + unsigned int tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } + } +} + +/* + * It's possible scan_swap_map() uses a free cluster in the middle of free + * cluster list. Avoiding such abuse to avoid list corruption. + */ +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + unsigned long offset) +{ + struct percpu_cluster *percpu_cluster; + bool conflict; + + offset /= SWAPFILE_CLUSTER; + conflict = !cluster_is_null(&si->free_cluster_head) && + offset != cluster_next(&si->free_cluster_head) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + cluster_set_null(&percpu_cluster->index); + return true; +} + +/* + * Try to get a swap entry from current cpu's swap entry pool (a cluster). This + * might involve allocating a new cluster for current CPU too. + */ +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster_is_null(&cluster->index)) { + if (!cluster_is_null(&si->free_cluster_head)) { + cluster->index = si->free_cluster_head; + cluster->next = cluster_next(&cluster->index) * + SWAPFILE_CLUSTER; + } else if (!cluster_is_null(&si->discard_cluster_head)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + *scan_base = *offset = si->cluster_next; + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * + SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; +} + +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) { unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; - int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -235,36 +506,27 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si, si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { - /* - * Start range check on racing allocations, in case - * they overlap the cluster we eventually decide on - * (we scan without swap_lock to allow 
preemption). - * It's hardly conceivable that cluster_nr could be - * wrapped during our scan, but don't depend on it. - */ - if (si->lowest_alloc) - goto checks; - si->lowest_alloc = si->max; - si->highest_alloc = 0; - } - spin_unlock(&swap_lock); + + spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * But if seek is cheap, search from our current position, so - * that swap is allocated from all over the partition: if the - * Flash Translation Layer only remaps within limited zones, - * we don't want to wear out the first zone too quickly. + * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info + * case, just handled by scan_swap_map_try_ssd_cluster() above. */ - if (!(si->flags & SWP_SOLIDSTATE)) - scan_base = offset = si->lowest_bit; + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -272,32 +534,10 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - - offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster < scan_base; offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&swap_lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -307,12 +547,15 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si, } offset = scan_base; - spin_lock(&swap_lock); + spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; - si->lowest_alloc = 0; } checks: + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -323,9 +566,9 @@ checks: /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - spin_unlock(&swap_lock); + spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); - spin_lock(&swap_lock); + spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; @@ -343,75 +586,26 @@ checks: if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; - if (si->lowest_alloc) { - /* - * Only set when SWP_DISCARDABLE, and there's a scan - * for a free cluster in progress or just completed. 
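On SSDs the allocation path above hands each CPU its own 256-slot cluster and lets it scan only inside that cluster, so concurrent swapouts hit different, mostly contiguous regions. A single-threaded sketch of the idea, with per-CPU state simulated by an array:

#include <stdio.h>

#define SWAPFILE_CLUSTER 256
#define NR_SLOTS  (4 * SWAPFILE_CLUSTER)
#define NR_CPUS   2

static unsigned char swap_map[NR_SLOTS];   /* 0 = free slot */
static unsigned int free_cluster;          /* next never-used cluster */

struct percpu_cluster { int idx; unsigned int next; };
static struct percpu_cluster pcp[NR_CPUS] = { { -1, 0 }, { -1, 0 } };

static long alloc_slot(int cpu)
{
	struct percpu_cluster *c = &pcp[cpu];

	if (c->idx < 0) {                       /* grab a fresh cluster */
		if (free_cluster >= NR_SLOTS / SWAPFILE_CLUSTER)
			return -1;              /* no free cluster left */
		c->idx = free_cluster++;
		c->next = c->idx * SWAPFILE_CLUSTER;
	}
	/* scan only inside this CPU's cluster, like the kernel loop */
	while (c->next < (unsigned int)(c->idx + 1) * SWAPFILE_CLUSTER) {
		unsigned int off = c->next++;

		if (!swap_map[off]) {
			swap_map[off] = 1;
			return off;
		}
	}
	c->idx = -1;                            /* cluster exhausted */
	return alloc_slot(cpu);
}

int main(void)
{
	printf("cpu0 -> %ld, cpu1 -> %ld, cpu0 -> %ld\n",
	       alloc_slot(0), alloc_slot(1), alloc_slot(0));
	return 0;
}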
- */ - if (found_free_cluster) { - /* - * To optimize wear-levelling, discard the - * old data of the cluster, taking care not to - * discard any of its pages that have already - * been allocated by racing tasks (offset has - * already stepped over any at the beginning). - */ - if (offset < si->highest_alloc && - si->lowest_alloc <= last_in_cluster) - last_in_cluster = si->lowest_alloc - 1; - si->flags |= SWP_DISCARDING; - spin_unlock(&swap_lock); - - if (offset < last_in_cluster) - discard_swap_cluster(si, offset, - last_in_cluster - offset + 1); - - spin_lock(&swap_lock); - si->lowest_alloc = 0; - si->flags &= ~SWP_DISCARDING; - - smp_mb(); /* wake_up_bit advises this */ - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); - - } else if (si->flags & SWP_DISCARDING) { - /* - * Delay using pages allocated by racing tasks - * until the whole discard has been issued. We - * could defer that delay until swap_writepage, - * but it's easier to keep this self-contained. - */ - spin_unlock(&swap_lock); - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), - wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&swap_lock); - } else { - /* - * Note pages allocated by racing tasks while - * scan for a free cluster is in progress, so - * that its final discard can exclude them. - */ - if (offset < si->lowest_alloc) - si->lowest_alloc = offset; - if (offset > si->highest_alloc) - si->highest_alloc = offset; - } - } return offset; scan: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -420,21 +614,22 @@ scan: } } offset = si->lowest_bit; - while (++offset < scan_base) { + while (offset < scan_base) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } + offset++; } - spin_lock(&swap_lock); + spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; @@ -443,65 +638,87 @@ no_page: swp_entry_t get_swap_page(void) { - struct swap_info_struct *si; + struct swap_info_struct *si, *next; pgoff_t offset; - int type, next; - int wrapped = 0; - spin_lock(&swap_lock); - if (nr_swap_pages <= 0) + if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; - nr_swap_pages--; - - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - si = swap_info[type]; - next = si->next; - if (next < 0 || - (!wrapped && si->prio != swap_info[next]->prio)) { - next = swap_list.head; - wrapped++; + atomic_long_dec(&nr_swap_pages); + + spin_lock(&swap_avail_lock); + +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + spin_lock(&si->lock); + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, 
&swap_avail_head); + spin_unlock(&si->lock); + goto nextsi; } - if (!si->highest_bit) - continue; - if (!(si->flags & SWP_WRITEOK)) - continue; - - swap_list.next = next; /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); - if (offset) { - spin_unlock(&swap_lock); - return swp_entry(type, offset); - } - next = swap_list.next; + spin_unlock(&si->lock); + if (offset) + return swp_entry(si->type, offset); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. + */ + if (plist_node_empty(&next->avail_list)) + goto start_over; } - nr_swap_pages++; + spin_unlock(&swap_avail_lock); + + atomic_long_inc(&nr_swap_pages); noswap: - spin_unlock(&swap_lock); return (swp_entry_t) {0}; } -/* The only caller of this function is now susupend routine */ +/* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; pgoff_t offset; - spin_lock(&swap_lock); si = swap_info[type]; + spin_lock(&si->lock); if (si && (si->flags & SWP_WRITEOK)) { - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return swp_entry(type, offset); } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); } - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return (swp_entry_t) {0}; } @@ -523,20 +740,20 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - spin_lock(&swap_lock); + spin_lock(&p->lock); return p; bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; } @@ -579,26 +796,37 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { - struct gendisk *disk = p->bdev->bd_disk; + dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; - if (offset > p->highest_bit) + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; p->highest_bit = offset; - if (swap_list.next >= 0 && - p->prio > swap_info[swap_list.next]->prio) - swap_list.next = p->type; - nr_swap_pages++; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + 
spin_unlock(&swap_avail_lock); + } + } + atomic_long_inc(&nr_swap_pages); p->inuse_pages--; - if ((p->flags & SWP_BLKDEV) && - disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, offset); + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } return usage; } /* - * Caller has made sure that the swapdevice corresponding to entry + * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free(swp_entry_t entry) @@ -608,7 +836,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { swap_entry_free(p, entry, 1); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -625,7 +853,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) count = swap_entry_free(p, entry, SWAP_HAS_CACHE); if (page) mem_cgroup_uncharge_swapcache(page, entry, count != 0); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -634,7 +862,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static inline int page_swapcount(struct page *page) +int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; @@ -644,7 +872,7 @@ static inline int page_swapcount(struct page *page) p = swap_info_get(entry); if (p) { count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } return count; } @@ -659,7 +887,7 @@ int reuse_swap_page(struct page *page) { int count; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return 0; count = page_mapcount(page); @@ -679,7 +907,7 @@ int reuse_swap_page(struct page *page) */ int try_to_free_swap(struct page *page) { - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (!PageSwapCache(page)) return 0; @@ -700,10 +928,10 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibernation clears bits from gfp_allowed_mask to prevent - * memory reclaim from writing to disk, so check that here. + * Hibernation suspends storage while it is writing the image + * to disk so check that here. */ - if (!(gfp_allowed_mask & __GFP_IO)) + if (pm_suspended_storage()) return 0; delete_from_swap_cache(page); @@ -726,13 +954,14 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), + entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; } } - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } if (page) { /* @@ -750,37 +979,6 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_count_swap_user - count the user of a swap entry - * @ent: the swap entry to be checked - * @pagep: the pointer for the swap cache page of the entry to be stored - * - * Returns the number of the user of the swap entry. The number is valid only - * for swaps of anonymous pages. 
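The swap_entry_free() path above manipulates swap_map bytes that pack a reference count together with a swap-cache bit. A sketch using the constants from <linux/swap.h> of this era (treat the exact values as an assumption):

#include <stdio.h>

#define SWAP_HAS_CACHE	0x40	/* page lives in the swap cache */
#define SWAP_MAP_MAX	0x3e	/* largest in-place count */
#define SWAP_MAP_BAD	0x3f	/* unusable slot */

static unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* strip the cache bit */
}

int main(void)
{
	unsigned char ent = 2 | SWAP_HAS_CACHE;	/* 2 ptes + swap cache */

	printf("count=%u cached=%d\n",
	       swap_count(ent), !!(ent & SWAP_HAS_CACHE));

	/* one reference dropped; the cache bit is untouched */
	ent = (ent & SWAP_HAS_CACHE) | (unsigned char)(swap_count(ent) - 1);
	printf("count=%u cached=%d\n",
	       swap_count(ent), !!(ent & SWAP_HAS_CACHE));
	return 0;
}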
- * If the entry is found on swap cache, the page is stored to pagep with - * refcount of it being incremented. - */ -int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) -{ - struct page *page; - struct swap_info_struct *p; - int count = 0; - - page = find_get_page(&swapper_space, ent.val); - if (page) - count += page_mapcount(page); - p = swap_info_get(ent); - if (p) { - count += swap_count(p->swap_map[swp_offset(ent)]); - spin_unlock(&swap_lock); - } - - *pagep = page; - return count; -} -#endif - #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). @@ -861,17 +1059,34 @@ unsigned int count_swap_pages(int type, int free) if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; + spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } + spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -880,20 +1095,26 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr = NULL; + struct page *swapcache; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { + swapcache = page; + page = ksm_might_need_to_copy(page, vma, addr); + if (unlikely(!page)) + return -ENOMEM; + + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { - if (ret > 0) - mem_cgroup_cancel_charge_swapin(ptr); + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } @@ -903,8 +1124,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, ptr); + if (page == swapcache) + page_add_anon_rmap(page, vma, addr); + else /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, addr); + mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* * Move the page to the active list so it is not @@ -914,6 +1138,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, out: pte_unmap_unlock(pte, ptl); out_nolock: + if (page != swapcache) { + unlock_page(page); + put_page(page); + } return ret; } @@ -932,7 +1160,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * some architectures (e.g. x86_32 with PAE) we might catch a glimpse * of unmatched parts which look like swp_pte, so unuse_pte must * recheck under pte lock. 
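maybe_same_pte() above exists because, with CONFIG_MEM_SOFT_DIRTY, the pte found in a page table may carry a soft-dirty bit that a pte freshly generated from the swap entry lacks, even though both name the same slot. A toy demonstration with a made-up bit position (the real one is architecture-specific):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long pte_t;
#define PTE_SOFT_DIRTY (1UL << 1)   /* toy bit position (assumption) */

static bool maybe_same_pte(pte_t pte, pte_t swp_pte)
{
	return pte == swp_pte || pte == (swp_pte | PTE_SOFT_DIRTY);
}

int main(void)
{
	pte_t swp_pte = 0x1000;                    /* swp_entry_to_pte() */
	pte_t in_table = swp_pte | PTE_SOFT_DIRTY; /* tracked soft-dirty */

	printf("plain compare:  %d\n", in_table == swp_pte);         /* 0 */
	printf("maybe_same_pte: %d\n", maybe_same_pte(in_table, swp_pte));
	return 0;
}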
Scanning without pte lock lets it be - * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. */ pte = pte_offset_map(pmd, addr); do { @@ -940,7 +1168,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) @@ -964,7 +1192,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); if (ret) @@ -1048,11 +1276,12 @@ static int unuse_mm(struct mm_struct *mm, } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1078,7 +1307,13 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } - count = si->swap_map[i]; + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } + count = ACCESS_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; } @@ -1089,12 +1324,20 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. + * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned char *swap_map; + volatile unsigned char *swap_map; /* swap_map is accessed without + * locking. Mark it as volatile + * to prevent compiler doing + * something odd. + */ unsigned char swcount; struct page *page; swp_entry_t entry; @@ -1123,7 +1366,7 @@ static int try_to_unuse(unsigned int type) * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1145,7 +1388,15 @@ static int try_to_unuse(unsigned int type) * reused since sys_swapoff() already disabled * allocation from here, or alloc_page() failed. */ - if (!*swap_map) + swcount = *swap_map; + /* + * We don't hold lock here, so the swap entry could be + * SWAP_MAP_BAD (when the cluster is discarding). + * Instead of fail out, We can just skip the swap + * entry because swapoff will wait for discarding + * finish anyway. + */ + if (!swcount || swcount == SWAP_MAP_BAD) continue; retval = -ENOMEM; break; @@ -1290,6 +1541,10 @@ static int try_to_unuse(unsigned int type) * interactive performance. 
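find_next_to_unuse() above walks swap_map from the previous position, wraps around to slot 1 (slot 0 holds the header page), skips SWAP_MAP_BAD entries, and returns 0 once a full pass finds nothing in use. A compact userspace rendering of that loop:

#include <stdio.h>

#define SWAP_MAP_BAD 0x3f

static unsigned char swap_map[8] = {
	SWAP_MAP_BAD, 0, 1, 0, SWAP_MAP_BAD, 0, 2, 0
};

static unsigned int find_next_to_unuse(unsigned int max, unsigned int prev)
{
	unsigned int i = prev;
	unsigned char count;

	for (;;) {
		if (++i >= max) {
			if (!prev)
				return 0;       /* full pass, nothing left */
			prev = 0;
			i = 1;                  /* wrap past the header */
		}
		count = swap_map[i];
		if (count && count != SWAP_MAP_BAD)
			return i;               /* still in use */
	}
}

int main(void)
{
	unsigned int i = 0;

	while ((i = find_next_to_unuse(8, i)) != 0) {
		printf("unuse slot %u\n", i);
		swap_map[i] = 0;        /* pretend try_to_unuse freed it */
	}
	return 0;
}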
*/ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } } mmput(start_mm); @@ -1373,6 +1628,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) list_del(&se->list); kfree(se); } + + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + sis->flags &= ~SWP_FILE; + mapping->a_ops->swap_deactivate(swap_file); + } } /* @@ -1381,7 +1644,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) * * This function rather assumes that it is called in ascending page order. */ -static int +int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { @@ -1454,196 +1717,170 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { - struct inode *inode; - unsigned blocks_per_page; - unsigned long page_no; - unsigned blkbits; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; int ret; - inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto out; + return ret; } - blkbits = inode->i_blkbits; - blocks_per_page = PAGE_SIZE >> blkbits; + if (mapping->a_ops->swap_activate) { + ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (!ret) { + sis->flags |= SWP_FILE; + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + } + return ret; + } + + return generic_swapfile_activate(sis, swap_file, span); +} +static void _enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) +{ + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; /* - * Map all the blocks into the extent list. This code doesn't try - * to be very smart. + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low */ - probe_block = 0; - page_no = 0; - last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - - first_block = bmap(inode, probe_block); - if (first_block == 0) - goto bad_bmap; - - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; - } + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; + p->swap_map = swap_map; + p->cluster_info = cluster_info; + p->flags |= SWP_WRITEOK; + atomic_long_add(p->pages, &nr_swap_pages); + total_swap_pages += p->pages; - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { - sector_t block; - - block = bmap(inode, probe_block + block_in_page); - if (block == 0) - goto bad_bmap; - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; - } - } + assert_spin_locked(&swap_lock); + /* + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. 
+ * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. + */ + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); +} - first_block >>= (PAGE_SHIFT - blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info, + unsigned long *frontswap_map) +{ + frontswap_init(p->type, frontswap_map); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +} - /* - * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks - */ - ret = add_swap_extent(sis, page_no, 1, first_block); - if (ret < 0) - goto out; - nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; - } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; -out: - return ret; -bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); - ret = -EINVAL; - goto out; +static void reinsert_swap_info(struct swap_info_struct *p) +{ + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + struct swap_cluster_info *cluster_info; + unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char *pathname; - int i, type, prev; - int err; + struct filename *pathname; + int err, found = 0; + unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + BUG_ON(!current->mm); + pathname = getname(specialfile); - err = PTR_ERR(pathname); if (IS_ERR(pathname)) - goto out; + return PTR_ERR(pathname); - victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); - putname(pathname); + victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; mapping = victim->f_mapping; - prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { - p = swap_info[type]; + plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { - if (p->swap_file->f_mapping == mapping) + if (p->swap_file->f_mapping == mapping) { + found = 1; break; + } } - prev = type; } - if (type < 0) { + if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; } - if (!security_vm_enough_memory(p->pages)) + if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) - swap_list.head = p->next; - else - swap_info[prev]->next = p->next; - if (type == swap_list.next) { - /* just pick something that's safe... 
*/ - swap_list.next = swap_list.head; - } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + spin_lock(&p->lock); if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i]->next) - swap_info[i]->prio = p->prio--; + struct swap_info_struct *si = p; + + plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; + si->list.prio--; + si->avail_list.prio--; + } least_priority++; } - nr_swap_pages -= p->pages; + plist_del(&p->list, &swap_active_head); + atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; + spin_unlock(&p->lock); spin_unlock(&swap_lock); - current->flags |= PF_OOM_ORIGIN; - err = try_to_unuse(type); - current->flags &= ~PF_OOM_ORIGIN; + set_current_oom_origin(); + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ - spin_lock(&swap_lock); - if (p->prio < 0) - p->prio = --least_priority; - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - p->flags |= SWP_WRITEOK; - spin_unlock(&swap_lock); + reinsert_swap_info(p); goto out_dput; } - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); + flush_work(&p->discard_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -1651,39 +1888,62 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_lock(&swapon_mutex); spin_lock(&swap_lock); + spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { + spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); + spin_lock(&p->lock); } swap_file = p->swap_file; + old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; - p->flags = 0; + cluster_info = p->cluster_info; + p->cluster_info = NULL; + frontswap_map = frontswap_map_get(p); + spin_unlock(&p->lock); spin_unlock(&swap_lock); + frontswap_invalidate_area(p->type); + frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); - /* Destroy swap account informatin */ - swap_cgroup_swapoff(type); + vfree(cluster_info); + vfree(frontswap_map); + /* Destroy swap account information */ + swap_cgroup_swapoff(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); - set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + set_blocksize(bdev, old_block_size); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); + + /* + * Clear the SWP_USED flag after all resources are freed so that swapon + * can reuse this swap_info in alloc_swap_info() safely. It is ok to + * not hold p->lock after we cleared its SWP_WRITEOK. 
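The swapoff() fixup above keeps auto-assigned priorities dense: they are negative, so removing one increments the priority of every lower-priority area, while the plist keys (the negated priority) move the other way. In miniature:

#include <stdio.h>

struct si { int prio; int plist_prio; };

int main(void)
{
	struct si s[3] = { { -1, 1 }, { -2, 2 }, { -3, 3 } };
	int removed = 0;                /* drop s[0], priority -1 */

	for (int i = removed + 1; i < 3; i++) {
		s[i].prio++;            /* -2 -> -1, -3 -> -2 */
		s[i].plist_prio--;      /* keep key == -prio */
	}
	for (int i = removed + 1; i < 3; i++)
		printf("prio=%d plist_key=%d\n", s[i].prio, s[i].plist_prio);
	return 0;
}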
+ */ + spin_lock(&swap_lock); + p->flags = 0; + spin_unlock(&swap_lock); + err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); @@ -1691,23 +1951,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) out_dput: filp_close(victim, NULL); out: + putname(pathname); return err; } #ifdef CONFIG_PROC_FS -struct proc_swaps { - struct seq_file seq; - int event; -}; - static unsigned swaps_poll(struct file *file, poll_table *wait) { - struct proc_swaps *s = file->private_data; + struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); - if (s->event != atomic_read(&proc_poll_event)) { - s->event = atomic_read(&proc_poll_event); + if (seq->poll_event != atomic_read(&proc_poll_event)) { + seq->poll_event = atomic_read(&proc_poll_event); return POLLIN | POLLRDNORM | POLLERR | POLLPRI; } @@ -1780,7 +2036,7 @@ static int swap_show(struct seq_file *swap, void *v) len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", si->pages << (PAGE_SHIFT - 10), si->inuse_pages << (PAGE_SHIFT - 10), @@ -1797,24 +2053,16 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - struct proc_swaps *s; + struct seq_file *seq; int ret; - s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); - if (!s) - return -ENOMEM; - - file->private_data = s; - ret = seq_open(file, &swaps_op); - if (ret) { - kfree(s); + if (ret) return ret; - } - s->seq.private = s; - s->event = atomic_read(&proc_poll_event); - return ret; + seq = file->private_data; + seq->poll_event = atomic_read(&proc_poll_event); + return 0; } static const struct file_operations proc_swaps_operations = { @@ -1842,49 +2090,24 @@ static int __init max_swapfiles_check(void) late_initcall(max_swapfiles_check); #endif -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
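The /proc/swaps poll conversion above drops the private proc_swaps wrapper in favour of the seq_file poll_event field: a global counter bumps on every swapon/swapoff, each open file remembers the last value it saw, and poll flags an event when they differ. The pattern, reduced to plain C:

#include <stdio.h>

static int proc_poll_event;          /* atomic_t in the kernel */

struct open_file { int poll_event; };

static int swaps_poll(struct open_file *f)
{
	if (f->poll_event != proc_poll_event) {
		f->poll_event = proc_poll_event;  /* consume the event */
		return 1;       /* kernel adds POLLERR|POLLPRI here */
	}
	return 0;               /* just the usual readable state */
}

int main(void)
{
	struct open_file f = { .poll_event = proc_poll_event };

	printf("%d\n", swaps_poll(&f)); /* 0: nothing changed */
	proc_poll_event++;              /* a swapon/swapoff happened */
	printf("%d\n", swaps_poll(&f)); /* 1: report the change once */
	printf("%d\n", swaps_poll(&f)); /* 0 again */
	return 0;
}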
- * - * The swapon system call - */ -SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; - char *name = NULL; - struct block_device *bdev = NULL; - struct file *swap_file = NULL; - struct address_space *mapping; unsigned int type; - int i, prev; - int error; - union swap_header *swap_header; - unsigned int nr_good_pages; - int nr_extents = 0; - sector_t span; - unsigned long maxpages; - unsigned long swapfilepages; - unsigned char *swap_map = NULL; - struct page *page = NULL; - struct inode *inode = NULL; - int did_down = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } - error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); kfree(p); - goto out; + return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; @@ -1905,84 +2128,56 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; - p->next = -1; spin_unlock(&swap_lock); + spin_lock_init(&p->lock); - name = getname(specialfile); - error = PTR_ERR(name); - if (IS_ERR(name)) { - name = NULL; - goto bad_swap_2; - } - swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); - error = PTR_ERR(swap_file); - if (IS_ERR(swap_file)) { - swap_file = NULL; - goto bad_swap_2; - } - - p->swap_file = swap_file; - mapping = swap_file->f_mapping; - inode = mapping->host; - - error = -EBUSY; - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; + return p; +} - if (i == type || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) - goto bad_swap; - } +static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +{ + int error; - error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + p->bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(p->bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { - bdev = NULL; - error = -EINVAL; - goto bad_swap; + p->bdev = NULL; + return -EINVAL; } - p->old_block_size = block_size(bdev); - error = set_blocksize(bdev, PAGE_SIZE); + p->old_block_size = block_size(p->bdev); + error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) - goto bad_swap; - p->bdev = bdev; + return error; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); - did_down = 1; - if (IS_SWAPFILE(inode)) { - error = -EBUSY; - goto bad_swap; - } - } else { - goto bad_swap; - } + if (IS_SWAPFILE(inode)) + return -EBUSY; + } else + return -EINVAL; - swapfilepages = i_size_read(inode) >> PAGE_SHIFT; + return 0; +} - /* - * Read the swap header. 
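read_swap_header(), split out just below, validates the first page of the swap area: the "SWAPSPACE2" magic occupies the last 10 bytes of the page, and version/last_page/nr_badpages sit in the info header. A userspace sketch of those checks, assuming the classic union swap_header layout and a 4096-byte page:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

union swap_header {
	struct {
		char reserved[PAGE_SIZE - 10];
		char magic[10];          /* "SWAPSPACE2" */
	} magic;
	struct {
		char     bootbits[1024]; /* bootloader/disklabel space */
		uint32_t version;
		uint32_t last_page;
		uint32_t nr_badpages;
		unsigned char sws_uuid[16];
		unsigned char sws_volume[16];
	} info;
};

static int check_swap_header(const union swap_header *h)
{
	if (memcmp(h->magic.magic, "SWAPSPACE2", 10))
		return -1;                       /* no signature */
	if (h->info.version != 1)
		return -1;                       /* unknown sub-version */
	printf("swap of %u pages, %u bad\n",
	       h->info.last_page + 1, h->info.nr_badpages);
	return 0;
}

int main(void)
{
	static union swap_header h;              /* zero-filled */

	memcpy(h.magic.magic, "SWAPSPACE2", 10);
	h.info.version = 1;
	h.info.last_page = 262143;               /* 1 GiB of 4 KiB pages */
	return check_swap_header(&h) ? 1 : 0;
}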
- */ - if (!mapping->a_ops->readpage) { - error = -EINVAL; - goto bad_swap; - } - page = read_mapping_page(mapping, 0, swap_file); - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto bad_swap; - } - swap_header = kmap(page); +static unsigned long read_swap_header(struct swap_info_struct *p, + union swap_header *swap_header, + struct inode *inode) +{ + int i; + unsigned long maxpages; + unsigned long swapfilepages; + unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { - printk(KERN_ERR "Unable to find swap-space signature\n"); - error = -EINVAL; - goto bad_swap; + pr_err("Unable to find swap-space signature\n"); + return 0; } /* swap partition endianess hack... */ @@ -1995,11 +2190,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; + pr_warn("Unable to handle swap header version %d\n", + swap_header->info.version); + return 0; } p->lowest_bit = 1; @@ -2008,13 +2201,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 + * device. There are two limiting factors: 1) the number + * of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte as defined by the + * different architectures. In order to find the + * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap + * decoded to a swp_entry_t again, and finally the swap * offset is extracted. 
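The comment above describes probing the maximum usable offset by encoding swp_entry(0, ~0UL) into a swap pte and decoding it back; whatever bits survive the round trip bound the swap size. A toy version, with an invented 24-bit offset field standing in for the architecture-defined pte format:

#include <stdio.h>

#define OFFSET_BITS 24  /* assumption for the toy pte format */

static unsigned long swp_entry_to_pte(unsigned long offset)
{
	return offset & ((1UL << OFFSET_BITS) - 1); /* bits that fit */
}

static unsigned long pte_to_swp_entry(unsigned long pte)
{
	return pte;
}

int main(void)
{
	unsigned long maxpages =
		pte_to_swp_entry(swp_entry_to_pte(~0UL)) + 1;

	printf("max swap pages: %lu\n", maxpages);  /* 2^24 */
	return 0;
}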
This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a @@ -2022,132 +2215,341 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) */ maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; - if (maxpages > swap_header->info.last_page) { - maxpages = swap_header->info.last_page + 1; + last_page = swap_header->info.last_page; + if (last_page > maxpages) { + pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", + maxpages << (PAGE_SHIFT - 10), + last_page << (PAGE_SHIFT - 10)); + } + if (maxpages > last_page) { + maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } p->highest_bit = maxpages - 1; - error = -EINVAL; if (!maxpages) - goto bad_swap; + return 0; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - goto bad_swap; + pr_warn("Swap area shorter than signature indicates\n"); + return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; + return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + return 0; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + return maxpages; +} + +static int setup_swap_map_and_extents(struct swap_info_struct *p, + union swap_header *swap_header, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info, + unsigned long maxpages, + sector_t *span) +{ + int i; + unsigned int nr_good_pages; + int nr_extents; + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; - memset(swap_map, 0, maxpages); nr_good_pages = maxpages - 1; /* omit header page */ + cluster_set_null(&p->free_cluster_head); + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->discard_cluster_head); + cluster_set_null(&p->discard_cluster_tail); + for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) { - error = -EINVAL; - goto bad_swap; - } + if (page_nr == 0 || page_nr > swap_header->info.last_page) + return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; + /* + * Haven't marked the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, page_nr); } } - error = swap_cgroup_swapon(type, maxpages); - if (error) - goto bad_swap; + /* Haven't marked the cluster free yet, no list operation involved */ + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(p, cluster_info, i); if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; + /* + * Not mark the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, 0); p->max = maxpages; p->pages = nr_good_pages; - nr_extents = setup_swap_extents(p, &span); - if (nr_extents < 0) { - error = nr_extents; - goto bad_swap; - } + nr_extents = setup_swap_extents(p, span); + if (nr_extents < 0) + return nr_extents; nr_good_pages = p->pages; } if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); + pr_warn("Empty swap-file\n"); + return -EINVAL; + } + + if (!cluster_info) + return 
nr_extents; + + for (i = 0; i < nr_clusters; i++) { + if (!cluster_count(&cluster_info[idx])) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, + idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } + } + idx++; + if (idx == nr_clusters) + idx = 0; + } + return nr_extents; +} + +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. + */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +{ + struct swap_info_struct *p; + struct filename *name; + struct file *swap_file = NULL; + struct address_space *mapping; + int i; + int prio; + int error; + union swap_header *swap_header; + int nr_extents; + sector_t span; + unsigned long maxpages; + unsigned char *swap_map = NULL; + struct swap_cluster_info *cluster_info = NULL; + unsigned long *frontswap_map = NULL; + struct page *page = NULL; + struct inode *inode = NULL; + + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = alloc_swap_info(); + if (IS_ERR(p)) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); + + name = getname(specialfile); + if (IS_ERR(name)) { + error = PTR_ERR(name); + name = NULL; + goto bad_swap; + } + swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swap_file)) { + error = PTR_ERR(swap_file); + swap_file = NULL; + goto bad_swap; + } + + p->swap_file = swap_file; + mapping = swap_file->f_mapping; + + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *q = swap_info[i]; + + if (q == p || !q->swap_file) + continue; + if (mapping == q->swap_file->f_mapping) { + error = -EBUSY; + goto bad_swap; + } + } + + inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + error = claim_swapfile(p, inode); + if (unlikely(error)) + goto bad_swap; + + /* + * Read the swap header. 
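Further down in sys_swapon(), the discard setup turns a bare SWAP_FLAG_DISCARD into all three discard flags for backward compatibility, then lets DISCARD_ONCE or DISCARD_PAGES narrow the policy to a single mode. That selection logic in isolation (flag values copied from the uapi headers of this era; treat them as an assumption):

#include <stdio.h>

#define SWAP_FLAG_DISCARD        0x10000
#define SWAP_FLAG_DISCARD_ONCE   0x20000
#define SWAP_FLAG_DISCARD_PAGES  0x40000

#define SWP_DISCARDABLE  (1 << 0)
#define SWP_AREA_DISCARD (1 << 1)   /* one-shot discard at swapon */
#define SWP_PAGE_DISCARD (1 << 2)   /* discard freed page clusters */

static unsigned int discard_policy(unsigned int swap_flags)
{
	unsigned int flags = 0;

	if (swap_flags & SWAP_FLAG_DISCARD) {
		flags = SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD;
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			flags &= ~SWP_AREA_DISCARD;
	}
	return flags;
}

int main(void)
{
	printf("once : %#x\n", discard_policy(SWAP_FLAG_DISCARD |
					      SWAP_FLAG_DISCARD_ONCE));
	printf("pages: %#x\n", discard_policy(SWAP_FLAG_DISCARD |
					      SWAP_FLAG_DISCARD_PAGES));
	return 0;
}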
+
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+	struct swap_info_struct *p;
+	struct filename *name;
+	struct file *swap_file = NULL;
+	struct address_space *mapping;
+	int i;
+	int prio;
+	int error;
+	union swap_header *swap_header;
+	int nr_extents;
+	sector_t span;
+	unsigned long maxpages;
+	unsigned char *swap_map = NULL;
+	struct swap_cluster_info *cluster_info = NULL;
+	unsigned long *frontswap_map = NULL;
+	struct page *page = NULL;
+	struct inode *inode = NULL;
+
+	if (swap_flags & ~SWAP_FLAGS_VALID)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	p = alloc_swap_info();
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	INIT_WORK(&p->discard_work, swap_discard_work);
+
+	name = getname(specialfile);
+	if (IS_ERR(name)) {
+		error = PTR_ERR(name);
+		name = NULL;
+		goto bad_swap;
+	}
+	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(swap_file)) {
+		error = PTR_ERR(swap_file);
+		swap_file = NULL;
+		goto bad_swap;
+	}
+
+	p->swap_file = swap_file;
+	mapping = swap_file->f_mapping;
+
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct swap_info_struct *q = swap_info[i];
+
+		if (q == p || !q->swap_file)
+			continue;
+		if (mapping == q->swap_file->f_mapping) {
+			error = -EBUSY;
+			goto bad_swap;
+		}
+	}
+
+	inode = mapping->host;
+	/* If S_ISREG(inode->i_mode), claim_swapfile() takes inode->i_mutex */
+	error = claim_swapfile(p, inode);
+	if (unlikely(error))
+		goto bad_swap;
+
+	/*
+	 * Read the swap header.
+	 */
+	if (!mapping->a_ops->readpage) {
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	page = read_mapping_page(mapping, 0, swap_file);
+	if (IS_ERR(page)) {
+		error = PTR_ERR(page);
+		goto bad_swap;
+	}
+	swap_header = kmap(page);
+
+	maxpages = read_swap_header(p, swap_header, inode);
+	if (unlikely(!maxpages)) {
 		error = -EINVAL;
 		goto bad_swap;
 	}
-	if (p->bdev) {
-		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
-			p->flags |= SWP_SOLIDSTATE;
-			p->cluster_next = 1 + (random32() % p->highest_bit);
+
+	/* OK, set up the swap map and apply the bad block list */
+	swap_map = vzalloc(maxpages);
+	if (!swap_map) {
+		error = -ENOMEM;
+		goto bad_swap;
+	}
+	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+		p->flags |= SWP_SOLIDSTATE;
+		/*
+		 * select a random position to start with to help SSD
+		 * wear leveling
+		 */
+		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+
+		cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
+			SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+		if (!cluster_info) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+		if (!p->percpu_cluster) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		for_each_possible_cpu(i) {
+			struct percpu_cluster *cluster;
+			cluster = per_cpu_ptr(p->percpu_cluster, i);
+			cluster_set_null(&cluster->index);
+		}
+	}
+
+	error = swap_cgroup_swapon(p->type, maxpages);
+	if (error)
+		goto bad_swap;
+
+	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
+		cluster_info, maxpages, &span);
+	if (unlikely(nr_extents < 0)) {
+		error = nr_extents;
+		goto bad_swap;
+	}
+	/* frontswap enabled? set up bit-per-page map for frontswap */
+	if (frontswap_enabled)
+		frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
+
+	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+		/*
+		 * When discard is enabled for swap with no particular
+		 * policy flagged, we set all swap discard flags here in
+		 * order to sustain backward compatibility with older
+		 * swapon(8) releases.
+		 */
+		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+			     SWP_PAGE_DISCARD);
+
+		/*
+		 * By flagging sys_swapon, a sysadmin can tell us to
+		 * either do single-time area discards only, or to just
+		 * perform discards for released swap page-clusters.
+		 * Now it's time to adjust the p->flags accordingly.
+		 */
+		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+			p->flags &= ~SWP_PAGE_DISCARD;
+		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+			p->flags &= ~SWP_AREA_DISCARD;
+
+		/* issue a swapon-time discard if it's still required */
+		if (p->flags & SWP_AREA_DISCARD) {
+			int err = discard_swap(p);
+			if (unlikely(err))
+				pr_err("swapon: discard_swap(%p): %d\n",
+					p, err);
 		}
-		if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
-			p->flags |= SWP_DISCARDABLE;
 	}
 
 	mutex_lock(&swapon_mutex);
-	spin_lock(&swap_lock);
+	prio = -1;
 	if (swap_flags & SWAP_FLAG_PREFER)
-		p->prio =
+		prio =
 			(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
-	else
-		p->prio = --least_priority;
-	p->swap_map = swap_map;
-	p->flags |= SWP_WRITEOK;
-	nr_swap_pages += nr_good_pages;
-	total_swap_pages += nr_good_pages;
+	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
 
-	printk(KERN_INFO "Adding %uk swap on %s.  "
-			"Priority:%d extents:%d across:%lluk %s%s\n",
-		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
+	pr_info("Adding %uk swap on %s.  "
+		"Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
 		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
 		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
-		(p->flags & SWP_DISCARDABLE) ? "D" : "");
+		(p->flags & SWP_DISCARDABLE) ? "D" : "",
+		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
+		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
+		(frontswap_map) ? "FS" : "");
 
-	/* insert swap space into swap_list: */
-	prev = -1;
-	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
-		if (p->prio >= swap_info[i]->prio)
-			break;
-		prev = i;
-	}
-	p->next = i;
-	if (prev < 0)
-		swap_list.head = swap_list.next = type;
-	else
-		swap_info[prev]->next = type;
-	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	atomic_inc(&proc_poll_event);
 	wake_up_interruptible(&proc_poll_wait);
 
+	if (S_ISREG(inode->i_mode))
+		inode->i_flags |= S_SWAPFILE;
 	error = 0;
 	goto out;
 bad_swap:
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
-	if (bdev) {
-		set_blocksize(bdev, p->old_block_size);
-		bd_release(bdev);
+	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+		set_blocksize(p->bdev, p->old_block_size);
+		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 	}
 	destroy_swap_extents(p);
-	swap_cgroup_swapoff(type);
-bad_swap_2:
+	swap_cgroup_swapoff(p->type);
 	spin_lock(&swap_lock);
 	p->swap_file = NULL;
 	p->flags = 0;
 	spin_unlock(&swap_lock);
 	vfree(swap_map);
-	if (swap_file)
+	vfree(cluster_info);
+	if (swap_file) {
+		if (inode && S_ISREG(inode->i_mode)) {
+			mutex_unlock(&inode->i_mutex);
+			inode = NULL;
+		}
 		filp_close(swap_file, NULL);
+	}
out:
 	if (page && !IS_ERR(page)) {
 		kunmap(page);
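The discard setup in the hunk above reduces to a small precedence rule: both discard styles are enabled by default, SWAP_FLAG_DISCARD_ONCE keeps only the swapon-time area discard, and SWAP_FLAG_DISCARD_PAGES keeps only the per-cluster discard. A compilable model of just that flag logic (constant values invented for the sketch; the real ones live in include/linux/swap.h):

	#include <stdio.h>

	#define SWP_AREA_DISCARD   0x1
	#define SWP_PAGE_DISCARD   0x2
	#define FLAG_DISCARD_ONCE  0x1
	#define FLAG_DISCARD_PAGES 0x2

	static int resolve(int swap_flags)
	{
		int f = SWP_AREA_DISCARD | SWP_PAGE_DISCARD; /* default: both */

		if (swap_flags & FLAG_DISCARD_ONCE)
			f &= ~SWP_PAGE_DISCARD;	/* one-time area discard only */
		else if (swap_flags & FLAG_DISCARD_PAGES)
			f &= ~SWP_AREA_DISCARD;	/* per-cluster discard only */
		return f;
	}

	int main(void)
	{
		printf("default:%d once:%d pages:%d\n", resolve(0),
		       resolve(FLAG_DISCARD_ONCE), resolve(FLAG_DISCARD_PAGES));
		return 0;
	}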
" + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", + (frontswap_map) ? "FS" : ""); - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); + if (S_ISREG(inode->i_mode)) + inode->i_flags |= S_SWAPFILE; error = 0; goto out; bad_swap: - if (bdev) { - set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; + if (inode && S_ISBLK(inode->i_mode) && p->bdev) { + set_blocksize(p->bdev, p->old_block_size); + blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); - swap_cgroup_swapoff(type); -bad_swap_2: + swap_cgroup_swapoff(p->type); spin_lock(&swap_lock); p->swap_file = NULL; p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); - if (swap_file) + vfree(cluster_info); + if (swap_file) { + if (inode && S_ISREG(inode->i_mode)) { + mutex_unlock(&inode->i_mutex); + inode = NULL; + } filp_close(swap_file, NULL); + } out: if (page && !IS_ERR(page)) { kunmap(page); @@ -2155,11 +2557,8 @@ out: } if (name) putname(name); - if (did_down) { - if (!error) - inode->i_flags |= S_SWAPFILE; + if (inode && S_ISREG(inode->i_mode)) mutex_unlock(&inode->i_mutex); - } return error; } @@ -2175,7 +2574,7 @@ void si_swapinfo(struct sysinfo *val) if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += si->inuse_pages; } - val->freeswap = nr_swap_pages + nr_to_be_unused; + val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } @@ -2208,11 +2607,21 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p = swap_info[type]; offset = swp_offset(entry); - spin_lock(&swap_lock); + spin_lock(&p->lock); if (unlikely(offset >= p->max)) goto unlock_out; count = p->swap_map[offset]; + + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; @@ -2243,12 +2652,12 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&swap_lock); + spin_unlock(&p->lock); out: return err; bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } @@ -2290,57 +2699,30 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *page_swap_info(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + BUG_ON(!PageSwapCache(page)); + return swap_info[swp_type(swap)]; +} + /* - * swap_lock prevents swap_map being freed. 
 /*
- * swap_lock prevents swap_map being freed.  Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
+ * out-of-line __page_file_ methods to avoid include hell.
  */
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
+struct address_space *__page_file_mapping(struct page *page)
 {
-	struct swap_info_struct *si;
-	int our_page_cluster = page_cluster;
-	pgoff_t target, toff;
-	pgoff_t base, end;
-	int nr_pages = 0;
-
-	if (!our_page_cluster)	/* no readahead */
-		return 0;
-
-	si = swap_info[swp_type(entry)];
-	target = swp_offset(entry);
-	base = (target >> our_page_cluster) << our_page_cluster;
-	end = base + (1 << our_page_cluster);
-	if (!base)		/* first page is swap header */
-		base++;
-
-	spin_lock(&swap_lock);
-	if (end > si->max)	/* don't go beyond end of map */
-		end = si->max;
-
-	/* Count contiguous allocated slots above our target */
-	for (toff = target; ++toff < end; nr_pages++) {
-		/* Don't read in free or bad pages */
-		if (!si->swap_map[toff])
-			break;
-		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
-			break;
-	}
-	/* Count contiguous allocated slots below our target */
-	for (toff = target; --toff >= base; nr_pages++) {
-		/* Don't read in free or bad pages */
-		if (!si->swap_map[toff])
-			break;
-		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
-			break;
-	}
-	spin_unlock(&swap_lock);
+	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+	return page_swap_info(page)->swap_file->f_mapping;
+}
+EXPORT_SYMBOL_GPL(__page_file_mapping);
 
-	/*
-	 * Indicate starting offset, and return number of pages to get:
-	 * if only 1, say 0, since there's then no readahead to be done.
-	 */
-	*offset = ++toff;
-	return nr_pages? ++nr_pages: 0;
+pgoff_t __page_file_index(struct page *page)
+{
+	swp_entry_t swap = { .val = page_private(page) };
+	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+	return swp_offset(swap);
 }
+EXPORT_SYMBOL_GPL(__page_file_index);
 
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
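__page_file_mapping() and __page_file_index() let swap-over-file I/O treat a swapcache page like an ordinary file page: page_private() holds a swp_entry_t that encodes both the swap area and the offset within it. A sketch of that decode step (the type/offset split below is purely illustrative and assumes a 64-bit unsigned long; the real layout is per-architecture, see swp_type()/swp_offset()):

	#include <stdio.h>

	/* Illustrative encoding: high bits = swap type, low bits = offset. */
	#define SWP_TYPE_SHIFT 58

	static unsigned int swp_type(unsigned long val)
	{
		return val >> SWP_TYPE_SHIFT;
	}

	static unsigned long swp_offset(unsigned long val)
	{
		return val & ((1UL << SWP_TYPE_SHIFT) - 1);
	}

	int main(void)
	{
		/* what page_private(page) might hold for a swapcache page */
		unsigned long page_private = (2UL << SWP_TYPE_SHIFT) | 0x1234;

		printf("swap area %u, page offset %#lx\n",
		       swp_type(page_private), swp_offset(page_private));
		return 0;
	}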
@@ -2395,14 +2777,14 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	if (!page) {
-		spin_unlock(&swap_lock);
+		spin_unlock(&si->lock);
 		return -ENOMEM;
 	}
 
 	/*
 	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
-	 * no architecture is using highmem pages for kernel pagetables: so it
-	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+	 * no architecture is using highmem pages for kernel page tables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
 	 */
 	head = vmalloc_to_page(si->swap_map + offset);
 	offset &= ~PAGE_MASK;
@@ -2428,9 +2810,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		if (!(count & COUNT_CONTINUED))
 			goto out;
 
-		map = kmap_atomic(list_page, KM_USER0) + offset;
+		map = kmap_atomic(list_page) + offset;
 		count = *map;
-		kunmap_atomic(map, KM_USER0);
+		kunmap_atomic(map);
 
 		/*
 		 * If this continuation count now has some space in it,
@@ -2443,7 +2825,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
out:
-	spin_unlock(&swap_lock);
+	spin_unlock(&si->lock);
outer:
 	if (page)
 		__free_page(page);
@@ -2473,7 +2855,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 
 	offset &= ~PAGE_MASK;
 	page = list_entry(head->lru.next, struct page, lru);
-	map = kmap_atomic(page, KM_USER0) + offset;
+	map = kmap_atomic(page) + offset;
 
 	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
 		goto init_map;	/* jump over SWAP_CONT_MAX checks */
@@ -2483,26 +2865,26 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		 * Think of how you add 1 to 999
 		 */
 		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
-			kunmap_atomic(map, KM_USER0);
+			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
 			BUG_ON(page == head);
-			map = kmap_atomic(page, KM_USER0) + offset;
+			map = kmap_atomic(page) + offset;
 		}
 		if (*map == SWAP_CONT_MAX) {
-			kunmap_atomic(map, KM_USER0);
+			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
 			if (page == head)
 				return false;	/* add count continuation */
-			map = kmap_atomic(page, KM_USER0) + offset;
+			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
 		}
 		*map += 1;
-		kunmap_atomic(map, KM_USER0);
+		kunmap_atomic(map);
 		page = list_entry(page->lru.prev, struct page, lru);
 		while (page != head) {
-			map = kmap_atomic(page, KM_USER0) + offset;
+			map = kmap_atomic(page) + offset;
 			*map = COUNT_CONTINUED;
-			kunmap_atomic(map, KM_USER0);
+			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
 		return true;			/* incremented */
@@ -2513,22 +2895,22 @@ init_map:	*map = 0;		/* we didn't zero the page */
 		 */
 		BUG_ON(count != COUNT_CONTINUED);
 		while (*map == COUNT_CONTINUED) {
-			kunmap_atomic(map, KM_USER0);
+			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
 			BUG_ON(page == head);
-			map = kmap_atomic(page, KM_USER0) + offset;
+			map = kmap_atomic(page) + offset;
 		}
 		BUG_ON(*map == 0);
 		*map -= 1;
 		if (*map == 0)
 			count = 0;
-		kunmap_atomic(map, KM_USER0);
+		kunmap_atomic(map);
 		page = list_entry(page->lru.prev, struct page, lru);
 		while (page != head) {
-			map = kmap_atomic(page, KM_USER0) + offset;
+			map = kmap_atomic(page) + offset;
 			*map = SWAP_CONT_MAX | count;
 			count = COUNT_CONTINUED;
-			kunmap_atomic(map, KM_USER0);
+			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
 		return count == COUNT_CONTINUED;
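The continuation chains walked above behave like a multi-digit counter: each continuation page stores a more significant "digit" of the count for a given offset, and incrementing past SWAP_CONT_MAX carries into the next page, exactly the "add 1 to 999" analogy in the comment. A self-contained model of that carry arithmetic (digit width and chain length chosen arbitrarily for the sketch):

	#include <stdio.h>

	#define CONT_MAX 0x7f		/* stand-in for SWAP_CONT_MAX */
	#define NDIGITS  4		/* continuation chain length */

	/* digit[0] is the least significant "page" of the count */
	static int increment(unsigned char digit[NDIGITS])
	{
		int i;

		for (i = 0; i < NDIGITS; i++) {
			if (digit[i] < CONT_MAX) {
				digit[i]++;
				return 1;	/* incremented */
			}
			digit[i] = 0;	/* carry into next continuation page */
		}
		return 0;	/* chain full: would need a new page */
	}

	int main(void)
	{
		unsigned char d[NDIGITS] = {CONT_MAX, CONT_MAX, 0, 0};

		increment(d);	/* like adding 1 to 999 */
		printf("%u %u %u %u\n", d[0], d[1], d[2], d[3]);
		return 0;
	}

Decrement in swap_count_continued() is the mirror image: borrowing ripples back down the chain, rewriting the lower digits to SWAP_CONT_MAX.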
