From f792685006274a850e6cc0ea9ade275ccdfc90bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Mar 2013 18:05:46 +0100 Subject: math64: New div64_u64_rem helper Provide an extended version of div64_u64() that also returns the remainder of the division. We are going to need this to refine the cputime scaling code. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andrew Morton --- lib/div64.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/div64.c b/lib/div64.c index a163b6caef7..3af5728d95f 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,9 +79,10 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64 - unsigned 64bit divide with 64bit divisor + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder * @dividend: 64bit dividend * @divisor: 64bit divisor + * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof * can be found here and in the book 'Hacker's Delight', 2nd Edition. * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64 -u64 div64_u64(u64 dividend, u64 divisor) +#ifndef div64_u64_rem +u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - quot = div_u64(dividend, divisor); + u32 rem32; + quot = div_u64_rem(dividend, divisor, &rem32); + *remainder = rem32; } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - if ((dividend - quot * divisor) >= divisor) + + *remainder = dividend - quot * divisor; + if (*remainder >= divisor) { quot++; + *remainder -= divisor; + } } return quot; } -EXPORT_SYMBOL(div64_u64); +EXPORT_SYMBOL(div64_u64_rem); #endif /** -- cgit v1.2.3-70-g09d2 From cbe5e6109538ddab57764a88d9f0c2accd0c7d48 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 22:17:36 -0600 Subject: lru_cache: introduce lc_get_cumulative() New helper to be able to consolidate more updates into a single transaction. Without this, we can only grab a single refcount on an updated element while preparing a transaction. lc_get_cumulative - like lc_get; also finds to-be-changed elements @lc: the lru cache to operate on @enr: the label to look up Unlike lc_get(), this also returns the element for @enr if it belongs to a pending transaction, so the return values are like for lc_get(), plus: a pointer to an element already on the "to_be_changed" list. In this case, the cache was already marked %LC_DIRTY. The caller needs to make sure that the pending transaction is completed before proceeding to actually use this element. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Fixed up by Jens to export lc_get_cumulative().
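As an illustration of the new call (editor's sketch, not part of the original commit; the element array and count are made up, while lc_try_lock(), lc_committed() and lc_unlock() are the existing lru_cache primitives):

    if (lc_try_lock(lc)) {
        for (i = 0; i < n; i++)
            e[i] = lc_get_cumulative(lc, enr[i]);
        /* e[i] may now reference elements still on "to_be_changed";
         * prepare and commit ONE transaction covering all n updates */
        lc_committed(lc);
        lc_unlock(lc);
    }

With plain lc_get(), only a single to-be-changed element could be referenced while a transaction is being prepared.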
Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 1 + lib/lru_cache.c | 56 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 4019013c659..46262284de4 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc); extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); +extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2cc..4a83ecd0365 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) +/* used as internal flags to __lc_get */ +enum { + LC_GET_MAY_CHANGE = 1, + LC_GET_MAY_USE_UNCOMMITTED = 2, +}; + +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) { struct lc_element *e; @@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool * this enr is currently being pulled in already, * and will be available once the pending transaction * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + if (e) { + if (e->lc_new_number != e->lc_number) { + /* It has been found above, but on the "to_be_changed" + * list, not yet committed. Don't pull it in twice, + * wait for the transaction, then try again... + */ + if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) + RETURN(NULL); + /* ... unless the caller is aware of the implications, + * probably preparing a cumulative transaction. */ + ++e->refcnt; + ++lc->hits; + RETURN(e); + } + /* else: lc_new_number == lc_number; a real hit. */ ++lc->hits; if (e->refcnt++ == 0) lc->used++; list_move(&e->list, &lc->in_use); /* Not evictable... */ RETURN(e); } + /* e == NULL */ ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) + if (!(flags & LC_GET_MAY_CHANGE)) RETURN(NULL); /* To avoid races with lc_try_lock(), first, mark us dirty @@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool */ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 1); + return __lc_get(lc, enr, LC_GET_MAY_CHANGE); +} + +/** + * lc_get_cumulative - like lc_get; also finds to-be-changed elements + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Unlike lc_get this also returns the element for @enr, if it is belonging to + * a pending transaction, so the return values are like for lc_get(), + * plus: + * + * pointer to an element already on the "to_be_changed" list. + * In this case, the cache was already marked %LC_DIRTY. + * + * Caller needs to make sure that the pending transaction is completed, + * before proceeding to actually use this element. 
+ */ +struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); } /** @@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); EXPORT_SYMBOL(lc_is_used); +EXPORT_SYMBOL(lc_get_cumulative); -- cgit v1.2.3-70-g09d2 From 2db76d7c3c6db93058f983c8240f7c7c25e87ee6 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 26 Mar 2013 15:14:18 +0200 Subject: lib/scatterlist: sg_page_iter: support sg lists w/o backing pages The i915 driver uses sg lists for memory without backing 'struct page' pages, similarly to other IO memory regions, setting only the DMA address for these. It does this, so that it can program the HW MMU tables in a uniform way both for sg lists with and without backing pages. Without a valid page pointer we can't call nth_page to get the current page in __sg_page_iter_next, so add a helper that relevant users can call separately. Also add a helper to get the DMA address of the current page (idea from Daniel). Convert all places in i915, to use the new API. Signed-off-by: Imre Deak Reviewed-by: Damien Lespiau Signed-off-by: Daniel Vetter --- drivers/gpu/drm/drm_cache.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 8 ++++---- drivers/gpu/drm/i915/i915_gem_dmabuf.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 6 ++---- drivers/gpu/drm/i915/i915_gem_tiling.c | 4 ++-- include/linux/scatterlist.h | 28 +++++++++++++++++++++++----- lib/scatterlist.c | 4 +--- 8 files changed, 35 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index bc8edbeca3f..bb8f5801218 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -109,7 +109,7 @@ drm_clflush_sg(struct sg_table *st) mb(); for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) - drm_clflush_page(sg_iter.page); + drm_clflush_page(sg_page_iter_page(&sg_iter)); mb(); return; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1d091ea12fa..f69538508d8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1543,7 +1543,7 @@ static inline struct page *i915_gem_object_get_page(struct drm_i915_gem_object * struct sg_page_iter sg_iter; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, n) - return sg_iter.page; + return sg_page_iter_page(&sg_iter); return NULL; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a1123a32dc2..911bd40ef51 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -442,7 +442,7 @@ i915_gem_shmem_pread(struct drm_device *dev, for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); if (remain <= 0) break; @@ -765,7 +765,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); int partial_cacheline_write; if (remain <= 0) @@ -1647,7 +1647,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj) obj->dirty = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); if (obj->dirty) set_page_dirty(page); @@ -1827,7 +1827,7 @@ 
i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) err_pages: sg_mark_end(sg); for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) - page_cache_release(sg_iter.page); + page_cache_release(sg_page_iter_page(&sg_iter)); sg_free_table(st); kfree(st); return PTR_ERR(page); diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index 898615d2d5e..c6dfc1466e3 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -130,7 +130,7 @@ static void *i915_gem_dmabuf_vmap(struct dma_buf *dma_buf) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) - pages[i++] = sg_iter.page; + pages[i++] = sg_page_iter_page(&sg_iter); obj->dma_buf_vmapping = vmap(pages, i, 0, PAGE_KERNEL); drm_free_large(pages); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 4cbae7bbb83..24a23b31b55 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -123,8 +123,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt, for_each_sg_page(pages->sgl, &sg_iter, pages->nents, 0) { dma_addr_t page_addr; - page_addr = sg_dma_address(sg_iter.sg) + - (sg_iter.sg_pgoffset << PAGE_SHIFT); + page_addr = sg_page_iter_dma_address(&sg_iter); pt_vaddr[act_pte] = gen6_pte_encode(ppgtt->dev, page_addr, cache_level); if (++act_pte == I915_PPGTT_PT_ENTRIES) { @@ -424,8 +423,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev, dma_addr_t addr; for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) { - addr = sg_dma_address(sg_iter.sg) + - (sg_iter.sg_pgoffset << PAGE_SHIFT); + addr = sg_page_iter_dma_address(&sg_iter); iowrite32(gen6_pte_encode(dev, addr, level), &gtt_entries[i]); i++; } diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index f799708bcb8..c807eb93755 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -481,7 +481,7 @@ i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); char new_bit_17 = page_to_phys(page) >> 17; if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) { @@ -511,7 +511,7 @@ i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - if (page_to_phys(sg_iter.page) & (1 << 17)) + if (page_to_phys(sg_page_iter_page(&sg_iter)) & (1 << 17)) __set_bit(i, obj->bit_17); else __clear_bit(i, obj->bit_17); diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2d8bdaef961..e96b9546c4c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -235,13 +235,13 @@ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, - * @piter->page points to the current page, @piter->sg to the sg holding this - * page and @piter->sg_pgoffset to the page's page offset within the sg. The - * iteration will stop either when a maximum number of sg entries was reached - * or a terminating sg (sg_last(sg) == true) was reached. + * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter) + * to get the current page and its dma address.
@piter->sg will point to the + * sg holding this page and @piter->sg_pgoffset to the page's page offset + * within the sg. The iteration will stop either when a maximum number of sg + * entries was reached or a terminating sg (sg_last(sg) == true) was reached. */ struct sg_page_iter { - struct page *page; /* current page */ struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ @@ -255,6 +255,24 @@ bool __sg_page_iter_next(struct sg_page_iter *piter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); +/** + * sg_page_iter_page - get the current page held by the page iterator + * @piter: page iterator holding the page + */ +static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) +{ + return nth_page(sg_page(piter->sg), piter->sg_pgoffset); +} + +/** + * sg_page_iter_dma_address - get the dma address of the current page held by + * the page iterator. + * @piter: page iterator holding the page + */ +static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) +{ + return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT); +} /** * for_each_sg_page - iterate over the pages of the given sg list diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b83c144d731..a1cf8cae60e 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -401,7 +401,6 @@ void __sg_page_iter_start(struct sg_page_iter *piter, piter->__pg_advance = 0; piter->__nents = nents; - piter->page = NULL; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } @@ -426,7 +425,6 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) if (!--piter->__nents || !piter->sg) return false; } - piter->page = nth_page(sg_page(piter->sg), piter->sg_pgoffset); return true; } @@ -496,7 +494,7 @@ bool sg_miter_next(struct sg_mapping_iter *miter) miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } - miter->page = miter->piter.page; + miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) -- cgit v1.2.3-70-g09d2 From 0ecc833bac594099505a090cbca6ccd5b83d5975 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Mar 2013 12:23:28 -0400 Subject: mode_t, whack-a-mole at 11... 
Signed-off-by: Al Viro --- drivers/net/wireless/ath/wil6210/debugfs.c | 4 ++-- drivers/staging/dgrp/dgrp_common.h | 2 +- drivers/staging/dgrp/dgrp_specproc.c | 2 +- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 2 +- fs/proc/self.c | 2 +- lib/notifier-error-inject.c | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c index 65fc9683bfd..76c7694518d 100644 --- a/drivers/net/wireless/ath/wil6210/debugfs.c +++ b/drivers/net/wireless/ath/wil6210/debugfs.c @@ -216,7 +216,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_iomem_x32, wil_debugfs_iomem_x32_get, wil_debugfs_iomem_x32_set, "0x%08llx\n"); static struct dentry *wil_debugfs_create_iomem_x32(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void __iomem *value) { @@ -367,7 +367,7 @@ static const struct file_operations fops_ioblob = { static struct dentry *wil_debugfs_create_ioblob(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { diff --git a/drivers/staging/dgrp/dgrp_common.h b/drivers/staging/dgrp/dgrp_common.h index 0583fe9c7b0..fde116236e9 100644 --- a/drivers/staging/dgrp/dgrp_common.h +++ b/drivers/staging/dgrp/dgrp_common.h @@ -120,7 +120,7 @@ enum { struct dgrp_proc_entry { int id; /* Integer identifier */ const char *name; /* ASCII identifier */ - mode_t mode; /* File access permissions */ + umode_t mode; /* File access permissions */ struct dgrp_proc_entry *child; /* Child pointer */ /* file ops to use, pass NULL to use default */ diff --git a/drivers/staging/dgrp/dgrp_specproc.c b/drivers/staging/dgrp/dgrp_specproc.c index 73f287f9660..dddf8a2e396 100644 --- a/drivers/staging/dgrp/dgrp_specproc.c +++ b/drivers/staging/dgrp/dgrp_specproc.c @@ -228,7 +228,7 @@ static void register_proc_table(struct dgrp_proc_entry *table, { struct proc_dir_entry *de; int len; - mode_t mode; + umode_t mode; if (table == NULL) return; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 137af4255da..44abc2f286e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct posix_acl *acl; int error; - mode_t mode = get_inode_mode(inode); + umode_t mode = get_inode_mode(inode); if (!test_opt(sbi, POSIX_ACL)) return 0; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ece..1be948768e2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,7 +60,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/proc/self.c b/fs/proc/self.c index aa5cc3bff14..d8a02529661 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -51,7 +51,7 @@ static const struct inode_operations proc_self_inode_operations = { void __init proc_self_init(void) { struct proc_dir_entry *proc_self_symlink; - mode_t mode; + umode_t mode; mode = S_IFLNK | S_IRWXUGO; proc_self_symlink = proc_create("self", mode, NULL, NULL ); diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c index 44b92cb6224..eb4a04afea8 100644 --- a/lib/notifier-error-inject.c +++ b/lib/notifier-error-inject.c @@ -17,7 +17,7 @@ static int debugfs_errno_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set, "%lld\n"); -static struct dentry 
*debugfs_create_errno(const char *name, mode_t mode, +static struct dentry *debugfs_create_errno(const char *name, umode_t mode, struct dentry *parent, int *value) { return debugfs_create_file(name, mode, parent, value, &fops_errno); } @@ -50,7 +50,7 @@ struct dentry *notifier_err_inject_init(const char *name, struct dentry *parent, struct notifier_err_inject *err_inject, int priority) { struct notifier_err_inject_action *action; - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; struct dentry *actions_dir; -- cgit v1.2.3-70-g09d2 From a49b7e82cab0f9b41f483359be83f44fbb6b4979 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Apr 2013 15:15:30 -0700 Subject: kobject: fix kset_find_obj() race with concurrent last kobject_put() Anatol Pomozov identified a race condition that hits module unloading and re-loading. To quote Anatol: "This is a race condition that exists between kset_find_obj() and kobject_put(). kset_find_obj() might return a kobject with a refcount equal to 0 if that kobject is being freed by kobject_put() in another thread. Here is the timeline for the crash, in the case where kset_find_obj() searches for an object that nobody holds while another thread is doing kobject_put() on the same kobject:

    THREAD A (calls kset_find_obj())     THREAD B (calls kobject_put())
    spin_lock()
                                         atomic_dec_return(kobj->kref), counter gets zero here
                                         ... starts kobject cleanup ....
                                         spin_lock() // WAIT thread A in kobj_kset_leave()
    iterate over kset->list
    atomic_inc(kobj->kref) (counter becomes 1)
    spin_unlock()
                                         spin_lock() // taken
                                         // it does not know that thread A increased the counter,
                                         // so it removes the obj from the list
                                         spin_unlock()
                                         vfree(module) // frees module object with containing kobj

    // kobj points to freed memory area!!
    kobject_put(kobj) // OOPS!!!!

The race above happens because module.c tries to use kset_find_obj() when somebody unloads a module. The module.c code was introduced in commit 6494a93d55fa" Anatol supplied a patch specific to module.c that worked around the problem by simply not using kset_find_obj() at all, but rather than make a local band-aid, this just fixes kset_find_obj() to be thread-safe, using the proper model of refusing to get a new reference if the refcount has already dropped to zero. See examples of this proper refcount handling not only in the kref documentation, but in various other equivalent uses of this pattern by grepping for atomic_inc_not_zero(). [ Side note: the module race does indicate that module loading and unloading is not properly serialized wrt sysfs information using the module mutex. That may require further thought, but this is the correct fix at the kobject layer regardless. ] Reported-analyzed-and-tested-by: Anatol Pomozov Cc: Greg Kroah-Hartman Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/kobject.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index e07ee1fcd6f..a65486613d7 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,6 +529,13 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } +static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +{ + if (!kref_get_unless_zero(&kobj->kref)) + kobj = NULL; + return kobj; +} + /* * kobject_cleanup - free kobject resources.
* @kobj: object to cleanup @@ -751,7 +758,7 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { - ret = kobject_get(k); + ret = kobject_get_unless_zero(k); break; } } -- cgit v1.2.3-70-g09d2 From 0635eb8a54cf0fea64b174bb68bc36b9c3d622db Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 15 Apr 2013 13:09:45 -0700 Subject: Move utf16 functions to kernel core and rename We want to be able to use the utf16 functions that are currently present in the EFI variables code in platform-specific code as well. Move them to the kernel core, and in the process rename them to accurately describe what they do - they don't handle UTF16, only UCS2. Signed-off-by: Matthew Garrett Signed-off-by: Matt Fleming --- drivers/firmware/Kconfig | 1 + drivers/firmware/efivars.c | 80 ++++++++++----------------------------------- include/linux/ucs2_string.h | 14 ++++++++ lib/Kconfig | 3 ++ lib/Makefile | 2 ++ lib/ucs2_string.c | 51 +++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 62 deletions(-) create mode 100644 include/linux/ucs2_string.h create mode 100644 lib/ucs2_string.c (limited to 'lib') diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 42c759a4d04..3e532002e4d 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -39,6 +39,7 @@ config FIRMWARE_MEMMAP config EFI_VARS tristate "EFI Variable Support via sysfs" depends on EFI + select UCS2_STRING default n help If you say Y here, you are able to get EFI (Extensible Firmware diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index bf15d81d74e..182ce947117 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -172,51 +173,6 @@ static void efivar_update_sysfs_entries(struct work_struct *); static DECLARE_WORK(efivar_work, efivar_update_sysfs_entries); static bool efivar_wq_enabled = true; -/* Return the number of unicode characters in data */ -static unsigned long -utf16_strnlen(efi_char16_t *s, size_t maxlength) -{ - unsigned long length = 0; - - while (*s++ != 0 && length < maxlength) - length++; - return length; -} - -static inline unsigned long -utf16_strlen(efi_char16_t *s) -{ - return utf16_strnlen(s, ~0UL); -} - -/* - * Return the number of bytes is the length of this string - * Note: this is NOT the same as the number of unicode characters - */ -static inline unsigned long -utf16_strsize(efi_char16_t *data, unsigned long maxlength) -{ - return utf16_strnlen(data, maxlength/sizeof(efi_char16_t)) * sizeof(efi_char16_t); -} - -static inline int -utf16_strncmp(const efi_char16_t *a, const efi_char16_t *b, size_t len) -{ - while (1) { - if (len == 0) - return 0; - if (*a < *b) - return -1; - if (*a > *b) - return 1; - if (*a == 0) /* implies *b == 0 */ - return 0; - a++; - b++; - len--; - } -} - static bool validate_device_path(struct efi_variable *var, int match, u8 *buffer, unsigned long len) @@ -268,7 +224,7 @@ validate_load_option(struct efi_variable *var, int match, u8 *buffer, u16 filepathlength; int i, desclength = 0, namelen; - namelen = utf16_strnlen(var->VariableName, sizeof(var->VariableName)); + namelen = ucs2_strnlen(var->VariableName, sizeof(var->VariableName)); /* Either "Boot" or "Driver" followed by four digits of hex */ for (i = match; i < match+4; i++) { @@ -291,7 +247,7 @@ validate_load_option(struct efi_variable *var, int match, u8 *buffer, * There's no 
stored length for the description, so it has to be * found by hand */ - desclength = utf16_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2; + desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2; /* Each boot entry must have a descriptor */ if (!desclength) @@ -581,7 +537,7 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count) spin_lock_irq(&efivars->lock); status = check_var_size_locked(efivars, new_var->Attributes, - new_var->DataSize + utf16_strsize(new_var->VariableName, 1024)); + new_var->DataSize + ucs2_strsize(new_var->VariableName, 1024)); if (status == EFI_SUCCESS || status == EFI_UNSUPPORTED) status = efivars->ops->set_variable(new_var->VariableName, @@ -759,7 +715,7 @@ static ssize_t efivarfs_file_write(struct file *file, * QueryVariableInfo() isn't supported by the firmware. */ - varsize = datasize + utf16_strsize(var->var.VariableName, 1024); + varsize = datasize + ucs2_strsize(var->var.VariableName, 1024); status = check_var_size(efivars, attributes, varsize); if (status != EFI_SUCCESS) { @@ -1211,7 +1167,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) inode = NULL; - len = utf16_strlen(entry->var.VariableName); + len = ucs2_strlen(entry->var.VariableName); /* name, plus '-', plus GUID, plus NUL*/ name = kmalloc(len + 1 + GUID_LEN + 1, GFP_ATOMIC); @@ -1469,8 +1425,8 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, if (efi_guidcmp(entry->var.VendorGuid, vendor)) continue; - if (utf16_strncmp(entry->var.VariableName, efi_name, - utf16_strlen(efi_name))) { + if (ucs2_strncmp(entry->var.VariableName, efi_name, + ucs2_strlen(efi_name))) { /* * Check if an old format, * which doesn't support holding @@ -1482,8 +1438,8 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, for (i = 0; i < DUMP_NAME_LEN; i++) efi_name_old[i] = name_old[i]; - if (utf16_strncmp(entry->var.VariableName, efi_name_old, - utf16_strlen(efi_name_old))) + if (ucs2_strncmp(entry->var.VariableName, efi_name_old, + ucs2_strlen(efi_name_old))) continue; } @@ -1561,8 +1517,8 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, * Does this variable already exist? */ list_for_each_entry_safe(search_efivar, n, &efivars->list, list) { - strsize1 = utf16_strsize(search_efivar->var.VariableName, 1024); - strsize2 = utf16_strsize(new_var->VariableName, 1024); + strsize1 = ucs2_strsize(search_efivar->var.VariableName, 1024); + strsize2 = ucs2_strsize(new_var->VariableName, 1024); if (strsize1 == strsize2 && !memcmp(&(search_efivar->var.VariableName), new_var->VariableName, strsize1) && @@ -1578,7 +1534,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, } status = check_var_size_locked(efivars, new_var->Attributes, - new_var->DataSize + utf16_strsize(new_var->VariableName, 1024)); + new_var->DataSize + ucs2_strsize(new_var->VariableName, 1024)); if (status && status != EFI_UNSUPPORTED) { spin_unlock_irq(&efivars->lock); @@ -1602,7 +1558,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, /* Create the entry in sysfs. Locking is not required here */ status = efivar_create_sysfs_entry(efivars, - utf16_strsize(new_var->VariableName, + ucs2_strsize(new_var->VariableName, 1024), new_var->VariableName, &new_var->VendorGuid); @@ -1632,8 +1588,8 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, * Does this variable already exist? 
*/ list_for_each_entry_safe(search_efivar, n, &efivars->list, list) { - strsize1 = utf16_strsize(search_efivar->var.VariableName, 1024); - strsize2 = utf16_strsize(del_var->VariableName, 1024); + strsize1 = ucs2_strsize(search_efivar->var.VariableName, 1024); + strsize2 = ucs2_strsize(del_var->VariableName, 1024); if (strsize1 == strsize2 && !memcmp(&(search_efivar->var.VariableName), del_var->VariableName, strsize1) && @@ -1679,9 +1635,9 @@ static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor) unsigned long strsize1, strsize2; bool found = false; - strsize1 = utf16_strsize(variable_name, 1024); + strsize1 = ucs2_strsize(variable_name, 1024); list_for_each_entry_safe(entry, n, &efivars->list, list) { - strsize2 = utf16_strsize(entry->var.VariableName, 1024); + strsize2 = ucs2_strsize(entry->var.VariableName, 1024); if (strsize1 == strsize2 && !memcmp(variable_name, &(entry->var.VariableName), strsize2) && diff --git a/include/linux/ucs2_string.h b/include/linux/ucs2_string.h new file mode 100644 index 00000000000..cbb20afdbc0 --- /dev/null +++ b/include/linux/ucs2_string.h @@ -0,0 +1,14 @@ +#ifndef _LINUX_UCS2_STRING_H_ +#define _LINUX_UCS2_STRING_H_ + +#include <linux/types.h> /* for size_t */ +#include <linux/stddef.h> /* for NULL */ + +typedef u16 ucs2_char_t; + +unsigned long ucs2_strnlen(const ucs2_char_t *s, size_t maxlength); +unsigned long ucs2_strlen(const ucs2_char_t *s); +unsigned long ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength); +int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len); + +#endif /* _LINUX_UCS2_STRING_H_ */ diff --git a/lib/Kconfig b/lib/Kconfig index 3958dc4389f..fe01d418b09 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -404,4 +404,7 @@ config OID_REGISTRY help Enable fast lookup object identifier registry.
+config UCS2_STRING + tristate + endmenu diff --git a/lib/Makefile b/lib/Makefile index d7946ff75b2..6e2cc561f76 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -174,3 +174,5 @@ quiet_cmd_build_OID_registry = GEN $@ cmd_build_OID_registry = perl $(srctree)/$(src)/build_OID_registry $< $@ clean-files += oid_registry_data.c + +obj-$(CONFIG_UCS2_STRING) += ucs2_string.o diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c new file mode 100644 index 00000000000..6f500ef2301 --- /dev/null +++ b/lib/ucs2_string.c @@ -0,0 +1,51 @@ +#include <linux/ucs2_string.h> +#include <linux/module.h> + +/* Return the number of unicode characters in data */ +unsigned long +ucs2_strnlen(const ucs2_char_t *s, size_t maxlength) +{ + unsigned long length = 0; + + while (*s++ != 0 && length < maxlength) + length++; + return length; +} +EXPORT_SYMBOL(ucs2_strnlen); + +unsigned long +ucs2_strlen(const ucs2_char_t *s) +{ + return ucs2_strnlen(s, ~0UL); +} +EXPORT_SYMBOL(ucs2_strlen); + +/* + * Return the number of bytes is the length of this string + * Note: this is NOT the same as the number of unicode characters + */ +unsigned long +ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength) +{ + return ucs2_strnlen(data, maxlength/sizeof(ucs2_char_t)) * sizeof(ucs2_char_t); +} +EXPORT_SYMBOL(ucs2_strsize); + +int +ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len) +{ + while (1) { + if (len == 0) + return 0; + if (*a < *b) + return -1; + if (*a > *b) + return 1; + if (*a == 0) /* implies *b == 0 */ + return 0; + a++; + b++; + len--; + } +} +EXPORT_SYMBOL(ucs2_strncmp); -- cgit v1.2.3-70-g09d2 From c729de8fcea37a1c444e81857eace12494c804a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:45 -0700 Subject: x86, kdump: Set crashkernel_low automatically Chao said that kdump does work well on his system on 3.8 without an extra parameter, even though the iommu does not work with kdump. But now he has to append crashkernel_low=Y to the first kernel's command line to make kdump work. We have now modified crashkernel=X to allocate memory beyond 4G (if available) and to not allocate a low range for the crash kernel if the user does not request one with crashkernel_low=Y. This causes a regression if the iommu is not enabled: without an iommu, swiotlb needs to be set up in the first 4G, and there is no low memory available to the second kernel. Set crashkernel_low automatically if the user does not specify it. For systems that do support IOMMU with kdump properly, the user can specify crashkernel_low=0 to save that 72M of low RAM. -v3: add swiotlb_size() according to Konrad. -v4: add comments on what the 8M is for according to hpa; also update more crashkernel_low= references in kernel-parameters.txt. -v5: update changelog according to Vivek. -v6: change the description about the swiotlb reference according to HATAYAMA. Reported-by: WANG Chao Tested-by: WANG Chao Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-2-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 14 +++++++++++--- arch/x86/kernel/setup.c | 21 ++++++++++++++++++--- include/linux/swiotlb.h | 1 + lib/swiotlb.c | 19 +++++++++++++++---- 4 files changed, 45 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc3..cff672da248 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -596,9 +596,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. is selected automatically.
Check Documentation/kdump/kdump.txt for further details. - crashkernel_low=size[KMG] - [KNL, x86] parts under 4G. - crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is @@ -606,6 +603,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. + crashkernel_low=size[KMG] + [KNL, x86_64] range under 4G. When crashkernel= is + passed, kernel allocate physical memory region + above 4G, that cause second kernel crash on system + that require some amount of low memory, e.g. swiotlb + requires at least 64M+32K low memory. Kernel would + try to allocate 72M below 4G automatically. + This one let user to specify own low range under 4G + for second kernel instead. + 0: to disable low allocation. + cs89x0_dma= [HW,NET] Format: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 90d8cc930f5..12349202cae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -521,19 +521,34 @@ static void __init reserve_crashkernel_low(void) unsigned long long low_base = 0, low_size = 0; unsigned long total_low_mem; unsigned long long base; + bool auto_set = false; int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); - if (ret != 0 || low_size <= 0) - return; + if (ret != 0) { + /* + * two parts from lib/swiotlb.c: + * swiotlb size: user specified with swiotlb= or default. + * swiotlb overflow buffer: now is hardcoded to 32k. + * We round it to 8M for other buffers that + * may need to stay low too. + */ + low_size = swiotlb_size_or_default() + (8UL<<20); + auto_set = true; + } else { + /* passed with crashkernel_low=0 ? */ + if (!low_size) + return; + } low_base = memblock_find_in_range(low_size, (1ULL<<32), low_size, alignment); if (!low_base) { - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + if (!auto_set) + pr_info("crashkernel low reservation failed - No suitable area found.\n"); return; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 2de42f9401d..a5ffd32642f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -25,6 +25,7 @@ extern int swiotlb_force; extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); +unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); /* diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bfe02b8fc55..d23762e6652 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -105,9 +105,9 @@ setup_io_tlb_npages(char *str) if (!strcmp(str, "force")) swiotlb_force = 1; - return 1; + return 0; } -__setup("swiotlb=", setup_io_tlb_npages); +early_param("swiotlb", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ unsigned long swiotlb_nr_tbl(void) @@ -115,6 +115,18 @@ unsigned long swiotlb_nr_tbl(void) return io_tlb_nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? 
size : (IO_TLB_DEFAULT_SIZE); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -188,8 +200,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - /* default to 64MB */ - size_t default_size = 64UL<<20; + size_t default_size = IO_TLB_DEFAULT_SIZE; unsigned char *vstart; unsigned long bytes; -- cgit v1.2.3-70-g09d2 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/init.c | 3 +++ arch/ia64/mm/contig.c | 2 ++ arch/ia64/mm/discontig.c | 2 ++ arch/parisc/mm/init.c | 2 ++ arch/unicore32/mm/init.c | 3 +++ include/linux/mm.h | 3 ++- lib/show_mem.c | 3 +++ mm/page_alloc.c | 7 +++++++ 8 files changed, 24 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index ad722f1208a..ad9a9f3f032 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -99,6 +99,9 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_bank (i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 80dab509dfb..67c59ebec89 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -47,6 +47,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); printk(KERN_INFO "Node memory in pages:\n"); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; for_each_online_pgdat(pgdat) { unsigned long present; unsigned long flags; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c2e955ee79a..a57436e5d40 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -623,6 +623,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; printk(KERN_INFO "Node memory in pages:\n"); for_each_online_pgdat(pgdat) { unsigned long present; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3ac462de53a..cf2da13c41e 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -697,6 +697,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; #ifndef CONFIG_DISCONTIGMEM i = max_mapnr; while (i-- > 0) { diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index de186bde897..644482882ba 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -66,6 +66,9 @@ void show_mem(unsigned int filter) printk(KERN_DEFAULT "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + 
for_each_bank(i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/include/linux/mm.h b/include/linux/mm.h index e2091b88d24..f3c7b1f9d1d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -899,7 +899,8 @@ extern void pagefault_out_of_memory(void); * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 4407f8c9b1f..b7c72311ad0 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,6 +18,9 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_online_pgdat(pgdat) { unsigned long i, flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ff1536f01b..da7a2fe7332 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2002,6 +2002,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) debug_guardpage_minorder() > 0) return; + /* + * Walking all memory to count page types is very expensive and should + * be inhibited in non-blockable contexts. + */ + if (!(gfp_mask & __GFP_WAIT)) + filter |= SHOW_MEM_FILTER_PAGE_COUNT; + /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set -- cgit v1.2.3-70-g09d2 From 9375db07adeaeea5f5ea7ca0463a8b371d71ddbb Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 29 Apr 2013 16:17:10 -0700 Subject: genalloc: add devres support, allow to find a managed pool by device This patch adds three exported functions to lib/genalloc.c: devm_gen_pool_create, dev_get_gen_pool, and of_get_named_gen_pool. devm_gen_pool_create is a managed version of gen_pool_create that keeps track of the pool via devres and allows the management code to automatically destroy it after device removal. dev_get_gen_pool retrieves the gen_pool for a given device, if it was created with devm_gen_pool_create, using devres_find. of_get_named_gen_pool retrieves the gen_pool for a given device node and property name, where the property must contain a phandle pointing to a platform device node. The corresponding platform device is then fed into dev_get_gen_pool and the resulting gen_pool is returned. 
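A sketch of the intended usage (editor's illustration; the probe context, the "iram" property name and the order value 5 are hypothetical, the helper signatures are the ones added by this patch):

    /* provider: create a pool in probe(); devres destroys it on removal */
    struct gen_pool *pool = devm_gen_pool_create(&pdev->dev, 5, -1);

    /* consumer: resolve another device's pool through a DT phandle */
    struct gen_pool *iram = of_get_named_gen_pool(dev->of_node, "iram", 0);
    if (!pool || !iram)
        return -ENODEV;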
[akpm@linux-foundation.org: make the of_get_named_gen_pool() stub static, fixing a zillion link errors] [akpm@linux-foundation.org: squish "struct device declared inside parameter list" warning] Signed-off-by: Philipp Zabel Acked-by: Grant Likely Tested-by: Michal Simek Cc: Fabio Estevam Cc: Matt Porter Cc: Dong Aisheng Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Paul Gortmaker Cc: Javier Martin Cc: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 18 +++++++++++ lib/genalloc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index dd7c569aaca..661d374aeb2 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -29,6 +29,10 @@ #ifndef __GENALLOC_H__ #define __GENALLOC_H__ + +struct device; +struct device_node; + /** * Allocation callback function type definition * @map: Pointer to bitmap @@ -105,4 +109,18 @@ extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); +extern struct gen_pool *devm_gen_pool_create(struct device *dev, + int min_alloc_order, int nid); +extern struct gen_pool *dev_get_gen_pool(struct device *dev); + +#ifdef CONFIG_OF +extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index); +#else +static inline struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index) +{ + return NULL; +} +#endif #endif /* __GENALLOC_H__ */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 54920433705..b35cfa9bc3d 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { @@ -480,3 +482,82 @@ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, return start_bit; } EXPORT_SYMBOL(gen_pool_best_fit); + +static void devm_gen_pool_release(struct device *dev, void *res) +{ + gen_pool_destroy(*(struct gen_pool **)res); +} + +/** + * devm_gen_pool_create - managed gen_pool_create + * @dev: device that provides the gen_pool + * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents + * @nid: node id of the node the pool structure should be allocated on, or -1 + * + * Create a new special memory pool that can be used to manage special purpose + * memory not managed by the regular kmalloc/kfree interface. The pool will be + * automatically destroyed by the device management code. + */ +struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, + int nid) +{ + struct gen_pool **ptr, *pool; + + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); + + pool = gen_pool_create(min_alloc_order, nid); + if (pool) { + *ptr = pool; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return pool; +} + +/** + * dev_get_gen_pool - Obtain the gen_pool (if any) for a device + * @dev: device to retrieve the gen_pool from + * @name: Optional name for the gen_pool, usually NULL + * + * Returns the gen_pool for the device if one is present, or NULL. 
+ */ +struct gen_pool *dev_get_gen_pool(struct device *dev) +{ + struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, + NULL); + + if (!p) + return NULL; + return *p; +} +EXPORT_SYMBOL_GPL(dev_get_gen_pool); + +#ifdef CONFIG_OF +/** + * of_get_named_gen_pool - find a pool by phandle property + * @np: device node + * @propname: property name containing phandle(s) + * @index: index into the phandle array + * + * Returns the pool that contains the chunk starting at the physical + * address of the device tree node pointed at by the phandle property, + * or NULL if not found. + */ +struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index) +{ + struct platform_device *pdev; + struct device_node *np_pool; + + np_pool = of_parse_phandle(np, propname, index); + if (!np_pool) + return NULL; + pdev = of_find_device_by_node(np_pool); + if (!pdev) + return NULL; + return dev_get_gen_pool(&pdev->dev); +} +EXPORT_SYMBOL_GPL(of_get_named_gen_pool); +#endif /* CONFIG_OF */ -- cgit v1.2.3-70-g09d2 From 30493cc9dddb68066dcc4878015660fdaa8e0965 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Apr 2013 16:18:09 -0700 Subject: lib/int_sqrt.c: optimize square root algorithm Optimize the current version of the shift-and-subtract (hardware) algorithm, described by John von Neumann[1] and Guy L. Steele. Iterating 1,000,000 times, perf shows for the current version:

 Performance counter stats for './sqrt-curr' (10 runs):

        27.170996 task-clock               #  0.979 CPUs utilized            ( +- 3.19% )
                3 context-switches         #  0.103 K/sec                    ( +- 4.76% )
                0 cpu-migrations           #  0.004 K/sec                    ( +-100.00% )
              104 page-faults              #  0.004 M/sec                    ( +- 0.16% )
       64,921,199 cycles                   #  2.389 GHz                      ( +- 0.03% )
       28,967,789 stalled-cycles-frontend  # 44.62% frontend cycles idle     ( +- 0.18% )
  <not supported> stalled-cycles-backend
      104,502,623 instructions             #  1.61  insns per cycle
                                           #  0.28  stalled cycles per insn  ( +- 0.00% )
       34,088,368 branches                 # 1254.587 M/sec                  ( +- 0.00% )
            4,901 branch-misses            #  0.01% of all branches          ( +- 1.32% )

      0.027763015 seconds time elapsed                                       ( +- 3.22% )

And for the new version:

 Performance counter stats for './sqrt-new' (10 runs):

         0.496869 task-clock               #  0.519 CPUs utilized            ( +- 2.38% )
                0 context-switches         #  0.000 K/sec
                0 cpu-migrations           #  0.403 K/sec                    ( +-100.00% )
              104 page-faults              #  0.209 M/sec                    ( +- 0.15% )
          590,760 cycles                   #  1.189 GHz                      ( +- 2.35% )
          395,053 stalled-cycles-frontend  # 66.87% frontend cycles idle     ( +- 3.67% )
  <not supported> stalled-cycles-backend
          398,963 instructions             #  0.68  insns per cycle
                                           #  0.99  stalled cycles per insn  ( +- 0.39% )
           70,228 branches                 # 141.341 M/sec                   ( +- 0.36% )
            3,364 branch-misses            #  4.79% of all branches          ( +- 5.45% )

      0.000957440 seconds time elapsed                                       ( +- 2.42% )

Furthermore, this saves space in instruction text:

   text    data     bss     dec     hex filename
    111       0       0     111      6f lib/int_sqrt-baseline.o
     89       0       0      89      59 lib/int_sqrt.o

[1] http://en.wikipedia.org/wiki/First_Draft_of_a_Report_on_the_EDVAC Signed-off-by: Davidlohr Bueso Reviewed-by: Jonathan Gonzalez Tested-by: Jonathan Gonzalez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index fc2eeb7cb2e..1ef4cc34497 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -1,3 +1,9 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele.
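+ * + * Editor's illustration (not in the original patch): both the old and the + * new loop compute the integer part of the square root, e.g. + * int_sqrt(35) == 5 and int_sqrt(36) == 6.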
+ */ #include <linux/kernel.h> #include <linux/export.h> @@ -10,23 +10,23 @@ */ unsigned long int_sqrt(unsigned long x) { - unsigned long op, res, one; + unsigned long b, m, y = 0; - op = x; - res = 0; + if (x <= 1) + return x; - one = 1UL << (BITS_PER_LONG - 2); - while (one > op) - one >>= 2; + m = 1UL << (BITS_PER_LONG - 2); + while (m != 0) { + b = y + m; + y >>= 1; - while (one != 0) { - if (op >= res + one) { - op = op - (res + one); - res = res + 2 * one; + if (x >= b) { + x -= b; + y += m; } - res /= 2; - one /= 4; + m >>= 2; } - return res; + + return y; } EXPORT_SYMBOL(int_sqrt); -- cgit v1.2.3-70-g09d2 From 095d141b2e40665289d44a42d387ffac2841ef82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 16:18:10 -0700 Subject: argv_split(): teach it to handle mutable strings argv_split() allocates the argv[count_argc(str)] array and assumes that it will find the same number of arguments later. This is obviously wrong if the string can be changed, say, by sysctl. With this patch argv_split() kstrndup's the whole string and does not split it into separately allocated words; we simply replace the spaces with zeroes and keep the allocated memory in argv[-1] for argv_free(argv). We do not use argv[0] because: - str can be all-spaces or empty. In fact this case is fine, we could kfree() it before return, but: - str can have a space at the start, and we can not rely on kstrndup(skip_spaces(str)) because it can equally race if this string is mutable. Also, simplify count_argc() and kill the no longer used skip_arg(). Signed-off-by: Oleg Nesterov Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/argv_split.c | 87 ++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/lib/argv_split.c b/lib/argv_split.c index 1e9a6cbc368..e927ed0e18a 100644 --- a/lib/argv_split.c +++ b/lib/argv_split.c @@ -8,23 +8,17 @@ #include <linux/slab.h> #include <linux/export.h> -static const char *skip_arg(const char *cp) -{ - while (*cp && !isspace(*cp)) - cp++; - - return cp; -} - static int count_argc(const char *str) { int count = 0; + bool was_space; - while (*str) { - str = skip_spaces(str); - if (*str) { + for (was_space = true; *str; str++) { + if (isspace(*str)) { + was_space = true; + } else if (was_space) { + was_space = false; count++; - str = skip_arg(str); } } @@ -39,10 +33,8 @@ */ void argv_free(char **argv) { - char **p; - for (p = argv; *p; p++) - kfree(*p); - + argv--; + kfree(argv[0]); kfree(argv); } EXPORT_SYMBOL(argv_free); @@ -59,43 +51,44 @@ EXPORT_SYMBOL(argv_free); * considered to be a single argument separator. The returned array * is always NULL-terminated. Returns NULL on memory allocation * failure. + * + * The source string at `str' may be undergoing concurrent alteration via + * userspace sysctl activity (at least). The argv_split() implementation + * attempts to handle this gracefully by taking a local copy to work on.
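+ * + * Example (editor's illustration, not in the original patch): + * argv_split(GFP_KERNEL, "  ls  -l /tmp ", &argc) returns + * { "ls", "-l", "/tmp", NULL } with argc == 3.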
*/ char **argv_split(gfp_t gfp, const char *str, int *argcp) { - int argc = count_argc(str); - char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); - char **argvp; - - if (argv == NULL) - goto out; - - if (argcp) - *argcp = argc; - - argvp = argv; - - while (*str) { - str = skip_spaces(str); - - if (*str) { - const char *p = str; - char *t; - - str = skip_arg(str); + char *argv_str; + bool was_space; + char **argv, **argv_ret; + int argc; + + argv_str = kstrndup(str, KMALLOC_MAX_SIZE - 1, gfp); + if (!argv_str) + return NULL; + + argc = count_argc(argv_str); + argv = kmalloc(sizeof(*argv) * (argc + 2), gfp); + if (!argv) { + kfree(argv_str); + return NULL; + } - t = kstrndup(p, str-p, gfp); - if (t == NULL) - goto fail; - *argvp++ = t; + *argv = argv_str; + argv_ret = ++argv; + for (was_space = true; *argv_str; argv_str++) { + if (isspace(*argv_str)) { + was_space = true; + *argv_str = 0; + } else if (was_space) { + was_space = false; + *argv++ = argv_str; } } - *argvp = NULL; - - out: - return argv; + *argv = NULL; - fail: - argv_free(argv); - return NULL; + if (argcp) + *argcp = argc; + return argv_ret; } EXPORT_SYMBOL(argv_split); -- cgit v1.2.3-70-g09d2 From 2e0fb404c86d6c86dc33e284310eb5d28d192dcf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2013 16:18:11 -0700 Subject: lib, net: make isodigit() public and use it There are at least two users of isodigit(). Let's make it a public function of ctype.h. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ctype.h | 6 ++++++ lib/dynamic_debug.c | 1 - net/sunrpc/cache.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/include/linux/ctype.h b/include/linux/ctype.h index 8acfe312f94..653589e3e30 100644 --- a/include/linux/ctype.h +++ b/include/linux/ctype.h @@ -61,4 +61,10 @@ static inline char _tolower(const char c) return c | 0x20; } +/* Fast check for octal digit */ +static inline int isodigit(const char c) +{ + return c >= '0' && c <= '7'; +} + #endif diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5276b99ca65..46032453abd 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -281,7 +281,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) * allow the user to express a query which matches a format * containing embedded spaces. */ -#define isodigit(c) ((c) >= '0' && (c) <= '7') static char *unescape(char *str) { char *in = str; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 25d58e76601..ce2d180d05a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1208,7 +1208,6 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); * key and content are both parsed by cache */ -#define isodigit(c) (isdigit(c) && c <= '7') int qword_get(char **bpp, char *dest, int bufsize) { /* return bytes copied, or -1 on error */ -- cgit v1.2.3-70-g09d2 From 3e6628c4b347a558965041290c5a92791dd4c741 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 29 Apr 2013 16:21:16 -0700 Subject: idr: introduce idr_alloc_cyclic() As Tejun points out, there are several users of the IDR facility that attempt to use it in a cyclic fashion. These users are likely to see -ENOSPC errors after the counter wraps one or more times however. This patchset adds a new idr_alloc_cyclic routine and converts several of these users to it. Many of these users are in obscure parts of the kernel, and I don't have a good way to test some of them. The change is pretty straightforward though, so hopefully it won't be an issue. 
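To make the pattern concrete (editor's sketch; the surrounding structure and lock-free context are hypothetical, only idr_alloc()/idr_alloc_cyclic() are the real interfaces):

    /* wrap-prone pattern this series replaces: caller tracks the cursor */
    id = idr_alloc(&st->idr, ptr, st->next_id, 0, GFP_NOWAIT);
    if (id >= 0)
        st->next_id = id + 1;   /* hits -ENOSPC once next_id passes the max */

    /* replacement: the idr tracks the cursor and wraps back to 'start' */
    id = idr_alloc_cyclic(&st->idr, ptr, 1, 0, GFP_NOWAIT);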
There is one other cyclic user of idr_alloc that I didn't touch in ipc/util.c. That one is doing some strange stuff that I didn't quite understand, but it looks like it should probably be converted later somehow. This patch: Thus spake Tejun Heo: Ooh, BTW, the cyclic allocation is broken. It's prone to -ENOSPC after the first wraparound. There are several cyclic users in the kernel and I think it probably would be best to implement cyclic support in idr. This patch does that by adding a new idr_alloc_cyclic() function that such users in the kernel can use. With this, there's no need for a caller to keep track of the last value used, as that's now tracked internally. This should prevent the -ENOSPC problems that can hit when the "last allocated" counter exceeds INT_MAX. Later patches will convert existing cyclic users to the new interface. Signed-off-by: Jeff Layton Reviewed-by: Tejun Heo Cc: "David S. Miller" Cc: "J. Bruce Fields" Cc: Eric Paris Cc: Jack Morgenstein Cc: John McCutchan Cc: Neil Horman Cc: Or Gerlitz Cc: Robert Love Cc: Roland Dreier Cc: Sridhar Samudrala Cc: Steve Wise Cc: Tom Tucker Cc: Vlad Yasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 2 ++ lib/idr.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'lib') diff --git a/include/linux/idr.h b/include/linux/idr.h index 2640c7e99e5..a470ac3ef49 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -42,6 +42,7 @@ struct idr { struct idr_layer *id_free; int layers; /* only valid w/o concurrent changes */ int id_free_cnt; + int cur; /* current pos for cyclic allocation */ spinlock_t lock; }; @@ -75,6 +76,7 @@ struct idr { void *idr_find_slowpath(struct idr *idp, int id); void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask); +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *idp, int *nextid); diff --git a/lib/idr.c b/lib/idr.c index 322e2816f2f..cca4b9302a7 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -495,6 +495,33 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(idr_alloc); +/** + * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion + * @idr: the (initialized) idr + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive, <= 0 for max) + * @gfp_mask: memory allocation flags + * + * Essentially the same as idr_alloc, but prefers to allocate progressively + * higher ids if it can. If the "cur" counter wraps, then it will start again + * at the "start" end of the range and allocate one that has already been used. + */ +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int id; + + id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask); + if (id == -ENOSPC) + id = idr_alloc(idr, ptr, start, end, gfp_mask); + + if (likely(id >= 0)) + idr->cur = id + 1; + return id; +} +EXPORT_SYMBOL(idr_alloc_cyclic); + static void idr_remove_warning(int id) { printk(KERN_WARNING -- cgit v1.2.3-70-g09d2 From cedddb00023b1321f2bb2739739aa8c9927fc063 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:25 -0700 Subject: uuid: use prandom_bytes() Use prandom_bytes() to generate the 16 bytes of pseudo-random data.
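The resulting helper is a one-liner plus the variant fixup (this mirrors the hunk below):

	static void __uuid_gen_common(__u8 b[16])
	{
		prandom_bytes(b, 16);
		/* set the UUID variant bits to binary 10 */
		b[8] = (b[8] & 0x3F) | 0x80;
	}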
Signed-off-by: Akinobu Mita Cc: "Theodore Ts'o" Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/uuid.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/uuid.c b/lib/uuid.c index 52a6fe6387d..398821e4dce 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -25,13 +25,7 @@ static void __uuid_gen_common(__u8 b[16]) { - int i; - u32 r; - - for (i = 0; i < 4; i++) { - r = random32(); - memcpy(b + i * 4, &r, 4); - } + prandom_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } -- cgit v1.2.3-70-g09d2 From f39fee5f11f7adfcf5c9643f87718b80450be18a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:28 -0700 Subject: lib/: rename random32() to prandom_u32() Use the preferable function name, which implies the use of a pseudo-random number generator. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/fault-inject.c | 2 +- lib/list_sort.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f7210ad6cff..c5c7a762b85 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -122,7 +122,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) return false; } - if (attr->probability <= random32() % 100) + if (attr->probability <= prandom_u32() % 100) return false; if (!fail_stacktrace(attr)) diff --git a/lib/list_sort.c b/lib/list_sort.c index d7325c6b103..1183fa70a44 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -229,7 +229,7 @@ static int __init list_sort_test(void) goto exit; } /* force some equivalencies */ - el->value = random32() % (TEST_LIST_LEN/3); + el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; el->poison1 = TEST_POISON1; el->poison2 = TEST_POISON2; -- cgit v1.2.3-70-g09d2 From f3002134158092178be81339ec5a22ff80e6c308 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 30 Apr 2013 11:35:07 +0200 Subject: Revert "math64: New div64_u64_rem helper" This reverts commit f792685006274a850e6cc0ea9ade275ccdfc90bc. The cputime scaling code was changed/fixed and does not need the div64_u64_rem() primitive anymore. It has no other users, so let's remove it.
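Out-of-tree or future callers that still need both values can derive the remainder from the quotient in two lines (a sketch, not code from this patch):

	u64 quot = div64_u64(dividend, divisor);
	u64 rem = dividend - quot * divisor;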
Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Cc: rostedt@goodmis.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1367314507-9728-4-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- include/linux/math64.h | 19 +------------------ lib/div64.c | 19 ++++++------------- 2 files changed, 7 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/include/linux/math64.h b/include/linux/math64.h index 931a619407b..b8ba8554472 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -29,15 +29,6 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) return dividend / divisor; } -/** - * div64_u64_rem - unsigned 64bit divide with 64bit divisor - */ -static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) -{ - *remainder = dividend % divisor; - return dividend / divisor; -} - /** * div64_u64 - unsigned 64bit divide with 64bit divisor */ @@ -70,16 +61,8 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif -#ifndef div64_u64_rem -extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); -#endif - #ifndef div64_u64 -static inline u64 div64_u64(u64 dividend, u64 divisor) -{ - u64 remainder; - return div64_u64_rem(dividend, divisor, &remainder); -} +extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 diff --git a/lib/div64.c b/lib/div64.c index 3af5728d95f..a163b6caef7 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,10 +79,9 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder + * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: 64bit dividend * @divisor: 64bit divisor - * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof @@ -90,33 +89,27 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64_rem -u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) +#ifndef div64_u64 +u64 div64_u64(u64 dividend, u64 divisor) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - u32 rem32; - quot = div_u64_rem(dividend, divisor, &rem32); - *remainder = rem32; + quot = div_u64(dividend, divisor); } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - - *remainder = dividend - quot * divisor; - if (*remainder >= divisor) { + if ((dividend - quot * divisor) >= divisor) quot++; - *remainder -= divisor; - } } return quot; } -EXPORT_SYMBOL(div64_u64_rem); +EXPORT_SYMBOL(div64_u64); #endif /** -- cgit v1.2.3-70-g09d2 From b0d33c2bd77bcf2d7c9427d2361ac57fe5b33aa1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Dec 2012 10:18:50 -0800 Subject: vsprintf: Add extension %pSR - print_symbol replacement print_symbol takes a long and converts it to a function name and offset. %pS does something similar, but doesn't translate the address via __builtin_extract_return_addr. %pSR does the translation. This will enable replacing multiple calls like printk(...); printk_symbol(addr); printk("\n"); with a single non-interleavable in dmesg printk("... %pSR\n", (void *)addr); Update documentation too. 
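A before/after sketch of one such call site (illustrative; `addr' is assumed to hold a saved return address):

	/* before: three calls, interleavable in dmesg */
	printk(KERN_ERR "called from ");
	print_symbol("%s", addr);
	printk("\n");

	/* after: a single non-interleavable line */
	printk(KERN_ERR "called from %pSR\n", (void *)addr);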
Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- Documentation/printk-formats.txt | 2 ++ lib/vsprintf.c | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index e8a6aa473ba..e3d0215013d 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -17,6 +17,8 @@ Symbols/Function Pointers: %pF versatile_init+0x0/0x110 %pf versatile_init %pS versatile_init+0x0/0x110 + %pSR versatile_init+0x9/0x110 + (with __builtin_extract_return_addr() translation) %ps versatile_init %pB prev_fn_of_versatile_init+0x88/0x88 diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0d62fd700f6..e149c641638 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -534,14 +534,21 @@ char *string(char *buf, char *end, const char *s, struct printf_spec spec) static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec, char ext) + struct printf_spec spec, const char *fmt) { - unsigned long value = (unsigned long) ptr; + unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - if (ext == 'B') +#endif + + if (fmt[1] == 'R') + ptr = __builtin_extract_return_addr(ptr); + value = (unsigned long)ptr; + +#ifdef CONFIG_KALLSYMS + if (*fmt == 'B') sprint_backtrace(sym, value); - else if (ext != 'f' && ext != 's') + else if (*fmt != 'f' && *fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); @@ -987,6 +994,7 @@ int kptr_restrict __read_mostly; * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers with offset * - 's' For symbolic direct pointers without offset + * - '[FfSs]R' as above with __builtin_extract_return_addr() translation * - 'B' For backtraced symbolic direct pointers with offset * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] @@ -1060,7 +1068,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, case 'S': case 's': case 'B': - return symbol_string(buf, end, ptr, spec, *fmt); + return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_string(buf, end, ptr, spec, fmt); -- cgit v1.2.3-70-g09d2 From 196779b9b4ce1922afabdc20d0270720603bd46c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 30 Apr 2013 15:27:12 -0700 Subject: dump_stack: consolidate dump_stack() implementations and unify their behaviors Both dump_stack() and show_stack() are currently implemented by each architecture. show_stack(NULL, NULL) dumps the backtrace for the current task as does dump_stack(). On some archs, dump_stack() prints extra information - pid, utsname and so on - in addition to the backtrace while the two are identical on other archs. The usages of the two functions in arch-independent code indicate that show_stack(NULL, NULL) should print out the bare backtrace while dump_stack() is used for debugging purposes when something went wrong, so it does make sense to print additional information about the task which triggered dump_stack(). There's no reason to require archs to implement two separate but mostly identical functions. It leads to unnecessary subtle inconsistencies. This patch expands the dummy fallback dump_stack() implementation in lib/dump_stack.c such that it prints out debug information (taken from x86) and invokes show_stack(NULL, NULL) and drops arch-specific dump_stack() implementations in all archs except blackfin.
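The resulting generic fallback is tiny (see the lib/dump_stack.c hunk at the end of this patch):

	void dump_stack(void)
	{
		dump_stack_print_info(KERN_DEFAULT);
		show_stack(NULL, NULL);
	}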
Blackfin's dump_stack() does something wonky that I don't understand. Debug information can be printed separately by calling dump_stack_print_info() so that an arch-specific dump_stack() implementation can still emit the same debug information. This is used in blackfin. This patch brings the following behavior changes. * On some archs, an extra level in backtrace for show_stack() could be printed. This is because the top frame was determined in dump_stack() on those archs while the generic dump_stack() can't do that reliably. It can be compensated for by inlining dump_stack(), but I'm not sure whether that'd be necessary. * Most archs didn't use to print debug info on dump_stack(). They do now. An example WARN dump follows. WARNING: at kernel/workqueue.c:4841 init_workqueues+0x35/0x505() Hardware name: empty Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0-rc1-work+ #9 0000000000000009 ffff88007c861e08 ffffffff81c614dc ffff88007c861e48 ffffffff8108f50f ffffffff82228240 0000000000000040 ffffffff8234a03c 0000000000000000 0000000000000000 0000000000000000 ffff88007c861e58 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] init_workqueues+0x35/0x505 ... v2: CPU number added to the generic debug info as requested by s390 folks and dropped the s390-specific dump_stack(). This loses %ksp from the debug message, which the maintainers think isn't important enough to keep the s390-specific dump_stack() implementation. dump_stack_print_info() is moved to kernel/printk.c from lib/dump_stack.c. Because linkage is per object file, dump_stack_print_info() living in the same lib file as the generic dump_stack() means that archs which implement a custom dump_stack() - at this point, only blackfin - can't use dump_stack_print_info(), as that would bring in the generic version of dump_stack() too. v1: The v1 patch broke the build on blackfin due to this issue. The build breakage was reported by Fengguang Wu. Signed-off-by: Tejun Heo Acked-by: David S.
Miller Acked-by: Vineet Gupta Acked-by: Jesper Nilsson Acked-by: Vineet Gupta Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Mike Frysinger Cc: Fengguang Wu Cc: Bjorn Helgaas Cc: Sam Ravnborg Acked-by: Richard Kuo [hexagon bits] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/traps.c | 7 ------- arch/arc/kernel/stacktrace.c | 7 ------- arch/arm/kernel/traps.c | 7 ------- arch/arm64/kernel/traps.c | 7 ------- arch/avr32/kernel/process.c | 8 -------- arch/blackfin/kernel/dumpstack.c | 1 + arch/c6x/kernel/traps.c | 9 --------- arch/cris/kernel/traps.c | 7 ------- arch/frv/kernel/traps.c | 11 ----------- arch/h8300/kernel/traps.c | 7 ------- arch/hexagon/kernel/traps.c | 8 -------- arch/ia64/kernel/process.c | 8 -------- arch/m32r/kernel/traps.c | 9 --------- arch/m68k/kernel/traps.c | 12 ------------ arch/metag/kernel/traps.c | 6 ------ arch/microblaze/kernel/traps.c | 6 ------ arch/mips/kernel/traps.c | 13 ------------- arch/mn10300/kernel/traps.c | 11 ----------- arch/openrisc/kernel/traps.c | 11 ----------- arch/parisc/kernel/traps.c | 8 -------- arch/powerpc/kernel/process.c | 6 ------ arch/s390/kernel/dumpstack.c | 17 ----------------- arch/score/kernel/traps.c | 10 ---------- arch/sh/kernel/dumpstack.c | 6 ------ arch/sparc/kernel/process_32.c | 7 ------- arch/sparc/kernel/traps_64.c | 7 ------- arch/um/kernel/sysrq.c | 12 ------------ arch/unicore32/kernel/traps.c | 6 ------ arch/x86/kernel/dumpstack.c | 18 ------------------ arch/xtensa/kernel/traps.c | 8 -------- include/linux/printk.h | 5 +++++ kernel/printk.c | 18 ++++++++++++++++++ lib/dump_stack.c | 11 ++++++++--- 33 files changed, 32 insertions(+), 262 deletions(-) (limited to 'lib') diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 4037461a649..affccb959a9 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -169,13 +169,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) dik_show_trace(sp); } -void dump_stack(void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) { diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index a63ff842564..ca0207b9d5b 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -220,13 +220,6 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) show_stacktrace(tsk, NULL); } -/* Expected by Rest of kernel code */ -void dump_stack(void) -{ - show_stacktrace(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - /* Another API expected by schedular, shows up in "ps" as Wait Channel * Ofcourse just returning schedule( ) would be pointless so unwind until * the function is not in schedular code diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1c089119b2d..18b32e8e449 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -204,13 +204,6 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) } #endif -void dump_stack(void) -{ - dump_backtrace(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *tsk, unsigned long *sp) { dump_backtrace(NULL, tsk); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index b3c5f628bdb..61d7dd29f75 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -167,13 +167,6 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) } } -void dump_stack(void) -{ - dump_backtrace(NULL, NULL); -} - 
-EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *tsk, unsigned long *sp) { dump_backtrace(NULL, tsk); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 073c3c2fa52..89a8017e8ca 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -204,14 +204,6 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) show_stack_log_lvl(tsk, (unsigned long)stack, NULL, ""); } -void dump_stack(void) -{ - unsigned long stack; - - show_trace_log_lvl(current, &stack, NULL, ""); -} -EXPORT_SYMBOL(dump_stack); - static const char *cpu_modes[] = { "Application", "Supervisor", "Interrupt level 0", "Interrupt level 1", "Interrupt level 2", "Interrupt level 3", "Exception", "NMI" diff --git a/arch/blackfin/kernel/dumpstack.c b/arch/blackfin/kernel/dumpstack.c index 5cfbaa29821..95ba6d9e9a3 100644 --- a/arch/blackfin/kernel/dumpstack.c +++ b/arch/blackfin/kernel/dumpstack.c @@ -168,6 +168,7 @@ void dump_stack(void) #endif trace_buffer_save(tflags); dump_bfin_trace_buffer(); + dump_stack_print_info(KERN_DEFAULT); show_stack(current, &stack); trace_buffer_restore(tflags); } diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index 1be74e5b478..d0b96ef25c1 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -67,15 +67,6 @@ void show_regs(struct pt_regs *regs) pr_err("A31: %08lx B31: %08lx\n", regs->a31, regs->b31); } -void dump_stack(void) -{ - unsigned long stack; - - show_stack(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - - void die(char *str, struct pt_regs *fp, int nr) { console_verbose(); diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index a11ad3229f8..0ffda73734f 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -146,13 +146,6 @@ show_stack(void) } #endif -void -dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - void set_nmi_handler(void (*handler)(struct pt_regs *)) { diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index 5cfd1420b09..cfcd802d6f9 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -466,17 +466,6 @@ asmlinkage void compound_exception(unsigned long esfr1, BUG(); } /* end compound_exception() */ -/*****************************************************************************/ -/* - * The architecture-independent backtrace generator - */ -void dump_stack(void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *task, unsigned long *sp) { } diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 7833aa3e7c7..cfe494dbe3d 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -164,10 +164,3 @@ void show_trace_task(struct task_struct *tsk) { show_stack(tsk,(unsigned long *)tsk->thread.esp0); } - -void dump_stack(void) -{ - show_stack(NULL,NULL); -} - -EXPORT_SYMBOL(dump_stack); diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index be5e2dd9c9d..cc2171b2aa0 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -191,14 +191,6 @@ void show_stack(struct task_struct *task, unsigned long *fp) do_show_stack(task, fp, 0); } -void dump_stack(void) -{ - unsigned long *fp; - asm("%0 = r30" : "=r" (fp)); - show_stack(current, fp); -} -EXPORT_SYMBOL(dump_stack); - int die(const char *str, struct pt_regs *regs, long err) { static struct { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a26fc640e4c..182bd64cc72 100644 --- a/arch/ia64/kernel/process.c +++ 
b/arch/ia64/kernel/process.c @@ -95,14 +95,6 @@ show_stack (struct task_struct *task, unsigned long *sp) } } -void -dump_stack (void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void show_regs (struct pt_regs *regs) { diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index 9fe3467a513..a7a424f852e 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -167,15 +167,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_trace(task, sp); } -void dump_stack(void) -{ - unsigned long stack; - - show_trace(current, &stack); -} - -EXPORT_SYMBOL(dump_stack); - static void show_registers(struct pt_regs *regs) { int i = 0; diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index f32ab22e7ed..88fcd8c70e7 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -991,18 +991,6 @@ void show_stack(struct task_struct *task, unsigned long *stack) show_trace(stack); } -/* - * The architecture-independent backtrace generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_trace(&stack); -} - -EXPORT_SYMBOL(dump_stack); - /* * The vector number returned in the frame pointer may also contain * the "fs" (Fault Status) bits on ColdFire. These are in the bottom diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c index 8961f247b50..2ceeaae5b19 100644 --- a/arch/metag/kernel/traps.c +++ b/arch/metag/kernel/traps.c @@ -987,9 +987,3 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) show_trace(tsk, sp, NULL); } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 30e6b5004a6..cb619533a19 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -75,9 +75,3 @@ void show_stack(struct task_struct *task, unsigned long *sp) debug_show_held_locks(task); } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index c3abb88170f..b512b28cf78 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -206,19 +206,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stacktrace(task, ®s); } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - struct pt_regs regs; - - prepare_frametrace(®s); - show_backtrace(current, ®s); -} - -EXPORT_SYMBOL(dump_stack); - static void show_code(unsigned int __user *pc) { long i; diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index b900e5afa0a..a7a987c7954 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -293,17 +293,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_trace(sp); } -/* - * the architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_stack(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - /* * dump the register file in the specified exception frame */ diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 5cce396016d..3d3f6062f49 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -105,17 +105,6 @@ void show_trace_task(struct task_struct *tsk) */ } -/* - * The architecture-independent backtrace generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_stack(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; 
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index aeb8f8f2c07..e64cf5f09b6 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -158,14 +158,6 @@ void show_regs(struct pt_regs *regs) } } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - static void do_show_stack(struct unwind_frame_info *info) { int i = 1; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 16e77a81ab4..624d44bb44d 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1362,12 +1362,6 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) } while (count++ < kstack_depth_to_print); } -void dump_stack(void) -{ - show_stack(current, NULL); -} -EXPORT_SYMBOL(dump_stack); - #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void __ppc64_runlatch_on(void) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 03dce39d01e..2f1f639d1a3 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -129,23 +129,6 @@ static void show_last_breaking_event(struct pt_regs *regs) #endif } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - printk("CPU: %d %s %s %.*s\n", - task_thread_info(current)->cpu, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk("Process %s (pid: %d, task: %p, ksp: %p)\n", - current->comm, current->pid, current, - (void *) current->thread.ksp); - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - static inline int mask_bits(struct pt_regs *regs, unsigned long bits) { return (regs->psw.mask & bits) / ((~bits + 1) & bits); diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 0e46fb19a84..a38f435fba7 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -149,16 +149,6 @@ static void show_registers(struct pt_regs *regs) printk(KERN_NOTICE "\n"); } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - show_stack(current_thread_info()->task, - (long *) get_irq_regs()->regs[0]); -} -EXPORT_SYMBOL(dump_stack); - void __die(const char *str, struct pt_regs *regs, const char *file, const char *func, unsigned long line) { diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 7617dc4129a..b959f559260 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -158,9 +158,3 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL); } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 2be4214b390..dccf5f58d70 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -164,13 +164,6 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) printk("\n"); } -void dump_stack(void) -{ - show_stack(current, NULL); -} - -EXPORT_SYMBOL(dump_stack); - /* * Note: sparc64 has a pretty intricated thread_saved_pc, check it out. 
*/ diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 8d38ca97aa2..b3f833ab90e 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2350,13 +2350,6 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) } while (++count < 16); } -void dump_stack(void) -{ - show_stack(current, NULL); -} - -EXPORT_SYMBOL(dump_stack); - static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index e562ff80409..7d101a2a154 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -35,18 +35,6 @@ void show_trace(struct task_struct *task, unsigned long * stack) } #endif -/* - * stack dumps generator - this is used by arch-independent code. - * And this is identical to i386 currently. - */ -void dump_stack(void) -{ - unsigned long stack; - - show_trace(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - /*Stolen from arch/i386/kernel/traps.c */ static const int kstack_depth_to_print = 24; diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 0870b68d2ad..c54e32410ea 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -170,12 +170,6 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) c_backtrace(fp, mode); } -void dump_stack(void) -{ - dump_backtrace(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *tsk, unsigned long *sp) { dump_backtrace(NULL, tsk); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dd1a7c391c9..deb6421c9e6 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -191,24 +191,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stack_log_lvl(task, NULL, sp, bp, ""); } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp; - unsigned long stack; - - bp = stack_frame(current, NULL); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 923db5c1527..384b7c7c2f6 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -481,14 +481,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_trace(task, stack); } -void dump_stack(void) -{ - show_stack(current, NULL); -} - -EXPORT_SYMBOL(dump_stack); - - void show_code(unsigned int *pc) { long i; diff --git a/include/linux/printk.h b/include/linux/printk.h index 4890fe62c01..7ce1f878cf6 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -145,6 +145,7 @@ extern void wake_up_klogd(void); void log_buf_kexec_setup(void); void __init setup_log_buf(int early); +void dump_stack_print_info(const char *log_lvl); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -182,6 +183,10 @@ static inline void log_buf_kexec_setup(void) static inline void setup_log_buf(int early) { } + +static inline void dump_stack_print_info(const char *log_lvl) +{ +} #endif extern void dump_stack(void) __cold; diff --git a/kernel/printk.c b/kernel/printk.c index 376914e2869..70b4b94a0ec 100644 --- 
a/kernel/printk.c +++ b/kernel/printk.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -2849,4 +2850,21 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack(). + */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + #endif diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 42f4f55c945..53bad099ebd 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -5,11 +5,16 @@ #include #include +#include +/** + * dump_stack - dump the current task information and its stack trace + * + * Architectures can override this implementation by implementing its own. + */ void dump_stack(void) { - printk(KERN_NOTICE - "This architecture does not implement dump_stack()\n"); + dump_stack_print_info(KERN_DEFAULT); + show_stack(NULL, NULL); } - EXPORT_SYMBOL(dump_stack); -- cgit v1.2.3-70-g09d2 From 16c7fa05829e8b91db48e3539c5d6ff3c2b18a23 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:30 -0700 Subject: lib/string_helpers: introduce generic string_unescape There are several places in the kernel where modules unescape input to convert C-style escape sequences into byte codes. The patch provides a generic implementation of this approach. Test cases are also included in the patch.
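A quick illustration of the new helper (a sketch; buffer contents and flags are arbitrary):

	char buf[] = "a\\tb\\n";	/* holds literal backslash-t, backslash-n */

	string_unescape_inplace(buf, UNESCAPE_SPACE);
	/* buf now holds "a<TAB>b<LF>" and the call returns 4 */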
[akpm@linux-foundation.org: clarify comment] [akpm@linux-foundation.org: export get_random_int() to modules] Signed-off-by: Andy Shevchenko Cc: Samuel Thibault Cc: Greg Kroah-Hartman Cc: Jason Baron Cc: Alexander Viro Cc: William Hubbs Cc: Chris Brannon Cc: Kirk Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 1 + include/linux/string_helpers.h | 58 ++++++++++++++++++ lib/Kconfig.debug | 3 + lib/Makefile | 4 +- lib/string_helpers.c | 133 +++++++++++++++++++++++++++++++++++++++++ lib/test-string_helpers.c | 103 +++++++++++++++++++++++++++++++ 6 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 lib/test-string_helpers.c (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index 32a6c576495..cd9a6211dca 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1485,6 +1485,7 @@ unsigned int get_random_int(void) return ret; } +EXPORT_SYMBOL(get_random_int); /* * randomize_range() returns a start address such that diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index a3eb2f65b65..3eeee9672a4 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -13,4 +13,62 @@ enum string_size_units { int string_get_size(u64 size, enum string_size_units units, char *buf, int len); +#define UNESCAPE_SPACE 0x01 +#define UNESCAPE_OCTAL 0x02 +#define UNESCAPE_HEX 0x04 +#define UNESCAPE_SPECIAL 0x08 +#define UNESCAPE_ANY \ + (UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL) + +/** + * string_unescape - unquote characters in the given string + * @src: source buffer (escaped) + * @dst: destination buffer (unescaped) + * @size: size of the destination buffer (0 to unlimit) + * @flags: combination of the flags (bitwise OR): + * %UNESCAPE_SPACE: + * '\f' - form feed + * '\n' - new line + * '\r' - carriage return + * '\t' - horizontal tab + * '\v' - vertical tab + * %UNESCAPE_OCTAL: + * '\NNN' - byte with octal value NNN (1 to 3 digits) + * %UNESCAPE_HEX: + * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) + * %UNESCAPE_SPECIAL: + * '\"' - double quote + * '\\' - backslash + * '\a' - alert (BEL) + * '\e' - escape + * %UNESCAPE_ANY: + * all previous together + * + * Returns amount of characters processed to the destination buffer excluding + * trailing '\0'. + * + * Because the size of the output will be the same as or less than the size of + * the input, the transformation may be performed in place. + * + * Caller must provide valid source and destination pointers. Be aware that + * destination buffer will always be NULL-terminated. Source string must be + * NULL-terminated as well. 
+ */ +int string_unescape(char *src, char *dst, size_t size, unsigned int flags); + +static inline int string_unescape_inplace(char *buf, unsigned int flags) +{ + return string_unescape(buf, buf, 0, flags); +} + +static inline int string_unescape_any(char *src, char *dst, size_t size) +{ + return string_unescape(src, dst, size, UNESCAPE_ANY); +} + +static inline int string_unescape_any_inplace(char *buf) +{ + return string_unescape_any(buf, buf, 0); +} + #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28be08c09ba..77ebaa3dfa1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1463,5 +1463,8 @@ source "lib/Kconfig.kgdb" source "lib/Kconfig.kmemcheck" +config TEST_STRING_HELPERS + tristate "Test functions located in the string_helpers module at runtime" + config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" diff --git a/lib/Makefile b/lib/Makefile index 6e2cc561f76..23c9a0fe74f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,8 +22,10 @@ lib-y += kobject.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o +obj-y += string_helpers.o +obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 1cffc223bff..ed5c1454dd6 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -2,10 +2,12 @@ * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley + * Copyright (C) 2013, Intel Corporation */ #include #include #include +#include #include /** @@ -66,3 +68,134 @@ int string_get_size(u64 size, const enum string_size_units units, return 0; } EXPORT_SYMBOL(string_get_size); + +static bool unescape_space(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case 'n': + *p = '\n'; + break; + case 'r': + *p = '\r'; + break; + case 't': + *p = '\t'; + break; + case 'v': + *p = '\v'; + break; + case 'f': + *p = '\f'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +static bool unescape_octal(char **src, char **dst) +{ + char *p = *dst, *q = *src; + u8 num; + + if (isodigit(*q) == 0) + return false; + + num = (*q++) & 7; + while (num < 32 && isodigit(*q) && (q - *src < 3)) { + num <<= 3; + num += (*q++) & 7; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_hex(char **src, char **dst) +{ + char *p = *dst, *q = *src; + int digit; + u8 num; + + if (*q++ != 'x') + return false; + + num = digit = hex_to_bin(*q++); + if (digit < 0) + return false; + + digit = hex_to_bin(*q); + if (digit >= 0) { + q++; + num = (num << 4) | digit; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_special(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case '\"': + *p = '\"'; + break; + case '\\': + *p = '\\'; + break; + case 'a': + *p = '\a'; + break; + case 'e': + *p = '\e'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +int string_unescape(char *src, char *dst, size_t size, unsigned int flags) +{ + char *out = dst; + + while (*src && --size) { + if (src[0] == '\\' && src[1] != '\0' && size > 1) { + src++; + size--; + + if (flags & UNESCAPE_SPACE && + 
unescape_space(&src, &out)) + continue; + + if (flags & UNESCAPE_OCTAL && + unescape_octal(&src, &out)) + continue; + + if (flags & UNESCAPE_HEX && + unescape_hex(&src, &out)) + continue; + + if (flags & UNESCAPE_SPECIAL && + unescape_special(&src, &out)) + continue; + + *out++ = '\\'; + } + *out++ = *src++; + } + *out = '\0'; + + return out - dst; +} +EXPORT_SYMBOL(string_unescape); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c new file mode 100644 index 00000000000..6ac48de04c0 --- /dev/null +++ b/lib/test-string_helpers.c @@ -0,0 +1,103 @@ +/* + * Test cases for lib/string_helpers.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +struct test_string { + const char *in; + const char *out; + unsigned int flags; +}; + +static const struct test_string strings[] __initconst = { + { + .in = "\\f\\ \\n\\r\\t\\v", + .out = "\f\\ \n\r\t\v", + .flags = UNESCAPE_SPACE, + }, + { + .in = "\\40\\1\\387\\0064\\05\\040\\8a\\110\\777", + .out = " \001\00387\0064\005 \\8aH?7", + .flags = UNESCAPE_OCTAL, + }, + { + .in = "\\xv\\xa\\x2c\\xD\\x6f2", + .out = "\\xv\n,\ro2", + .flags = UNESCAPE_HEX, + }, + { + .in = "\\h\\\\\\\"\\a\\e\\", + .out = "\\h\\\"\a\e\\", + .flags = UNESCAPE_SPECIAL, + }, +}; + +static void __init test_string_unescape(unsigned int flags, bool inplace) +{ + char in[256]; + char out_test[256]; + char out_real[256]; + int i, p = 0, q_test = 0, q_real = sizeof(out_real); + + for (i = 0; i < ARRAY_SIZE(strings); i++) { + const char *s = strings[i].in; + int len = strlen(strings[i].in); + + /* Copy string to in buffer */ + memcpy(&in[p], s, len); + p += len; + + /* Copy expected result for given flags */ + if (flags & strings[i].flags) { + s = strings[i].out; + len = strlen(strings[i].out); + } + memcpy(&out_test[q_test], s, len); + q_test += len; + } + in[p++] = '\0'; + + /* Call string_unescape and compare result */ + if (inplace) { + memcpy(out_real, in, p); + if (flags == UNESCAPE_ANY) + q_real = string_unescape_any_inplace(out_real); + else + q_real = string_unescape_inplace(out_real, flags); + } else if (flags == UNESCAPE_ANY) { + q_real = string_unescape_any(in, out_real, q_real); + } else { + q_real = string_unescape(in, out_real, q_real, flags); + } + + if (q_real != q_test || memcmp(out_test, out_real, q_test)) { + pr_warn("Test failed: flags = %u\n", flags); + print_hex_dump(KERN_WARNING, "Input: ", + DUMP_PREFIX_NONE, 16, 1, in, p - 1, true); + print_hex_dump(KERN_WARNING, "Expected: ", + DUMP_PREFIX_NONE, 16, 1, out_test, q_test, true); + print_hex_dump(KERN_WARNING, "Got: ", + DUMP_PREFIX_NONE, 16, 1, out_real, q_real, true); + } +} + +static int __init test_string_helpers_init(void) +{ + unsigned int i; + + pr_info("Running tests...\n"); + for (i = 0; i < UNESCAPE_ANY + 1; i++) + test_string_unescape(i, false); + test_string_unescape(get_random_int() % (UNESCAPE_ANY + 1), true); + + return -EINVAL; +} +module_init(test_string_helpers_init); +MODULE_LICENSE("Dual BSD/GPL"); -- cgit v1.2.3-70-g09d2 From d338b1379f96b20e63aa65082e19e9a18a93b608 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:32 -0700 Subject: dynamic_debug: reuse generic string_unescape function There is kernel function to do the job in generic way. Let's use it. 
Signed-off-by: Andy Shevchenko Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 48 +++++------------------------------------------- 1 file changed, 5 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 46032453abd..99fec3ae405 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -276,47 +277,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) return 0; } -/* - * Undo octal escaping in a string, inplace. This is useful to - * allow the user to express a query which matches a format - * containing embedded spaces. - */ -static char *unescape(char *str) -{ - char *in = str; - char *out = str; - - while (*in) { - if (*in == '\\') { - if (in[1] == '\\') { - *out++ = '\\'; - in += 2; - continue; - } else if (in[1] == 't') { - *out++ = '\t'; - in += 2; - continue; - } else if (in[1] == 'n') { - *out++ = '\n'; - in += 2; - continue; - } else if (isodigit(in[1]) && - isodigit(in[2]) && - isodigit(in[3])) { - *out++ = (((in[1] - '0') << 6) | - ((in[2] - '0') << 3) | - (in[3] - '0')); - in += 4; - continue; - } - } - *out++ = *in++; - } - *out = '\0'; - - return str; -} - static int check_set(const char **dest, char *src, char *name) { int rc = 0; @@ -370,8 +330,10 @@ static int ddebug_parse_query(char *words[], int nwords, } else if (!strcmp(words[i], "module")) { rc = check_set(&query->module, words[i+1], "module"); } else if (!strcmp(words[i], "format")) { - rc = check_set(&query->format, unescape(words[i+1]), - "format"); + string_unescape_inplace(words[i+1], UNESCAPE_SPACE | + UNESCAPE_OCTAL | + UNESCAPE_SPECIAL); + rc = check_set(&query->format, words[i+1], "format"); } else if (!strcmp(words[i], "line")) { char *first = words[i+1]; char *last = strchr(first, '-'); -- cgit v1.2.3-70-g09d2 From 4130f0efbfe5adb360328b777afa8e45f7e467f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:24 -0700 Subject: rbtree_test: add extra rbtree integrity check Account for the rbtree having 2**bh(v)-1 internal nodes. While this can be seen as a consequence of other checks, Michel states that it nicely sums up what the other properties are for. 
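For reference, the bound being checked: every root-to-leaf path in a red-black tree passes through bh black nodes, so the tree contains a complete black subtree of height bh and therefore at least 2^bh - 1 internal nodes (e.g. bh = 3 implies at least 7 nodes). The added WARN_ON_ONCE() below encodes exactly this, with bh taken from black_path_count(rb_last(&root)).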
Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index af38aedbd87..99515038ff6 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -117,8 +117,7 @@ static int black_path_count(struct rb_node *rb) static void check(int nr_nodes) { struct rb_node *rb; - int count = 0; - int blacks = 0; + int count = 0, blacks = 0; u32 prev_key = 0; for (rb = rb_first(&root); rb; rb = rb_next(rb)) { @@ -134,7 +133,9 @@ static void check(int nr_nodes) prev_key = node->key; count++; } + WARN_ON_ONCE(count != nr_nodes); + WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); } static void check_augmented(int nr_nodes) -- cgit v1.2.3-70-g09d2 From c75aaa8ed03eb1312ed2990f1a716b2b9cc0df42 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:25 -0700 Subject: rbtree_test: add __init/__exit annotations Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 99515038ff6..122f02f9941 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -149,7 +149,7 @@ static void check_augmented(int nr_nodes) } } -static int rbtree_test_init(void) +static int __init rbtree_test_init(void) { int i, j; cycles_t time1, time2, time; @@ -222,7 +222,7 @@ static int rbtree_test_init(void) return -EAGAIN; /* Fail will directly unload the module */ } -static void rbtree_test_exit(void) +static void __exit rbtree_test_exit(void) { printk(KERN_ALERT "test exit\n"); } -- cgit v1.2.3-70-g09d2 From 446f24d1199e8a546ba7c97da3fbb9a505a94795 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Apr 2013 15:28:42 -0700 Subject: Kconfig: consolidate CONFIG_DEBUG_STRICT_USER_COPY_CHECKS The help text for this config is duplicated across the x86, parisc, and s390 Kconfig.debug files. Arnd Bergman noted that the help text was slightly misleading and should be fixed to state that enabling this option isn't a problem when using pre 4.4 gcc. To simplify the rewording, consolidate the text into lib/Kconfig.debug and modify it there to be more explicit about when you should say N to this config. Also, make the text a bit more generic by stating that this option enables compile time checks so we can cover architectures which emit warnings vs. ones which emit errors. The details of how an architecture decided to implement the checks isn't as important as the concept of compile time checking of copy_from_user() calls. While we're doing this, remove all the copy_from_user_overflow() code that's duplicated many times and place it into lib/ so that any architecture supporting this option can get the function for free. Signed-off-by: Stephen Boyd Acked-by: Arnd Bergmann Acked-by: Ingo Molnar Acked-by: H. 
Peter Anvin Cc: Arjan van de Ven Acked-by: Helge Deller Cc: Heiko Carstens Cc: Stephen Rothwell Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/Kconfig | 1 + arch/parisc/Kconfig.debug | 14 -------------- arch/s390/Kconfig | 1 + arch/s390/Kconfig.debug | 14 -------------- arch/s390/lib/Makefile | 1 - arch/s390/lib/usercopy.c | 8 -------- arch/sparc/lib/Makefile | 1 - arch/sparc/lib/usercopy.c | 9 --------- arch/tile/Kconfig | 8 +------- arch/tile/include/asm/uaccess.h | 7 ++++++- arch/tile/lib/uaccess.c | 8 -------- arch/x86/Kconfig | 1 + arch/x86/Kconfig.debug | 14 -------------- arch/x86/lib/usercopy_32.c | 6 ------ lib/Kconfig.debug | 18 ++++++++++++++++++ lib/Makefile | 1 + lib/usercopy.c | 9 +++++++++ 17 files changed, 38 insertions(+), 83 deletions(-) delete mode 100644 arch/s390/lib/usercopy.c delete mode 100644 arch/sparc/lib/usercopy.c create mode 100644 lib/usercopy.c (limited to 'lib') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 0339181bf3a..433e75a2ee9 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -1,5 +1,6 @@ config PARISC def_bool y + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_IDE select HAVE_OPROFILE select HAVE_FUNCTION_TRACER if 64BIT diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index 7305ac8f7f5..bc989e522a0 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -12,18 +12,4 @@ config DEBUG_RODATA portion of the kernel code won't be covered by a TLB anymore. If in doubt, say "N". -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict copy size checks" - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. - endmenu diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bda6ba6f3cf..ce640aff61a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -91,6 +91,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index fc32a2df497..c56878e1245 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -17,20 +17,6 @@ config STRICT_DEVMEM If you are unsure, say Y. -config DEBUG_STRICT_USER_COPY_CHECKS - def_bool n - prompt "Strict user copy size checks" - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time warnings. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. 
- config S390_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 6ab0d0b5cec..20b0e97a7df 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,7 +3,6 @@ # lib-y += delay.o string.o uaccess_std.o uaccess_pt.o -obj-y += usercopy.o obj-$(CONFIG_32BIT) += div64.o qrnnd.o ucmpdi2.o mem32.o obj-$(CONFIG_64BIT) += mem64.o lib-$(CONFIG_64BIT) += uaccess_mvcos.o diff --git a/arch/s390/lib/usercopy.c b/arch/s390/lib/usercopy.c deleted file mode 100644 index 14b363fec8a..00000000000 --- a/arch/s390/lib/usercopy.c +++ /dev/null @@ -1,8 +0,0 @@ -#include -#include - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 8410065f286..dbe119b63b4 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -45,4 +45,3 @@ obj-y += iomap.o obj-$(CONFIG_SPARC32) += atomic32.o ucmpdi2.o obj-y += ksyms.o obj-$(CONFIG_SPARC64) += PeeCeeI.o -obj-y += usercopy.o diff --git a/arch/sparc/lib/usercopy.c b/arch/sparc/lib/usercopy.c deleted file mode 100644 index 5c4284ce1c0..00000000000 --- a/arch/sparc/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include -#include -#include - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 25877aebc68..0f712f4e1b3 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -19,6 +19,7 @@ config TILE select HAVE_SYSCALL_WRAPPERS if TILEGX select VIRT_TO_BUS select SYS_HYPERVISOR + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA @@ -114,13 +115,6 @@ config STRICT_DEVMEM config SMP def_bool y -# Allow checking for compile-time determined overflow errors in -# copy_from_user(). There are still unprovable places in the -# generic code as of 2.6.34, so this option is not really compatible -# with -Werror, which is more useful in general. -config DEBUG_COPY_FROM_USER - def_bool n - config HVC_TILE depends on TTY select HVC_DRIVER diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index 9ab078a4605..8a082bc6bca 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -395,7 +395,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -#ifdef CONFIG_DEBUG_COPY_FROM_USER +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS +/* + * There are still unprovable places in the generic code as of 2.6.34, so this + * option is not really compatible with -Werror, which is more useful in + * general. 
+ */ extern void copy_from_user_overflow(void) __compiletime_warning("copy_from_user() size is not provably correct"); diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c index f8d398c9ee7..030abe3ee4f 100644 --- a/arch/tile/lib/uaccess.c +++ b/arch/tile/lib/uaccess.c @@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size) is_arch_mappable_range(addr, size)); } EXPORT_SYMBOL(__range_ok); - -#ifdef CONFIG_DEBUG_COPY_FROM_USER -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 05b057dca4a..5db2117ae28 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -20,6 +20,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 16f738385dc..c198b7e13e7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -292,20 +292,6 @@ config OPTIMIZE_INLINING If unsure, say N. -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict copy size checks" - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. - config DEBUG_NMI_SELFTEST bool "NMI Selftest" depends on DEBUG_KERNEL && X86_LOCAL_APIC diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index f0312d74640..3eb18acd0e4 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -689,9 +689,3 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } EXPORT_SYMBOL(_copy_from_user); - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 77ebaa3dfa1..770a422a42e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1292,6 +1292,24 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + bool + +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict user copy size checks" + depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + help + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, say N. 
+ source mm/Kconfig.debug source kernel/trace/Kconfig diff --git a/lib/Makefile b/lib/Makefile index 23c9a0fe74f..e9c52e1b853 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,6 +15,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o +obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/usercopy.c b/lib/usercopy.c new file mode 100644 index 00000000000..4f5b1ddbcd2 --- /dev/null +++ b/lib/usercopy.c @@ -0,0 +1,9 @@ +#include +#include +#include + +void copy_from_user_overflow(void) +{ + WARN(1, "Buffer overflow detected!\n"); +} +EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.3-70-g09d2 From e12a95f40ac78f223deab743a5228f9eb3df8ed1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 30 Apr 2013 15:28:49 -0700 Subject: notifier-error-inject: fix module names in Kconfig The Kconfig help text for MEMORY_NOTIFIER_ERROR_INJECT and OF_RECONFIG_NOTIFIER_ERROR_INJECT has mismatched module names. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 770a422a42e..566cf2bc08e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1192,7 +1192,7 @@ config MEMORY_NOTIFIER_ERROR_INJECT bash: echo: write error: Cannot allocate memory To compile this code as a module, choose M here: the module will - be called pSeries-reconfig-notifier-error-inject. + be called memory-notifier-error-inject. If unsure, say N. @@ -1209,7 +1209,7 @@ config OF_RECONFIG_NOTIFIER_ERROR_INJECT notified, write the error code to "actions//error". To compile this code as a module, choose M here: the module will - be called memory-notifier-error-inject. + be called of-reconfig-notifier-error-inject. If unsure, say N. -- cgit v1.2.3-70-g09d2 From 6f9982bdde99d4e6f248613ca780c63de26ba086 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 30 Apr 2013 15:28:50 -0700 Subject: lib/decompress.c: fix initconst Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/decompress.c b/lib/decompress.c index 31a80427728..f8fdedaf7b3 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -38,7 +38,7 @@ struct compress_format { decompress_fn decompressor; }; -static const struct compress_format compressed_formats[] __initdata = { +static const struct compress_format compressed_formats[] __initconst = { { {037, 0213}, "gzip", gunzip }, { {037, 0236}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, -- cgit v1.2.3-70-g09d2 From 9e6879460c8edb0cd3c24c09b83d06541b5af0dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 4 May 2013 08:48:27 +0100 Subject: Give the OID registry file module info to avoid kernel tainting Give the OID registry file module information so that it doesn't taint the kernel when compiled as a module and loaded. 
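For illustration, the fix applies the standard module-metadata trio. A minimal hedged sketch of that pattern (the example module, its strings and its init/exit functions are hypothetical; only the MODULE_*() macros mirror what the patch adds):

#include <linux/module.h>

MODULE_DESCRIPTION("Example module");	/* hypothetical description */
MODULE_AUTHOR("Example Author");	/* hypothetical author */
MODULE_LICENSE("GPL");	/* without this, loading sets TAINT_PROPRIETARY_MODULE */

static int __init example_init(void)
{
	return 0;	/* nothing to do; the metadata is the point */
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);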
Reported-by: Dros Adamson Signed-off-by: David Howells cc: Trond Myklebust cc: stable@vger.kernel.org cc: linux-nfs@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/oid_registry.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/oid_registry.c b/lib/oid_registry.c index d8de11f4590..318f382a010 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -9,6 +9,7 @@ * 2 of the Licence, or (at your option) any later version. */ +#include #include #include #include @@ -16,6 +17,10 @@ #include #include "oid_registry_data.c" +MODULE_DESCRIPTION("OID Registry"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + /** * look_up_OID - Find an OID registration for the specified data * @data: Binary representation of the OID -- cgit v1.2.3-70-g09d2 From e2d57f782c8a906f80d5b5f54da803c1638929d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:49 -0700 Subject: rwsem: make the waiter type an enumeration rather than a bitmask We are not planning to add some new waiter flags, so we can convert the waiter type into an enumeration. Background: David Howells suggested I do this back when I tried adding a new waiter type for unfair readers. However, I believe the cleanup applies regardless of that use case. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 19 +++++++++++-------- lib/rwsem.c | 23 +++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 7542afbb22b..5f117f37ac0 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -9,12 +9,15 @@ #include #include +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; int rwsem_is_locked(struct rw_semaphore *sem) @@ -68,7 +71,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!wakewrite) { - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) goto out; goto dont_wake_writers; } @@ -78,7 +81,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) * to -1 here to indicate we get the lock. Instead, we wake it up * to let it go get it again. 
*/ - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { wake_up_process(waiter->task); goto out; } @@ -86,7 +89,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) /* grant an infinite number of read locks to the front of the queue */ dont_wake_writers: woken = 0; - while (waiter->flags & RWSEM_WAITING_FOR_READ) { + while (waiter->type == RWSEM_WAITING_FOR_READ) { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -144,7 +147,7 @@ void __sched __down_read(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_READ; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); list_add_tail(&waiter.list, &sem->wait_list); @@ -201,7 +204,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) /* set up my own style of waitqueue */ tsk = current; waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_WRITE; + waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); /* wait for someone to release the lock */ diff --git a/lib/rwsem.c b/lib/rwsem.c index ad5e0df16ab..672eb33218a 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -30,12 +30,15 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, EXPORT_SYMBOL(__init_rwsem); +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; /* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and @@ -65,7 +68,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) goto readers_only; if (wake_type == RWSEM_WAKE_READ_OWNED) @@ -112,10 +115,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter = list_entry(waiter->list.next, struct rwsem_waiter, list); - } while (waiter->flags & RWSEM_WAITING_FOR_READ); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); adjustment = woken * RWSEM_ACTIVE_READ_BIAS; - if (waiter->flags & RWSEM_WAITING_FOR_READ) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -148,7 +151,7 @@ static int try_get_writer_sem(struct rw_semaphore *sem, /* only steal when first waiter is writing */ fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) return 0; adjustment = RWSEM_ACTIVE_WRITE_BIAS; @@ -179,7 +182,7 @@ try_again_write: */ static struct rw_semaphore __sched * rwsem_down_failed_common(struct rw_semaphore *sem, - unsigned int flags, signed long adjustment) + enum rwsem_waiter_type type, signed long adjustment) { struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +193,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; - waiter.flags = flags; + waiter.type = type; get_task_struct(tsk); if (list_empty(&sem->wait_list)) @@ -221,7 +224,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_lock_irq(&sem->wait_lock); /* Try to get the writer sem, may steal from the head 
writer: */ - if (flags == RWSEM_WAITING_FOR_WRITE) + if (type == RWSEM_WAITING_FOR_WRITE) if (try_get_writer_sem(sem, &waiter)) { raw_spin_unlock_irq(&sem->wait_lock); return sem; -- cgit v1.2.3-70-g09d2 From f7dd1cee9a4e2b1450e4a3732636dfbf28562ee4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:50 -0700 Subject: rwsem: shorter spinlocked section in rwsem_down_failed_common() This change reduces the size of the spinlocked and TASK_UNINTERRUPTIBLE sections in rwsem_down_failed_common(): - We only need the sem->wait_lock to insert ourselves on the wait_list; the waiter node can be prepared outside of the wait_lock. - The task state only needs to be set to TASK_UNINTERRUPTIBLE immediately before checking if we actually need to sleep; it doesn't need to protect the entire function. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 672eb33218a..40636454cf3 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -188,14 +188,12 @@ rwsem_down_failed_common(struct rw_semaphore *sem, struct task_struct *tsk = current; signed long count; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ - raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; waiter.type = type; get_task_struct(tsk); + raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) adjustment += RWSEM_WAITING_BIAS; list_add_tail(&waiter.list, &sem->wait_list); @@ -218,7 +216,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ - for (;;) { + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -231,7 +230,6 @@ rwsem_down_failed_common(struct rw_semaphore *sem, } raw_spin_unlock_irq(&sem->wait_lock); schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); } tsk->state = TASK_RUNNING; -- cgit v1.2.3-70-g09d2 From 1e78277ccbbb48af32a618d1ef0e8534e0b648d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:51 -0700 Subject: rwsem: move rwsem_down_failed_common code into rwsem_down_{read,write}_failed Remove the rwsem_down_failed_common function and replace it with two identical copies of its code in rwsem_down_{read,write}_failed. This is because we want to make different optimizations in rwsem_down_{read,write}_failed; we are adding this pure-duplication step as a separate commit in order to make it easier to check the following steps. 
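Before this commit, the two entry points were thin wrappers that differed only in the arguments passed to the common helper, as the diff below shows:

struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
{
	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
					-RWSEM_ACTIVE_READ_BIAS);
}

struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
					-RWSEM_ACTIVE_WRITE_BIAS);
}

After it, each function carries a private copy of the helper's body, so the read and write slow paths can be specialized independently in the commits that follow.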
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 40636454cf3..fb658af1c12 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -178,12 +178,12 @@ try_again_write: } /* - * wait for a lock to be granted + * wait for the read lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, - enum rwsem_waiter_type type, signed long adjustment) +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; + signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; signed long count; @@ -237,22 +237,64 @@ rwsem_down_failed_common(struct rw_semaphore *sem, return sem; } -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, - -RWSEM_ACTIVE_READ_BIAS); -} - /* * wait for the write lock to be granted */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, - -RWSEM_ACTIVE_WRITE_BIAS); + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; + signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + signed long count; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = type; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es) up. + * + * Alternatively, if we're called from a failed down_write(), there + * were already threads queued before us and there are no active + * writers, the lock must be read owned; so we try to wake any read + * locks that were queued ahead of us. 
*/ + if (count == RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + else if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + + raw_spin_lock_irq(&sem->wait_lock); + /* Try to get the writer sem, may steal from the head writer: */ + if (type == RWSEM_WAITING_FOR_WRITE) + if (try_get_writer_sem(sem, &waiter)) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + raw_spin_unlock_irq(&sem->wait_lock); + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; } /* -- cgit v1.2.3-70-g09d2 From da16922cc031b9c0221c836994276ab193b31de8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:52 -0700 Subject: rwsem: simplify rwsem_down_read_failed When trying to acquire a read lock, the RWSEM_ACTIVE_READ_BIAS adjustment doesn't cause other readers to block, so we never have to worry about waking them back after canceling this adjustment in rwsem_down_read_failed(). We also never want to steal the lock in rwsem_down_read_failed(), so we don't have to grab the wait_lock either. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index fb658af1c12..66f307e9076 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -182,7 +182,6 @@ try_again_write: */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +189,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); @@ -201,17 +200,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ + /* If there are no active locks, wake the front queued process(es). 
*/ if (count == RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); - else if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); raw_spin_unlock_irq(&sem->wait_lock); @@ -220,15 +211,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; - - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - raw_spin_unlock_irq(&sem->wait_lock); schedule(); } -- cgit v1.2.3-70-g09d2 From 023fe4f712028d25b42d31984abae1f3d3f0e3e2 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:53 -0700 Subject: rwsem: simplify rwsem_down_write_failed When waking writers, we never grant them the lock - instead, they have to acquire it themselves when they run, and remove themselves from the wait_list when they succeed. As a result, we can do a few simplifications in rwsem_down_write_failed(): - We don't need to check for !waiter.task since __rwsem_do_wake() doesn't remove writers from the wait_list. - There is no point releasing the wait_lock before entering the wait loop, as we will need to reacquire it immediately. We can change the loop so that the lock is always held at the start of each loop iteration. - We don't need to get a reference on the task structure, since the task is responsible for removing itself from the wait_list. There is no risk, like in the rwsem_down_read_failed() case, that a task would wake up and exit (thus destroying its task structure) while __rwsem_do_wake() is still running - wait_lock protects against that.
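Condensed from the diff that follows — a sketch, not the full function — the resulting writer wait loop holds wait_lock at the top of every iteration and drops it only around schedule():

	raw_spin_lock_irq(&sem->wait_lock);
	/* waiter has already been queued on sem->wait_list */
	while (true) {
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (try_get_writer_sem(sem, &waiter))
			break;			/* acquired; wait_lock still held */
		raw_spin_unlock_irq(&sem->wait_lock);
		schedule();			/* sleep without the lock */
		raw_spin_lock_irq(&sem->wait_lock);
	}
	list_del(&waiter.list);			/* we own the rwsem now */
	raw_spin_unlock_irq(&sem->wait_lock);
	tsk->state = TASK_RUNNING;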
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 66f307e9076..c73bd96dc30 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -161,16 +161,8 @@ static int try_get_writer_sem(struct rw_semaphore *sem, try_again_write: oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) { - /* No active lock: */ - struct task_struct *tsk = waiter->task; - - list_del(&waiter->list); - smp_mb(); - put_task_struct(tsk); - tsk->state = TASK_RUNNING; + if (!(oldcount & RWSEM_ACTIVE_MASK)) return 1; - } /* some one grabbed the sem already */ if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) return 0; @@ -220,11 +212,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } /* - * wait for the write lock to be granted + * wait until we successfully acquire the write lock */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -232,8 +223,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; - get_task_struct(tsk); + waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -255,25 +245,20 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ + /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) + + if (try_get_writer_sem(sem, &waiter)) break; - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } raw_spin_unlock_irq(&sem->wait_lock); schedule(); + raw_spin_lock_irq(&sem->wait_lock); } + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); tsk->state = TASK_RUNNING; return sem; -- cgit v1.2.3-70-g09d2 From ed00f64346631dff035adfb9b0240daaa8b46c4e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:54 -0700 Subject: rwsem: more aggressive lock stealing in rwsem_down_write_failed Some small code simplifications can be achieved by doing more aggressive lock stealing: - When rwsem_down_write_failed() notices that there are no active locks (and thus no thread to wake us if we decided to sleep), it used to wake the first queued process. However, stealing the lock is also sufficient to deal with this case, so we don't need this check anymore. - In try_get_writer_sem(), we can steal the lock even when the first waiter is a reader. This is correct because the code path that wakes readers is protected by the wait_lock.
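With both simplifications applied, try_get_writer_sem() loses its waiter argument and the first-waiter check; reassembled from the diff below as a sketch:

/* Try to get write sem, caller holds sem->wait_lock: */
static int try_get_writer_sem(struct rw_semaphore *sem)
{
	long oldcount, adjustment;

	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
	if (list_is_singular(&sem->wait_list))
		adjustment -= RWSEM_WAITING_BIAS;

try_again_write:
	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
	if (!(oldcount & RWSEM_ACTIVE_MASK))
		return 1;	/* no active lockers: lock stolen */
	/* someone grabbed the sem already; undo and maybe retry */
	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
		return 0;
	goto try_again_write;
}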
As to the performance effects of this change, they are expected to be minimal: readers are still granted the lock (rather than having to acquire it themselves) when they reach the front of the wait queue, so we have essentially the same behavior as in rwsem-spinlock. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index c73bd96dc30..2360bf20409 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -143,20 +143,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } /* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem, - struct rwsem_waiter *waiter) +static int try_get_writer_sem(struct rw_semaphore *sem) { - struct rwsem_waiter *fwaiter; long oldcount, adjustment; - /* only steal when first waiter is writing */ - fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) - return 0; - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - /* Only one waiter in the queue: */ - if (fwaiter == waiter && waiter->list.next == &sem->wait_list) + if (list_is_singular(&sem->wait_list)) adjustment -= RWSEM_WAITING_BIAS; try_again_write: @@ -233,23 +225,18 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); - else if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem, &waiter)) + if (try_get_writer_sem(sem)) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3-70-g09d2 From 5ede972df1cd9294c82e9515949fd2103be81d7b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:55 -0700 Subject: rwsem: use cmpxchg for trying to steal write lock Using rwsem_atomic_update to try stealing the write lock forced us to undo the adjustment in the failure path. We can have simpler and faster code by using cmpxchg instead. 
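In condensed form, the steal becomes a single compare-and-exchange at the head of the wait loop — nothing to undo on failure, since the count is only changed when it is exactly RWSEM_WAITING_BIAS (waiters queued, no active lockers):

	/* Try acquiring the write lock. */
	count = RWSEM_ACTIVE_WRITE_BIAS;
	if (!list_is_singular(&sem->wait_list))
		count += RWSEM_WAITING_BIAS;
	if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
						RWSEM_WAITING_BIAS)
		break;	/* write lock acquired */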
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 2360bf20409..64c2dc007be 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -142,25 +142,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) return sem; } -/* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem) -{ - long oldcount, adjustment; - - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - if (list_is_singular(&sem->wait_list)) - adjustment -= RWSEM_WAITING_BIAS; - -try_again_write: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) - return 1; - /* some one grabbed the sem already */ - if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) - return 0; - goto try_again_write; -} - /* * wait for the read lock to be granted */ @@ -236,7 +217,12 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem)) + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3-70-g09d2 From a7d2c573ae7fad1b2c877d1a1342fa5bb0d6478c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:56 -0700 Subject: rwsem: avoid taking wait_lock in rwsem_down_write_failed In rwsem_down_write_failed(), if there are active locks after we wake up (i.e. the lock got stolen from us), skip taking the wait_lock and go back to sleep immediately. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 64c2dc007be..edf3d9ca670 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -214,8 +214,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* Try acquiring the write lock. */ count = RWSEM_ACTIVE_WRITE_BIAS; @@ -226,7 +226,13 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) break; raw_spin_unlock_irq(&sem->wait_lock); - schedule(); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while (sem->count & RWSEM_ACTIVE_MASK); + raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3-70-g09d2 From 9b0fc9c09f1b262b7fe697eba6b05095d78850e5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:57 -0700 Subject: rwsem: skip initial trylock in rwsem_down_write_failed We can skip the initial trylock in rwsem_down_write_failed() if there are known active lockers already, thus saving one likely-to-fail cmpxchg. 
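Condensed from the diff below, the trylock is now guarded by a plain read of the counter, and the sleep loop refreshes that snapshot before retrying:

	while (true) {
		if (!(count & RWSEM_ACTIVE_MASK)) {
			/* No known active lockers: the cmpxchg can succeed. */
			count = RWSEM_ACTIVE_WRITE_BIAS;
			if (!list_is_singular(&sem->wait_list))
				count += RWSEM_WAITING_BIAS;
			if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
						RWSEM_WAITING_BIAS)
				break;
		}
		raw_spin_unlock_irq(&sem->wait_lock);
		/* Block until there are no active lockers. */
		do {
			schedule();
			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
		raw_spin_lock_irq(&sem->wait_lock);
	}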
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index edf3d9ca670..0d50e46d5b0 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -216,14 +216,15 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) - break; + break; + } raw_spin_unlock_irq(&sem->wait_lock); @@ -231,7 +232,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) do { schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while (sem->count & RWSEM_ACTIVE_MASK); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3-70-g09d2 From 8cf5322ce69afea1fab6a6270db24d057d664798 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:58 -0700 Subject: rwsem: simplify __rwsem_do_wake This is mostly for cleanup value: - We don't need several gotos to handle the case where the first waiter is a writer. Two simple tests will do (and generate very similar code). - In the remainder of the function, we know the first waiter is a reader, so we don't have to double check that. We can use do..while loops to iterate over the readers to wake (generates slightly better code). Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 23 +++++++---------------- lib/rwsem.c | 26 ++++++++++++-------------- 2 files changed, 19 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 5f117f37ac0..9be8a914497 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -70,26 +70,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!wakewrite) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - goto out; - goto dont_wake_writers; - } - - /* - * as we support write lock stealing, we can't set sem->activity - * to -1 here to indicate we get the lock. Instead, we wake it up - * to let it go get it again. - */ if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - wake_up_process(waiter->task); + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. 
*/ + wake_up_process(waiter->task); goto out; } /* grant an infinite number of read locks to the front of the queue */ - dont_wake_writers: woken = 0; - while (waiter->type == RWSEM_WAITING_FOR_READ) { + do { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -99,10 +90,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) wake_up_process(tsk); put_task_struct(tsk); woken++; - if (list_empty(&sem->wait_list)) + if (next == &sem->wait_list) break; waiter = list_entry(next, struct rwsem_waiter, list); - } + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); sem->activity += woken; diff --git a/lib/rwsem.c b/lib/rwsem.c index 0d50e46d5b0..9a675fa9d78 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -68,20 +68,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - goto readers_only; - - if (wake_type == RWSEM_WAKE_READ_OWNED) - /* Another active reader was observed, so wakeup is not - * likely to succeed. Save the atomic op. - */ + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type != RWSEM_WAKE_READ_OWNED) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); goto out; + } - /* Wake up the writing waiter and let the task grab the sem: */ - wake_up_process(waiter->task); - goto out; - - readers_only: /* If we come here from up_xxxx(), another thread might have reached * rwsem_down_failed_common() before we acquired the spinlock and * woken up a waiter, making it now active. We prefer to check for @@ -125,7 +122,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; - for (loop = woken; loop > 0; loop--) { + loop = woken; + do { waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; @@ -133,7 +131,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter->task = NULL; wake_up_process(tsk); put_task_struct(tsk); - } + } while (--loop); sem->wait_list.next = next; next->prev = &sem->wait_list; -- cgit v1.2.3-70-g09d2 From fe6e674c6187d4f452a679ced7e95262bd517936 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:59 -0700 Subject: rwsem: implement support for write lock stealing on the fastpath When we decide to wake up readers, we must first grant them as many read locks as necessary, and then actually wake up all these readers. But in order to know how many read shares to grant, we must first count the readers at the head of the queue. This might take a while if there are many readers, and we want to be protected against a writer stealing the lock while we're counting. To that end, we grant the first reader lock before counting how many more readers are queued. We also require some adjustments to the wake_type semantics. RWSEM_WAKE_NO_ACTIVE used to mean that we had found the count to be RWSEM_WAITING_BIAS, in which case the rwsem was known to be free as nobody could steal it while we hold the wait_lock. This doesn't make sense once we implement fastpath write lock stealing, so we now use RWSEM_WAKE_ANY in that case. 
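The grant-first-then-count approach can be reassembled from the __rwsem_do_wake() hunk further below as a sketch: the waker grants one read lock up front, and backs the grant out again if it observes that a writer got there first:

	adjustment = 0;
	if (wake_type != RWSEM_WAKE_READ_OWNED) {
		adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
			/* A writer stole the lock. Undo our reader grant. */
			if (rwsem_atomic_update(-adjustment, sem) &
						RWSEM_ACTIVE_MASK)
				goto out;	/* writer still active */
			/* Last active locker left. Retry waking readers. */
			goto try_reader_grant;
		}
	}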
Similarly, when rwsem_down_write_failed found that a read lock was active, it would use RWSEM_WAKE_READ_OWNED which signalled that new readers could be woken without checking first that the rwsem was available. We can't do that anymore since the existing readers might release their read locks, and a writer could steal the lock before we wake up additional readers. So, we have to use a new RWSEM_WAKE_READERS value to indicate we only want to wake readers, but we don't currently hold any read lock. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 64 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 9a675fa9d78..bbe48c04f36 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -4,6 +4,7 @@ * Derived from arch/i386/kernel/semaphore.c * * Writer lock-stealing by Alex Shi + * and Michel Lespinasse */ #include #include @@ -41,13 +42,11 @@ struct rwsem_waiter { enum rwsem_waiter_type type; }; -/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and - * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held - * since the rwsem value was observed. - */ -#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ -#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */ -#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */ +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; /* * handle the lock release when processes blocked on it that can now run @@ -60,16 +59,16 @@ struct rwsem_waiter { * - writers are only woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long woken, loop, adjustment; + signed long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type != RWSEM_WAKE_READ_OWNED) + if (wake_type == RWSEM_WAKE_ANY) /* Wake writer at the front of the queue, but do not * grant it the lock yet as we want other writers * to be able to steal it. Readers, on the other hand, @@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) goto out; } - /* If we come here from up_xxxx(), another thread might have reached - * rwsem_down_failed_common() before we acquired the spinlock and - * woken up a waiter, making it now active. We prefer to check for - * this first in order to not spend too much time with the spinlock - * held if we're not going to be able to wake up readers in the end. - * - * Note that we do not need to update the rwsem count: any writer - * trying to acquire rwsem will run rwsem_down_write_failed() due - * to the waiting threads and block trying to acquire the spinlock. - * - * We use a dummy atomic update in order to acquire the cache line - * exclusively since we expect to succeed and run the final rwsem - * count adjustment pretty soon. + /* Writers might steal the lock before we grant it to the next reader. 
+ * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. */ - if (wake_type == RWSEM_WAKE_ANY && - rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) - /* Someone grabbed the sem for write already */ - goto out; + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } /* Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by @@ -114,12 +113,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - adjustment = woken * RWSEM_ACTIVE_READ_BIAS; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; - rwsem_atomic_add(adjustment, sem); + if (adjustment) + rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; loop = woken; @@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(adjustment, sem); /* If there are no active locks, wake the front queued process(es). */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + if (!(count & RWSEM_ACTIVE_MASK)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); @@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS && adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); -- cgit v1.2.3-70-g09d2 From 25c39325968bbcebe6cd2a1991228c9dfb48d655 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:46:00 -0700 Subject: rwsem: do not block readers at head of queue if other readers are active This change fixes a race condition where a reader might determine it needs to block, but by the time it acquires the wait_lock the rwsem has active readers and no queued waiters. In this situation the reader can run in parallel with the existing active readers; it does not need to block until the active readers complete. Thanks to Peter Hurley for noticing this possible race. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index bbe48c04f36..61f91ca75e4 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -163,8 +163,14 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es). */ - if (!(count & RWSEM_ACTIVE_MASK)) + /* If there are no active locks, wake the front queued process(es). 
+ * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3-70-g09d2 From b5f541810ea9fb98d93c0ee0e00e07a22874856f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 06:46:02 -0700 Subject: rwsem: no need for explicit signed longs Change explicit "signed long" declarations into plain "long" as suggested by Peter Hurley. Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Michel Lespinasse Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 61f91ca75e4..cf0ad2ad19f 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -64,7 +64,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop, adjustment; + long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { @@ -145,10 +145,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; @@ -193,10 +192,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; -- cgit v1.2.3-70-g09d2 From 2d864e41710f1d2ba406fb62018ab0487152e6f2 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 7 May 2013 15:37:48 -0700 Subject: kref: minor cleanup - make warning smp-safe - result of atomic _unless_zero functions should be checked by caller to avoid use-after-free error - trivial whitespace fix. Link: https://lkml.org/lkml/2013/4/12/391 Tested: compile x86, boot machine and run xfstests Signed-off-by: Anatol Pomozov [ Removed line-break, changed to use WARN_ON_ONCE() - Linus ] Signed-off-by: Linus Torvalds --- include/linux/kref.h | 9 ++++++--- lib/kobject.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/kref.h b/include/linux/kref.h index 4972e6e9ca9..e15828fd71f 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -39,8 +39,11 @@ static inline void kref_init(struct kref *kref) */ static inline void kref_get(struct kref *kref) { - WARN_ON(!atomic_read(&kref->refcount)); - atomic_inc(&kref->refcount); + /* If refcount was 0 before incrementing then we have a race + * condition when this kref is freeing by some other thread right now. 
+ * In this case one should use kref_get_unless_zero() + */ + WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2); } /** @@ -100,7 +103,7 @@ static inline int kref_put_mutex(struct kref *kref, struct mutex *lock) { WARN_ON(release == NULL); - if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { + if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { mutex_lock(lock); if (unlikely(!atomic_dec_and_test(&kref->refcount))) { mutex_unlock(lock); diff --git a/lib/kobject.c b/lib/kobject.c index a65486613d7..b7e29a6056d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,7 +529,7 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } -static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; -- cgit v1.2.3-70-g09d2 From 9607a85b67a9714290a78c1a56630ab1c9fa2c23 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 15:39:03 -0700 Subject: rwsem: check counter to avoid cmpxchg calls This patch tries to reduce the number of cmpxchg calls in the writer failed path by checking the counter value first before issuing the instruction. If ->count is not set to RWSEM_WAITING_BIAS then there is no point wasting a cmpxchg call. Furthermore, Michel states "I suppose it helps due to the case where someone else steals the lock while we're trying to acquire sem->wait_lock." Two very different workloads and machines were used to see how this patch improves throughput: pgbench on a quad-core laptop and aim7 on a large 8 socket box with 80 cores. Some results comparing Michel's fast-path write lock stealing (tps-rwsem) on a quad-core laptop running pgbench:

| db_size | clients  |  tps-rwsem     |   tps-patch  |
+---------+----------+----------------+--------------+
| 160 MB  |        1 |           6906 |         9153 | + 32.5%
| 160 MB  |        2 |          15931 |        22487 | + 41.1%
| 160 MB  |        4 |          33021 |        32503 |
| 160 MB  |        8 |          34626 |        34695 |
| 160 MB  |       16 |          33098 |        34003 |
| 160 MB  |       20 |          31343 |        31440 |
| 160 MB  |       30 |          28961 |        28987 |
| 160 MB  |       40 |          26902 |        26970 |
| 160 MB  |       50 |          25760 |        25810 |
------------------------------------------------------
| 1.6 GB  |        1 |           7729 |         7537 |
| 1.6 GB  |        2 |          19009 |        23508 | + 23.7%
| 1.6 GB  |        4 |          33185 |        32666 |
| 1.6 GB  |        8 |          34550 |        34318 |
| 1.6 GB  |       16 |          33079 |        32689 |
| 1.6 GB  |       20 |          31494 |        31702 |
| 1.6 GB  |       30 |          28535 |        28755 |
| 1.6 GB  |       40 |          27054 |        27017 |
| 1.6 GB  |       50 |          25591 |        25560 |
------------------------------------------------------
| 7.6 GB  |        1 |           6224 |         7469 | + 20.0%
| 7.6 GB  |        2 |          13611 |        12778 |
| 7.6 GB  |        4 |          33108 |        32927 |
| 7.6 GB  |        8 |          34712 |        34878 |
| 7.6 GB  |       16 |          32895 |        33003 |
| 7.6 GB  |       20 |          31689 |        31974 |
| 7.6 GB  |       30 |          29003 |        28806 |
| 7.6 GB  |       40 |          26683 |        26976 |
| 7.6 GB  |       50 |          25925 |        25652 |
------------------------------------------------------

For the aim7 workloads, the results overall improved on top of Michel's patchset.
For full graphs on how the rwsem series plus this patch behaves on a large 8 socket machine against a vanilla kernel: http://stgolabs.net/rwsem-aim7-results.tar.gz Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index cf0ad2ad19f..19c5fa95e0b 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -223,7 +223,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = RWSEM_ACTIVE_WRITE_BIAS; if (!list_is_singular(&sem->wait_list)) count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) break; } -- cgit v1.2.3-70-g09d2
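Condensed, the guard added by this final patch issues the atomic instruction only when a plain load of the counter says it can succeed — a sketch of the resulting acquisition attempt:

	/* Try acquiring the write lock. */
	count = RWSEM_ACTIVE_WRITE_BIAS;
	if (!list_is_singular(&sem->wait_list))
		count += RWSEM_WAITING_BIAS;

	/* If ->count is not RWSEM_WAITING_BIAS, the cmpxchg would
	 * fail anyway; skip it and go back to waiting. */
	if (sem->count == RWSEM_WAITING_BIAS &&
	    cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
						RWSEM_WAITING_BIAS)
		break;	/* write lock acquired */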