From f792685006274a850e6cc0ea9ade275ccdfc90bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Mar 2013 18:05:46 +0100 Subject: math64: New div64_u64_rem helper Provide an extended version of div64_u64() that also returns the remainder of the division. We are going to need this to refine the cputime scaling code. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andrew Morton --- lib/div64.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/div64.c b/lib/div64.c index a163b6caef7..3af5728d95f 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,9 +79,10 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64 - unsigned 64bit divide with 64bit divisor + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder * @dividend: 64bit dividend * @divisor: 64bit divisor + * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof * can be found here and in the book 'Hacker's Delight', 2nd Edition. * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64 -u64 div64_u64(u64 dividend, u64 divisor) +#ifndef div64_u64_rem +u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - quot = div_u64(dividend, divisor); + u32 rem32; + quot = div_u64_rem(dividend, divisor, &rem32); + *remainder = rem32; } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - if ((dividend - quot * divisor) >= divisor) + + *remainder = dividend - quot * divisor; + if (*remainder >= divisor) { quot++; + *remainder -= divisor; + } } return quot; } -EXPORT_SYMBOL(div64_u64); +EXPORT_SYMBOL(div64_u64_rem); #endif /** -- cgit v1.2.3-70-g09d2 From cbe5e6109538ddab57764a88d9f0c2accd0c7d48 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 22:17:36 -0600 Subject: lru_cache: introduce lc_get_cumulative() New helper to be able to consolidate more updates into a single transaction. Without this, we can only grab a single refcount on an updated element while preparing a transaction. lc_get_cumulative - like lc_get; also finds to-be-changed elements @lc: the lru cache to operate on @enr: the label to look up Unlike lc_get(), this also returns the element for @enr if it belongs to a pending transaction, so the return values are like for lc_get(), plus: a pointer to an element already on the "to_be_changed" list. In this case, the cache was already marked %LC_DIRTY. The caller needs to make sure that the pending transaction is completed before proceeding to actually use this element. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Fixed up by Jens to export lc_get_cumulative().
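As an illustration of the new call (editor's sketch, not part of the original commit; the element array and count are made up, while lc_try_lock(), lc_committed() and lc_unlock() are the existing lru_cache primitives):

    if (lc_try_lock(lc)) {
        for (i = 0; i < n; i++)
            e[i] = lc_get_cumulative(lc, enr[i]);
        /* e[i] may now reference elements still on "to_be_changed";
         * prepare and commit ONE transaction covering all n updates */
        lc_committed(lc);
        lc_unlock(lc);
    }

With plain lc_get(), only a single to-be-changed element could be referenced while a transaction is being prepared.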
Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 1 + lib/lru_cache.c | 56 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 4019013c659..46262284de4 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc); extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); +extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2cc..4a83ecd0365 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) +/* used as internal flags to __lc_get */ +enum { + LC_GET_MAY_CHANGE = 1, + LC_GET_MAY_USE_UNCOMMITTED = 2, +}; + +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) { struct lc_element *e; @@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool * this enr is currently being pulled in already, * and will be available once the pending transaction * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + if (e) { + if (e->lc_new_number != e->lc_number) { + /* It has been found above, but on the "to_be_changed" + * list, not yet committed. Don't pull it in twice, + * wait for the transaction, then try again... + */ + if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) + RETURN(NULL); + /* ... unless the caller is aware of the implications, + * probably preparing a cumulative transaction. */ + ++e->refcnt; + ++lc->hits; + RETURN(e); + } + /* else: lc_new_number == lc_number; a real hit. */ ++lc->hits; if (e->refcnt++ == 0) lc->used++; list_move(&e->list, &lc->in_use); /* Not evictable... */ RETURN(e); } + /* e == NULL */ ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) + if (!(flags & LC_GET_MAY_CHANGE)) RETURN(NULL); /* To avoid races with lc_try_lock(), first, mark us dirty @@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool */ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 1); + return __lc_get(lc, enr, LC_GET_MAY_CHANGE); +} + +/** + * lc_get_cumulative - like lc_get; also finds to-be-changed elements + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Unlike lc_get this also returns the element for @enr, if it is belonging to + * a pending transaction, so the return values are like for lc_get(), + * plus: + * + * pointer to an element already on the "to_be_changed" list. + * In this case, the cache was already marked %LC_DIRTY. + * + * Caller needs to make sure that the pending transaction is completed, + * before proceeding to actually use this element. 
+ */ +struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); } /** @@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); EXPORT_SYMBOL(lc_is_used); +EXPORT_SYMBOL(lc_get_cumulative); -- cgit v1.2.3-70-g09d2 From 2db76d7c3c6db93058f983c8240f7c7c25e87ee6 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 26 Mar 2013 15:14:18 +0200 Subject: lib/scatterlist: sg_page_iter: support sg lists w/o backing pages The i915 driver uses sg lists for memory without backing 'struct page' pages, similarly to other IO memory regions, setting only the DMA address for these. It does this, so that it can program the HW MMU tables in a uniform way both for sg lists with and without backing pages. Without a valid page pointer we can't call nth_page to get the current page in __sg_page_iter_next, so add a helper that relevant users can call separately. Also add a helper to get the DMA address of the current page (idea from Daniel). Convert all places in i915, to use the new API. Signed-off-by: Imre Deak Reviewed-by: Damien Lespiau Signed-off-by: Daniel Vetter --- drivers/gpu/drm/drm_cache.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 8 ++++---- drivers/gpu/drm/i915/i915_gem_dmabuf.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 6 ++---- drivers/gpu/drm/i915/i915_gem_tiling.c | 4 ++-- include/linux/scatterlist.h | 28 +++++++++++++++++++++++----- lib/scatterlist.c | 4 +--- 8 files changed, 35 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index bc8edbeca3f..bb8f5801218 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -109,7 +109,7 @@ drm_clflush_sg(struct sg_table *st) mb(); for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) - drm_clflush_page(sg_iter.page); + drm_clflush_page(sg_page_iter_page(&sg_iter)); mb(); return; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1d091ea12fa..f69538508d8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1543,7 +1543,7 @@ static inline struct page *i915_gem_object_get_page(struct drm_i915_gem_object * struct sg_page_iter sg_iter; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, n) - return sg_iter.page; + return sg_page_iter_page(&sg_iter); return NULL; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a1123a32dc2..911bd40ef51 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -442,7 +442,7 @@ i915_gem_shmem_pread(struct drm_device *dev, for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); if (remain <= 0) break; @@ -765,7 +765,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); int partial_cacheline_write; if (remain <= 0) @@ -1647,7 +1647,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj) obj->dirty = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); if (obj->dirty) set_page_dirty(page); @@ -1827,7 +1827,7 @@ 
i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) err_pages: sg_mark_end(sg); for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) - page_cache_release(sg_iter.page); + page_cache_release(sg_page_iter_page(&sg_iter)); sg_free_table(st); kfree(st); return PTR_ERR(page); diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index 898615d2d5e..c6dfc1466e3 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -130,7 +130,7 @@ static void *i915_gem_dmabuf_vmap(struct dma_buf *dma_buf) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) - pages[i++] = sg_iter.page; + pages[i++] = sg_page_iter_page(&sg_iter); obj->dma_buf_vmapping = vmap(pages, i, 0, PAGE_KERNEL); drm_free_large(pages); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 4cbae7bbb83..24a23b31b55 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -123,8 +123,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt, for_each_sg_page(pages->sgl, &sg_iter, pages->nents, 0) { dma_addr_t page_addr; - page_addr = sg_dma_address(sg_iter.sg) + - (sg_iter.sg_pgoffset << PAGE_SHIFT); + page_addr = sg_page_iter_dma_address(&sg_iter); pt_vaddr[act_pte] = gen6_pte_encode(ppgtt->dev, page_addr, cache_level); if (++act_pte == I915_PPGTT_PT_ENTRIES) { @@ -424,8 +423,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev, dma_addr_t addr; for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) { - addr = sg_dma_address(sg_iter.sg) + - (sg_iter.sg_pgoffset << PAGE_SHIFT); + addr = sg_page_iter_dma_address(&sg_iter); iowrite32(gen6_pte_encode(dev, addr, level), &gtt_entries[i]); i++; } diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index f799708bcb8..c807eb93755 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -481,7 +481,7 @@ i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); char new_bit_17 = page_to_phys(page) >> 17; if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) { @@ -511,7 +511,7 @@ i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - if (page_to_phys(sg_iter.page) & (1 << 17)) + if (page_to_phys(sg_page_iter_page(&sg_iter)) & (1 << 17)) __set_bit(i, obj->bit_17); else __clear_bit(i, obj->bit_17); diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2d8bdaef961..e96b9546c4c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -235,13 +235,13 @@ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, - * @piter->page points to the current page, @piter->sg to the sg holding this - * page and @piter->sg_pgoffset to the page's page offset within the sg. The - * iteration will stop either when a maximum number of sg entries was reached - * or a terminating sg (sg_last(sg) == true) was reached. + * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter) + * to get the current page and its dma address.
@piter->sg will point to the + * sg holding this page and @piter->sg_pgoffset to the page's page offset + * within the sg. The iteration will stop either when a maximum number of sg + * entries was reached or a terminating sg (sg_last(sg) == true) was reached. */ struct sg_page_iter { - struct page *page; /* current page */ struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ @@ -255,6 +255,24 @@ bool __sg_page_iter_next(struct sg_page_iter *piter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); +/** + * sg_page_iter_page - get the current page held by the page iterator + * @piter: page iterator holding the page + */ +static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) +{ + return nth_page(sg_page(piter->sg), piter->sg_pgoffset); +} + +/** + * sg_page_iter_dma_address - get the dma address of the current page held by + * the page iterator. + * @piter: page iterator holding the page + */ +static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) +{ + return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT); +} /** * for_each_sg_page - iterate over the pages of the given sg list diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b83c144d731..a1cf8cae60e 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -401,7 +401,6 @@ void __sg_page_iter_start(struct sg_page_iter *piter, piter->__pg_advance = 0; piter->__nents = nents; - piter->page = NULL; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } @@ -426,7 +425,6 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) if (!--piter->__nents || !piter->sg) return false; } - piter->page = nth_page(sg_page(piter->sg), piter->sg_pgoffset); return true; } @@ -496,7 +494,7 @@ bool sg_miter_next(struct sg_mapping_iter *miter) miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } - miter->page = miter->piter.page; + miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) -- cgit v1.2.3-70-g09d2 From 0ecc833bac594099505a090cbca6ccd5b83d5975 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Mar 2013 12:23:28 -0400 Subject: mode_t, whack-a-mole at 11... 
Signed-off-by: Al Viro --- drivers/net/wireless/ath/wil6210/debugfs.c | 4 ++-- drivers/staging/dgrp/dgrp_common.h | 2 +- drivers/staging/dgrp/dgrp_specproc.c | 2 +- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 2 +- fs/proc/self.c | 2 +- lib/notifier-error-inject.c | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c index 65fc9683bfd..76c7694518d 100644 --- a/drivers/net/wireless/ath/wil6210/debugfs.c +++ b/drivers/net/wireless/ath/wil6210/debugfs.c @@ -216,7 +216,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_iomem_x32, wil_debugfs_iomem_x32_get, wil_debugfs_iomem_x32_set, "0x%08llx\n"); static struct dentry *wil_debugfs_create_iomem_x32(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void __iomem *value) { @@ -367,7 +367,7 @@ static const struct file_operations fops_ioblob = { static struct dentry *wil_debugfs_create_ioblob(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { diff --git a/drivers/staging/dgrp/dgrp_common.h b/drivers/staging/dgrp/dgrp_common.h index 0583fe9c7b0..fde116236e9 100644 --- a/drivers/staging/dgrp/dgrp_common.h +++ b/drivers/staging/dgrp/dgrp_common.h @@ -120,7 +120,7 @@ enum { struct dgrp_proc_entry { int id; /* Integer identifier */ const char *name; /* ASCII identifier */ - mode_t mode; /* File access permissions */ + umode_t mode; /* File access permissions */ struct dgrp_proc_entry *child; /* Child pointer */ /* file ops to use, pass NULL to use default */ diff --git a/drivers/staging/dgrp/dgrp_specproc.c b/drivers/staging/dgrp/dgrp_specproc.c index 73f287f9660..dddf8a2e396 100644 --- a/drivers/staging/dgrp/dgrp_specproc.c +++ b/drivers/staging/dgrp/dgrp_specproc.c @@ -228,7 +228,7 @@ static void register_proc_table(struct dgrp_proc_entry *table, { struct proc_dir_entry *de; int len; - mode_t mode; + umode_t mode; if (table == NULL) return; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 137af4255da..44abc2f286e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct posix_acl *acl; int error; - mode_t mode = get_inode_mode(inode); + umode_t mode = get_inode_mode(inode); if (!test_opt(sbi, POSIX_ACL)) return 0; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ece..1be948768e2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,7 +60,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/proc/self.c b/fs/proc/self.c index aa5cc3bff14..d8a02529661 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -51,7 +51,7 @@ static const struct inode_operations proc_self_inode_operations = { void __init proc_self_init(void) { struct proc_dir_entry *proc_self_symlink; - mode_t mode; + umode_t mode; mode = S_IFLNK | S_IRWXUGO; proc_self_symlink = proc_create("self", mode, NULL, NULL ); diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c index 44b92cb6224..eb4a04afea8 100644 --- a/lib/notifier-error-inject.c +++ b/lib/notifier-error-inject.c @@ -17,7 +17,7 @@ static int debugfs_errno_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set, "%lld\n"); -static struct dentry 
*debugfs_create_errno(const char *name, mode_t mode, +static struct dentry *debugfs_create_errno(const char *name, umode_t mode, struct dentry *parent, int *value) { return debugfs_create_file(name, mode, parent, value, &fops_errno); } @@ -50,7 +50,7 @@ struct dentry *notifier_err_inject_init(const char *name, struct dentry *parent, struct notifier_err_inject *err_inject, int priority) { struct notifier_err_inject_action *action; - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; struct dentry *actions_dir; -- cgit v1.2.3-70-g09d2 From a49b7e82cab0f9b41f483359be83f44fbb6b4979 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Apr 2013 15:15:30 -0700 Subject: kobject: fix kset_find_obj() race with concurrent last kobject_put() Anatol Pomozov identified a race condition that hits module unloading and re-loading. To quote Anatol: "This is a race condition that exists between kset_find_obj() and kobject_put(). kset_find_obj() might return a kobject with a refcount equal to 0 if that kobject is being freed by kobject_put() in another thread. Here is the timeline for the crash, in the case where kset_find_obj() searches for an object that nobody holds while another thread is doing kobject_put() on the same kobject:

    THREAD A (calls kset_find_obj())     THREAD B (calls kobject_put())
    spin_lock()
                                         atomic_dec_return(kobj->kref), counter gets zero here
                                         ... starts kobject cleanup ....
                                         spin_lock() // WAIT thread A in kobj_kset_leave()
    iterate over kset->list
    atomic_inc(kobj->kref) (counter becomes 1)
    spin_unlock()
                                         spin_lock() // taken
                                         // it does not know that thread A increased the counter,
                                         // so it removes the obj from the list
                                         spin_unlock()
                                         vfree(module) // frees module object with containing kobj

    // kobj points to freed memory area!!
    kobject_put(kobj) // OOPS!!!!

The race above happens because module.c tries to use kset_find_obj() when somebody unloads a module. The module.c code was introduced in commit 6494a93d55fa" Anatol supplied a patch specific to module.c that worked around the problem by simply not using kset_find_obj() at all, but rather than make a local band-aid, this just fixes kset_find_obj() to be thread-safe, using the proper model of refusing to get a new reference if the refcount has already dropped to zero. See examples of this proper refcount handling not only in the kref documentation, but in various other equivalent uses of this pattern by grepping for atomic_inc_not_zero(). [ Side note: the module race does indicate that module loading and unloading is not properly serialized wrt sysfs information using the module mutex. That may require further thought, but this is the correct fix at the kobject layer regardless. ] Reported-analyzed-and-tested-by: Anatol Pomozov Cc: Greg Kroah-Hartman Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/kobject.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index e07ee1fcd6f..a65486613d7 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,6 +529,13 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } +static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +{ + if (!kref_get_unless_zero(&kobj->kref)) + kobj = NULL; + return kobj; +} + /* * kobject_cleanup - free kobject resources.
* @kobj: object to cleanup @@ -751,7 +758,7 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { - ret = kobject_get(k); + ret = kobject_get_unless_zero(k); break; } } -- cgit v1.2.3-70-g09d2 From 0635eb8a54cf0fea64b174bb68bc36b9c3d622db Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 15 Apr 2013 13:09:45 -0700 Subject: Move utf16 functions to kernel core and rename We want to be able to use the utf16 functions that are currently present in the EFI variables code in platform-specific code as well. Move them to the kernel core, and in the process rename them to accurately describe what they do - they don't handle UTF16, only UCS2. Signed-off-by: Matthew Garrett Signed-off-by: Matt Fleming --- drivers/firmware/Kconfig | 1 + drivers/firmware/efivars.c | 80 ++++++++++----------------------------------- include/linux/ucs2_string.h | 14 ++++++++ lib/Kconfig | 3 ++ lib/Makefile | 2 ++ lib/ucs2_string.c | 51 +++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 62 deletions(-) create mode 100644 include/linux/ucs2_string.h create mode 100644 lib/ucs2_string.c (limited to 'lib') diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 42c759a4d04..3e532002e4d 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -39,6 +39,7 @@ config FIRMWARE_MEMMAP config EFI_VARS tristate "EFI Variable Support via sysfs" depends on EFI + select UCS2_STRING default n help If you say Y here, you are able to get EFI (Extensible Firmware diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index bf15d81d74e..182ce947117 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -172,51 +173,6 @@ static void efivar_update_sysfs_entries(struct work_struct *); static DECLARE_WORK(efivar_work, efivar_update_sysfs_entries); static bool efivar_wq_enabled = true; -/* Return the number of unicode characters in data */ -static unsigned long -utf16_strnlen(efi_char16_t *s, size_t maxlength) -{ - unsigned long length = 0; - - while (*s++ != 0 && length < maxlength) - length++; - return length; -} - -static inline unsigned long -utf16_strlen(efi_char16_t *s) -{ - return utf16_strnlen(s, ~0UL); -} - -/* - * Return the number of bytes is the length of this string - * Note: this is NOT the same as the number of unicode characters - */ -static inline unsigned long -utf16_strsize(efi_char16_t *data, unsigned long maxlength) -{ - return utf16_strnlen(data, maxlength/sizeof(efi_char16_t)) * sizeof(efi_char16_t); -} - -static inline int -utf16_strncmp(const efi_char16_t *a, const efi_char16_t *b, size_t len) -{ - while (1) { - if (len == 0) - return 0; - if (*a < *b) - return -1; - if (*a > *b) - return 1; - if (*a == 0) /* implies *b == 0 */ - return 0; - a++; - b++; - len--; - } -} - static bool validate_device_path(struct efi_variable *var, int match, u8 *buffer, unsigned long len) @@ -268,7 +224,7 @@ validate_load_option(struct efi_variable *var, int match, u8 *buffer, u16 filepathlength; int i, desclength = 0, namelen; - namelen = utf16_strnlen(var->VariableName, sizeof(var->VariableName)); + namelen = ucs2_strnlen(var->VariableName, sizeof(var->VariableName)); /* Either "Boot" or "Driver" followed by four digits of hex */ for (i = match; i < match+4; i++) { @@ -291,7 +247,7 @@ validate_load_option(struct efi_variable *var, int match, u8 *buffer, * There's no 
stored length for the description, so it has to be * found by hand */ - desclength = utf16_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2; + desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2; /* Each boot entry must have a descriptor */ if (!desclength) @@ -581,7 +537,7 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count) spin_lock_irq(&efivars->lock); status = check_var_size_locked(efivars, new_var->Attributes, - new_var->DataSize + utf16_strsize(new_var->VariableName, 1024)); + new_var->DataSize + ucs2_strsize(new_var->VariableName, 1024)); if (status == EFI_SUCCESS || status == EFI_UNSUPPORTED) status = efivars->ops->set_variable(new_var->VariableName, @@ -759,7 +715,7 @@ static ssize_t efivarfs_file_write(struct file *file, * QueryVariableInfo() isn't supported by the firmware. */ - varsize = datasize + utf16_strsize(var->var.VariableName, 1024); + varsize = datasize + ucs2_strsize(var->var.VariableName, 1024); status = check_var_size(efivars, attributes, varsize); if (status != EFI_SUCCESS) { @@ -1211,7 +1167,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) inode = NULL; - len = utf16_strlen(entry->var.VariableName); + len = ucs2_strlen(entry->var.VariableName); /* name, plus '-', plus GUID, plus NUL*/ name = kmalloc(len + 1 + GUID_LEN + 1, GFP_ATOMIC); @@ -1469,8 +1425,8 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, if (efi_guidcmp(entry->var.VendorGuid, vendor)) continue; - if (utf16_strncmp(entry->var.VariableName, efi_name, - utf16_strlen(efi_name))) { + if (ucs2_strncmp(entry->var.VariableName, efi_name, + ucs2_strlen(efi_name))) { /* * Check if an old format, * which doesn't support holding @@ -1482,8 +1438,8 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, for (i = 0; i < DUMP_NAME_LEN; i++) efi_name_old[i] = name_old[i]; - if (utf16_strncmp(entry->var.VariableName, efi_name_old, - utf16_strlen(efi_name_old))) + if (ucs2_strncmp(entry->var.VariableName, efi_name_old, + ucs2_strlen(efi_name_old))) continue; } @@ -1561,8 +1517,8 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, * Does this variable already exist? */ list_for_each_entry_safe(search_efivar, n, &efivars->list, list) { - strsize1 = utf16_strsize(search_efivar->var.VariableName, 1024); - strsize2 = utf16_strsize(new_var->VariableName, 1024); + strsize1 = ucs2_strsize(search_efivar->var.VariableName, 1024); + strsize2 = ucs2_strsize(new_var->VariableName, 1024); if (strsize1 == strsize2 && !memcmp(&(search_efivar->var.VariableName), new_var->VariableName, strsize1) && @@ -1578,7 +1534,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, } status = check_var_size_locked(efivars, new_var->Attributes, - new_var->DataSize + utf16_strsize(new_var->VariableName, 1024)); + new_var->DataSize + ucs2_strsize(new_var->VariableName, 1024)); if (status && status != EFI_UNSUPPORTED) { spin_unlock_irq(&efivars->lock); @@ -1602,7 +1558,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, /* Create the entry in sysfs. Locking is not required here */ status = efivar_create_sysfs_entry(efivars, - utf16_strsize(new_var->VariableName, + ucs2_strsize(new_var->VariableName, 1024), new_var->VariableName, &new_var->VendorGuid); @@ -1632,8 +1588,8 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, * Does this variable already exist? 
*/ list_for_each_entry_safe(search_efivar, n, &efivars->list, list) { - strsize1 = utf16_strsize(search_efivar->var.VariableName, 1024); - strsize2 = utf16_strsize(del_var->VariableName, 1024); + strsize1 = ucs2_strsize(search_efivar->var.VariableName, 1024); + strsize2 = ucs2_strsize(del_var->VariableName, 1024); if (strsize1 == strsize2 && !memcmp(&(search_efivar->var.VariableName), del_var->VariableName, strsize1) && @@ -1679,9 +1635,9 @@ static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor) unsigned long strsize1, strsize2; bool found = false; - strsize1 = utf16_strsize(variable_name, 1024); + strsize1 = ucs2_strsize(variable_name, 1024); list_for_each_entry_safe(entry, n, &efivars->list, list) { - strsize2 = utf16_strsize(entry->var.VariableName, 1024); + strsize2 = ucs2_strsize(entry->var.VariableName, 1024); if (strsize1 == strsize2 && !memcmp(variable_name, &(entry->var.VariableName), strsize2) && diff --git a/include/linux/ucs2_string.h b/include/linux/ucs2_string.h new file mode 100644 index 00000000000..cbb20afdbc0 --- /dev/null +++ b/include/linux/ucs2_string.h @@ -0,0 +1,14 @@ +#ifndef _LINUX_UCS2_STRING_H_ +#define _LINUX_UCS2_STRING_H_ + +#include <linux/types.h> /* for size_t */ +#include <linux/stddef.h> /* for NULL */ + +typedef u16 ucs2_char_t; + +unsigned long ucs2_strnlen(const ucs2_char_t *s, size_t maxlength); +unsigned long ucs2_strlen(const ucs2_char_t *s); +unsigned long ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength); +int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len); + +#endif /* _LINUX_UCS2_STRING_H_ */ diff --git a/lib/Kconfig b/lib/Kconfig index 3958dc4389f..fe01d418b09 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -404,4 +404,7 @@ config OID_REGISTRY help Enable fast lookup object identifier registry.
+config UCS2_STRING + tristate + endmenu diff --git a/lib/Makefile b/lib/Makefile index d7946ff75b2..6e2cc561f76 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -174,3 +174,5 @@ quiet_cmd_build_OID_registry = GEN $@ cmd_build_OID_registry = perl $(srctree)/$(src)/build_OID_registry $< $@ clean-files += oid_registry_data.c + +obj-$(CONFIG_UCS2_STRING) += ucs2_string.o diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c new file mode 100644 index 00000000000..6f500ef2301 --- /dev/null +++ b/lib/ucs2_string.c @@ -0,0 +1,51 @@ +#include <linux/ucs2_string.h> +#include <linux/module.h> + +/* Return the number of unicode characters in data */ +unsigned long +ucs2_strnlen(const ucs2_char_t *s, size_t maxlength) +{ + unsigned long length = 0; + + while (*s++ != 0 && length < maxlength) + length++; + return length; +} +EXPORT_SYMBOL(ucs2_strnlen); + +unsigned long +ucs2_strlen(const ucs2_char_t *s) +{ + return ucs2_strnlen(s, ~0UL); +} +EXPORT_SYMBOL(ucs2_strlen); + +/* + * Return the number of bytes is the length of this string + * Note: this is NOT the same as the number of unicode characters + */ +unsigned long +ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength) +{ + return ucs2_strnlen(data, maxlength/sizeof(ucs2_char_t)) * sizeof(ucs2_char_t); +} +EXPORT_SYMBOL(ucs2_strsize); + +int +ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len) +{ + while (1) { + if (len == 0) + return 0; + if (*a < *b) + return -1; + if (*a > *b) + return 1; + if (*a == 0) /* implies *b == 0 */ + return 0; + a++; + b++; + len--; + } +} +EXPORT_SYMBOL(ucs2_strncmp); -- cgit v1.2.3-70-g09d2 From c729de8fcea37a1c444e81857eace12494c804a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:45 -0700 Subject: x86, kdump: Set crashkernel_low automatically Chao said that kdump does work well on his system on 3.8 without an extra parameter, even though the iommu does not work with kdump. But now he has to append crashkernel_low=Y to the first kernel's command line to make kdump work. We have now modified crashkernel=X to allocate memory beyond 4G (if available) and to not allocate a low range for the crash kernel if the user does not request one with crashkernel_low=Y. This causes a regression if the iommu is not enabled: without an iommu, swiotlb needs to be set up in the first 4G, and there is no low memory available to the second kernel. Set crashkernel_low automatically if the user does not specify it. For systems that do support IOMMU with kdump properly, the user can specify crashkernel_low=0 to save that 72M of low RAM. -v3: add swiotlb_size() according to Konrad. -v4: add comments on what the 8M is for according to hpa; also update more crashkernel_low= references in kernel-parameters.txt. -v5: update changelog according to Vivek. -v6: change the description about the swiotlb reference according to HATAYAMA. Reported-by: WANG Chao Tested-by: WANG Chao Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-2-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 14 +++++++++++--- arch/x86/kernel/setup.c | 21 ++++++++++++++++++--- include/linux/swiotlb.h | 1 + lib/swiotlb.c | 19 +++++++++++++++---- 4 files changed, 45 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc3..cff672da248 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -596,9 +596,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. is selected automatically.
Check Documentation/kdump/kdump.txt for further details. - crashkernel_low=size[KMG] - [KNL, x86] parts under 4G. - crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is @@ -606,6 +603,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. + crashkernel_low=size[KMG] + [KNL, x86_64] range under 4G. When crashkernel= is + passed, kernel allocate physical memory region + above 4G, that cause second kernel crash on system + that require some amount of low memory, e.g. swiotlb + requires at least 64M+32K low memory. Kernel would + try to allocate 72M below 4G automatically. + This one let user to specify own low range under 4G + for second kernel instead. + 0: to disable low allocation. + cs89x0_dma= [HW,NET] Format: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 90d8cc930f5..12349202cae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -521,19 +521,34 @@ static void __init reserve_crashkernel_low(void) unsigned long long low_base = 0, low_size = 0; unsigned long total_low_mem; unsigned long long base; + bool auto_set = false; int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); - if (ret != 0 || low_size <= 0) - return; + if (ret != 0) { + /* + * two parts from lib/swiotlb.c: + * swiotlb size: user specified with swiotlb= or default. + * swiotlb overflow buffer: now is hardcoded to 32k. + * We round it to 8M for other buffers that + * may need to stay low too. + */ + low_size = swiotlb_size_or_default() + (8UL<<20); + auto_set = true; + } else { + /* passed with crashkernel_low=0 ? */ + if (!low_size) + return; + } low_base = memblock_find_in_range(low_size, (1ULL<<32), low_size, alignment); if (!low_base) { - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + if (!auto_set) + pr_info("crashkernel low reservation failed - No suitable area found.\n"); return; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 2de42f9401d..a5ffd32642f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -25,6 +25,7 @@ extern int swiotlb_force; extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); +unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); /* diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bfe02b8fc55..d23762e6652 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -105,9 +105,9 @@ setup_io_tlb_npages(char *str) if (!strcmp(str, "force")) swiotlb_force = 1; - return 1; + return 0; } -__setup("swiotlb=", setup_io_tlb_npages); +early_param("swiotlb", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ unsigned long swiotlb_nr_tbl(void) @@ -115,6 +115,18 @@ unsigned long swiotlb_nr_tbl(void) return io_tlb_nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? 
size : (IO_TLB_DEFAULT_SIZE); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -188,8 +200,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - /* default to 64MB */ - size_t default_size = 64UL<<20; + size_t default_size = IO_TLB_DEFAULT_SIZE; unsigned char *vstart; unsigned long bytes; -- cgit v1.2.3-70-g09d2 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/init.c | 3 +++ arch/ia64/mm/contig.c | 2 ++ arch/ia64/mm/discontig.c | 2 ++ arch/parisc/mm/init.c | 2 ++ arch/unicore32/mm/init.c | 3 +++ include/linux/mm.h | 3 ++- lib/show_mem.c | 3 +++ mm/page_alloc.c | 7 +++++++ 8 files changed, 24 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index ad722f1208a..ad9a9f3f032 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -99,6 +99,9 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_bank (i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 80dab509dfb..67c59ebec89 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -47,6 +47,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); printk(KERN_INFO "Node memory in pages:\n"); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; for_each_online_pgdat(pgdat) { unsigned long present; unsigned long flags; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c2e955ee79a..a57436e5d40 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -623,6 +623,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; printk(KERN_INFO "Node memory in pages:\n"); for_each_online_pgdat(pgdat) { unsigned long present; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3ac462de53a..cf2da13c41e 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -697,6 +697,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; #ifndef CONFIG_DISCONTIGMEM i = max_mapnr; while (i-- > 0) { diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index de186bde897..644482882ba 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -66,6 +66,9 @@ void show_mem(unsigned int filter) printk(KERN_DEFAULT "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + 
for_each_bank(i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/include/linux/mm.h b/include/linux/mm.h index e2091b88d24..f3c7b1f9d1d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -899,7 +899,8 @@ extern void pagefault_out_of_memory(void); * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 4407f8c9b1f..b7c72311ad0 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,6 +18,9 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_online_pgdat(pgdat) { unsigned long i, flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ff1536f01b..da7a2fe7332 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2002,6 +2002,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) debug_guardpage_minorder() > 0) return; + /* + * Walking all memory to count page types is very expensive and should + * be inhibited in non-blockable contexts. + */ + if (!(gfp_mask & __GFP_WAIT)) + filter |= SHOW_MEM_FILTER_PAGE_COUNT; + /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set -- cgit v1.2.3-70-g09d2 From 9375db07adeaeea5f5ea7ca0463a8b371d71ddbb Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 29 Apr 2013 16:17:10 -0700 Subject: genalloc: add devres support, allow to find a managed pool by device This patch adds three exported functions to lib/genalloc.c: devm_gen_pool_create, dev_get_gen_pool, and of_get_named_gen_pool. devm_gen_pool_create is a managed version of gen_pool_create that keeps track of the pool via devres and allows the management code to automatically destroy it after device removal. dev_get_gen_pool retrieves the gen_pool for a given device, if it was created with devm_gen_pool_create, using devres_find. of_get_named_gen_pool retrieves the gen_pool for a given device node and property name, where the property must contain a phandle pointing to a platform device node. The corresponding platform device is then fed into dev_get_gen_pool and the resulting gen_pool is returned. 
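A sketch of the intended usage (editor's illustration; the probe context, the "iram" property name and the order value 5 are hypothetical, the helper signatures are the ones added by this patch):

    /* provider: create a pool in probe(); devres destroys it on removal */
    struct gen_pool *pool = devm_gen_pool_create(&pdev->dev, 5, -1);

    /* consumer: resolve another device's pool through a DT phandle */
    struct gen_pool *iram = of_get_named_gen_pool(dev->of_node, "iram", 0);
    if (!pool || !iram)
        return -ENODEV;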
[akpm@linux-foundation.org: make the of_get_named_gen_pool() stub static, fixing a zillion link errors] [akpm@linux-foundation.org: squish "struct device declared inside parameter list" warning] Signed-off-by: Philipp Zabel Acked-by: Grant Likely Tested-by: Michal Simek Cc: Fabio Estevam Cc: Matt Porter Cc: Dong Aisheng Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Paul Gortmaker Cc: Javier Martin Cc: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 18 +++++++++++ lib/genalloc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index dd7c569aaca..661d374aeb2 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -29,6 +29,10 @@ #ifndef __GENALLOC_H__ #define __GENALLOC_H__ + +struct device; +struct device_node; + /** * Allocation callback function type definition * @map: Pointer to bitmap @@ -105,4 +109,18 @@ extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); +extern struct gen_pool *devm_gen_pool_create(struct device *dev, + int min_alloc_order, int nid); +extern struct gen_pool *dev_get_gen_pool(struct device *dev); + +#ifdef CONFIG_OF +extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index); +#else +static inline struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index) +{ + return NULL; +} +#endif #endif /* __GENALLOC_H__ */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 54920433705..b35cfa9bc3d 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { @@ -480,3 +482,82 @@ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, return start_bit; } EXPORT_SYMBOL(gen_pool_best_fit); + +static void devm_gen_pool_release(struct device *dev, void *res) +{ + gen_pool_destroy(*(struct gen_pool **)res); +} + +/** + * devm_gen_pool_create - managed gen_pool_create + * @dev: device that provides the gen_pool + * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents + * @nid: node id of the node the pool structure should be allocated on, or -1 + * + * Create a new special memory pool that can be used to manage special purpose + * memory not managed by the regular kmalloc/kfree interface. The pool will be + * automatically destroyed by the device management code. + */ +struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, + int nid) +{ + struct gen_pool **ptr, *pool; + + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); + + pool = gen_pool_create(min_alloc_order, nid); + if (pool) { + *ptr = pool; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return pool; +} + +/** + * dev_get_gen_pool - Obtain the gen_pool (if any) for a device + * @dev: device to retrieve the gen_pool from + * @name: Optional name for the gen_pool, usually NULL + * + * Returns the gen_pool for the device if one is present, or NULL. 
+ */ +struct gen_pool *dev_get_gen_pool(struct device *dev) +{ + struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, + NULL); + + if (!p) + return NULL; + return *p; +} +EXPORT_SYMBOL_GPL(dev_get_gen_pool); + +#ifdef CONFIG_OF +/** + * of_get_named_gen_pool - find a pool by phandle property + * @np: device node + * @propname: property name containing phandle(s) + * @index: index into the phandle array + * + * Returns the pool that contains the chunk starting at the physical + * address of the device tree node pointed at by the phandle property, + * or NULL if not found. + */ +struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index) +{ + struct platform_device *pdev; + struct device_node *np_pool; + + np_pool = of_parse_phandle(np, propname, index); + if (!np_pool) + return NULL; + pdev = of_find_device_by_node(np_pool); + if (!pdev) + return NULL; + return dev_get_gen_pool(&pdev->dev); +} +EXPORT_SYMBOL_GPL(of_get_named_gen_pool); +#endif /* CONFIG_OF */ -- cgit v1.2.3-70-g09d2 From 30493cc9dddb68066dcc4878015660fdaa8e0965 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Apr 2013 16:18:09 -0700 Subject: lib/int_sqrt.c: optimize square root algorithm Optimize the current version of the shift-and-subtract (hardware) algorithm, described by John von Neumann[1] and Guy L. Steele. Iterating 1,000,000 times, perf shows for the current version:

 Performance counter stats for './sqrt-curr' (10 runs):

        27.170996 task-clock               #  0.979 CPUs utilized            ( +- 3.19% )
                3 context-switches         #  0.103 K/sec                    ( +- 4.76% )
                0 cpu-migrations           #  0.004 K/sec                    ( +-100.00% )
              104 page-faults              #  0.004 M/sec                    ( +- 0.16% )
       64,921,199 cycles                   #  2.389 GHz                      ( +- 0.03% )
       28,967,789 stalled-cycles-frontend  # 44.62% frontend cycles idle     ( +- 0.18% )
  <not supported> stalled-cycles-backend
      104,502,623 instructions             #  1.61  insns per cycle
                                           #  0.28  stalled cycles per insn  ( +- 0.00% )
       34,088,368 branches                 # 1254.587 M/sec                  ( +- 0.00% )
            4,901 branch-misses            #  0.01% of all branches          ( +- 1.32% )

      0.027763015 seconds time elapsed                                       ( +- 3.22% )

And for the new version:

 Performance counter stats for './sqrt-new' (10 runs):

         0.496869 task-clock               #  0.519 CPUs utilized            ( +- 2.38% )
                0 context-switches         #  0.000 K/sec
                0 cpu-migrations           #  0.403 K/sec                    ( +-100.00% )
              104 page-faults              #  0.209 M/sec                    ( +- 0.15% )
          590,760 cycles                   #  1.189 GHz                      ( +- 2.35% )
          395,053 stalled-cycles-frontend  # 66.87% frontend cycles idle     ( +- 3.67% )
  <not supported> stalled-cycles-backend
          398,963 instructions             #  0.68  insns per cycle
                                           #  0.99  stalled cycles per insn  ( +- 0.39% )
           70,228 branches                 # 141.341 M/sec                   ( +- 0.36% )
            3,364 branch-misses            #  4.79% of all branches          ( +- 5.45% )

      0.000957440 seconds time elapsed                                       ( +- 2.42% )

Furthermore, this saves space in instruction text:

   text    data     bss     dec     hex filename
    111       0       0     111      6f lib/int_sqrt-baseline.o
     89       0       0      89      59 lib/int_sqrt.o

[1] http://en.wikipedia.org/wiki/First_Draft_of_a_Report_on_the_EDVAC Signed-off-by: Davidlohr Bueso Reviewed-by: Jonathan Gonzalez Tested-by: Jonathan Gonzalez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index fc2eeb7cb2e..1ef4cc34497 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -1,3 +1,9 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele.
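+ * + * Editor's illustration (not in the original patch): both the old and the + * new loop compute the integer part of the square root, e.g. + * int_sqrt(35) == 5 and int_sqrt(36) == 6.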
+ */ #include <linux/kernel.h> #include <linux/export.h> @@ -10,23 +10,23 @@ */ unsigned long int_sqrt(unsigned long x) { - unsigned long op, res, one; + unsigned long b, m, y = 0; - op = x; - res = 0; + if (x <= 1) + return x; - one = 1UL << (BITS_PER_LONG - 2); - while (one > op) - one >>= 2; + m = 1UL << (BITS_PER_LONG - 2); + while (m != 0) { + b = y + m; + y >>= 1; - while (one != 0) { - if (op >= res + one) { - op = op - (res + one); - res = res + 2 * one; + if (x >= b) { + x -= b; + y += m; } - res /= 2; - one /= 4; + m >>= 2; } - return res; + + return y; } EXPORT_SYMBOL(int_sqrt); -- cgit v1.2.3-70-g09d2 From 095d141b2e40665289d44a42d387ffac2841ef82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 16:18:10 -0700 Subject: argv_split(): teach it to handle mutable strings argv_split() allocates the argv[count_argc(str)] array and assumes that it will find the same number of arguments later. This is obviously wrong if the string can be changed, say, by sysctl. With this patch argv_split() kstrndup's the whole string and does not split it into separately allocated words; we simply replace the spaces with zeroes and keep the allocated memory in argv[-1] for argv_free(argv). We do not use argv[0] because: - str can be all-spaces or empty. In fact this case is fine, we could kfree() it before return, but: - str can have a space at the start, and we can not rely on kstrndup(skip_spaces(str)) because it can equally race if this string is mutable. Also, simplify count_argc() and kill the no longer used skip_arg(). Signed-off-by: Oleg Nesterov Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/argv_split.c | 87 ++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/lib/argv_split.c b/lib/argv_split.c index 1e9a6cbc368..e927ed0e18a 100644 --- a/lib/argv_split.c +++ b/lib/argv_split.c @@ -8,23 +8,17 @@ #include <linux/slab.h> #include <linux/export.h> -static const char *skip_arg(const char *cp) -{ - while (*cp && !isspace(*cp)) - cp++; - - return cp; -} - static int count_argc(const char *str) { int count = 0; + bool was_space; - while (*str) { - str = skip_spaces(str); - if (*str) { + for (was_space = true; *str; str++) { + if (isspace(*str)) { + was_space = true; + } else if (was_space) { + was_space = false; count++; - str = skip_arg(str); } } @@ -39,10 +33,8 @@ */ void argv_free(char **argv) { - char **p; - for (p = argv; *p; p++) - kfree(*p); - + argv--; + kfree(argv[0]); kfree(argv); } EXPORT_SYMBOL(argv_free); @@ -59,43 +51,44 @@ EXPORT_SYMBOL(argv_free); * considered to be a single argument separator. The returned array * is always NULL-terminated. Returns NULL on memory allocation * failure. + * + * The source string at `str' may be undergoing concurrent alteration via + * userspace sysctl activity (at least). The argv_split() implementation + * attempts to handle this gracefully by taking a local copy to work on.
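+ * + * Example (editor's illustration, not in the original patch): + * argv_split(GFP_KERNEL, "  ls  -l /tmp ", &argc) returns + * { "ls", "-l", "/tmp", NULL } with argc == 3.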
*/ char **argv_split(gfp_t gfp, const char *str, int *argcp) { - int argc = count_argc(str); - char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); - char **argvp; - - if (argv == NULL) - goto out; - - if (argcp) - *argcp = argc; - - argvp = argv; - - while (*str) { - str = skip_spaces(str); - - if (*str) { - const char *p = str; - char *t; - - str = skip_arg(str); + char *argv_str; + bool was_space; + char **argv, **argv_ret; + int argc; + + argv_str = kstrndup(str, KMALLOC_MAX_SIZE - 1, gfp); + if (!argv_str) + return NULL; + + argc = count_argc(argv_str); + argv = kmalloc(sizeof(*argv) * (argc + 2), gfp); + if (!argv) { + kfree(argv_str); + return NULL; + } - t = kstrndup(p, str-p, gfp); - if (t == NULL) - goto fail; - *argvp++ = t; + *argv = argv_str; + argv_ret = ++argv; + for (was_space = true; *argv_str; argv_str++) { + if (isspace(*argv_str)) { + was_space = true; + *argv_str = 0; + } else if (was_space) { + was_space = false; + *argv++ = argv_str; } } - *argvp = NULL; - - out: - return argv; + *argv = NULL; - fail: - argv_free(argv); - return NULL; + if (argcp) + *argcp = argc; + return argv_ret; } EXPORT_SYMBOL(argv_split); -- cgit v1.2.3-70-g09d2 From 2e0fb404c86d6c86dc33e284310eb5d28d192dcf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2013 16:18:11 -0700 Subject: lib, net: make isodigit() public and use it There are at least two users of isodigit(). Let's make it a public function of ctype.h. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ctype.h | 6 ++++++ lib/dynamic_debug.c | 1 - net/sunrpc/cache.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/include/linux/ctype.h b/include/linux/ctype.h index 8acfe312f94..653589e3e30 100644 --- a/include/linux/ctype.h +++ b/include/linux/ctype.h @@ -61,4 +61,10 @@ static inline char _tolower(const char c) return c | 0x20; } +/* Fast check for octal digit */ +static inline int isodigit(const char c) +{ + return c >= '0' && c <= '7'; +} + #endif diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5276b99ca65..46032453abd 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -281,7 +281,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) * allow the user to express a query which matches a format * containing embedded spaces. */ -#define isodigit(c) ((c) >= '0' && (c) <= '7') static char *unescape(char *str) { char *in = str; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 25d58e76601..ce2d180d05a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1208,7 +1208,6 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); * key and content are both parsed by cache */ -#define isodigit(c) (isdigit(c) && c <= '7') int qword_get(char **bpp, char *dest, int bufsize) { /* return bytes copied, or -1 on error */ -- cgit v1.2.3-70-g09d2 From 3e6628c4b347a558965041290c5a92791dd4c741 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 29 Apr 2013 16:21:16 -0700 Subject: idr: introduce idr_alloc_cyclic() As Tejun points out, there are several users of the IDR facility that attempt to use it in a cyclic fashion. These users are likely to see -ENOSPC errors after the counter wraps one or more times however. This patchset adds a new idr_alloc_cyclic routine and converts several of these users to it. Many of these users are in obscure parts of the kernel, and I don't have a good way to test some of them. The change is pretty straightforward though, so hopefully it won't be an issue. 
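To make the pattern concrete (editor's sketch; the surrounding structure and lock-free context are hypothetical, only idr_alloc()/idr_alloc_cyclic() are the real interfaces):

    /* wrap-prone pattern this series replaces: caller tracks the cursor */
    id = idr_alloc(&st->idr, ptr, st->next_id, 0, GFP_NOWAIT);
    if (id >= 0)
        st->next_id = id + 1;   /* hits -ENOSPC once next_id passes the max */

    /* replacement: the idr tracks the cursor and wraps back to 'start' */
    id = idr_alloc_cyclic(&st->idr, ptr, 1, 0, GFP_NOWAIT);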
There is one other cyclic user of idr_alloc that I didn't touch in ipc/util.c. That one is doing some strange stuff that I didn't quite understand, but it looks like it should probably be converted later somehow. This patch: Thus spake Tejun Heo: Ooh, BTW, the cyclic allocation is broken. It's prone to -ENOSPC after the first wraparound. There are several cyclic users in the kernel and I think it probably would be best to implement cyclic support in idr. This patch does that by adding a new idr_alloc_cyclic() function that such users in the kernel can use. With this, there's no need for a caller to keep track of the last value used, as that's now tracked internally. This should prevent the -ENOSPC problems that can hit when the "last allocated" counter exceeds INT_MAX. Later patches will convert existing cyclic users to the new interface. Signed-off-by: Jeff Layton Reviewed-by: Tejun Heo Cc: "David S. Miller" Cc: "J. Bruce Fields" Cc: Eric Paris Cc: Jack Morgenstein Cc: John McCutchan Cc: Neil Horman Cc: Or Gerlitz Cc: Robert Love Cc: Roland Dreier Cc: Sridhar Samudrala Cc: Steve Wise Cc: Tom Tucker Cc: Vlad Yasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 2 ++ lib/idr.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'lib') diff --git a/include/linux/idr.h b/include/linux/idr.h index 2640c7e99e5..a470ac3ef49 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -42,6 +42,7 @@ struct idr { struct idr_layer *id_free; int layers; /* only valid w/o concurrent changes */ int id_free_cnt; + int cur; /* current pos for cyclic allocation */ spinlock_t lock; }; @@ -75,6 +76,7 @@ struct idr { void *idr_find_slowpath(struct idr *idp, int id); void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask); +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *idp, int *nextid); diff --git a/lib/idr.c b/lib/idr.c index 322e2816f2f..cca4b9302a7 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -495,6 +495,33 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(idr_alloc); +/** + * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion + * @idr: the (initialized) idr + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive, <= 0 for max) + * @gfp_mask: memory allocation flags + * + * Essentially the same as idr_alloc, but prefers to allocate progressively + * higher ids if it can. If the "cur" counter wraps, then it will start again + * at the "start" end of the range and allocate one that has already been used. + */ +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int id; + + id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask); + if (id == -ENOSPC) + id = idr_alloc(idr, ptr, start, end, gfp_mask); + + if (likely(id >= 0)) + idr->cur = id + 1; + return id; +} +EXPORT_SYMBOL(idr_alloc_cyclic); + static void idr_remove_warning(int id) { printk(KERN_WARNING -- cgit v1.2.3-70-g09d2 From cedddb00023b1321f2bb2739739aa8c9927fc063 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:25 -0700 Subject: uuid: use prandom_bytes() Use prandom_bytes() to generate the 16 bytes of pseudo-random data.
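The resulting helper is a one-liner plus the variant fixup (this mirrors the hunk below):

	static void __uuid_gen_common(__u8 b[16])
	{
		prandom_bytes(b, 16);
		/* set the UUID variant bits to binary 10 */
		b[8] = (b[8] & 0x3F) | 0x80;
	}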
Signed-off-by: Akinobu Mita Cc: "Theodore Ts'o" Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/uuid.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/uuid.c b/lib/uuid.c index 52a6fe6387d..398821e4dce 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -25,13 +25,7 @@ static void __uuid_gen_common(__u8 b[16]) { - int i; - u32 r; - - for (i = 0; i < 4; i++) { - r = random32(); - memcpy(b + i * 4, &r, 4); - } + prandom_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } -- cgit v1.2.3-70-g09d2 From f39fee5f11f7adfcf5c9643f87718b80450be18a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:28 -0700 Subject: lib/: rename random32() to prandom_u32() Use the preferable function name, which implies the use of a pseudo-random number generator. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/fault-inject.c | 2 +- lib/list_sort.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f7210ad6cff..c5c7a762b85 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -122,7 +122,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) return false; } - if (attr->probability <= random32() % 100) + if (attr->probability <= prandom_u32() % 100) return false; if (!fail_stacktrace(attr)) diff --git a/lib/list_sort.c b/lib/list_sort.c index d7325c6b103..1183fa70a44 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -229,7 +229,7 @@ static int __init list_sort_test(void) goto exit; } /* force some equivalencies */ - el->value = random32() % (TEST_LIST_LEN/3); + el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; el->poison1 = TEST_POISON1; el->poison2 = TEST_POISON2; -- cgit v1.2.3-70-g09d2 From f3002134158092178be81339ec5a22ff80e6c308 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 30 Apr 2013 11:35:07 +0200 Subject: Revert "math64: New div64_u64_rem helper" This reverts commit f792685006274a850e6cc0ea9ade275ccdfc90bc. The cputime scaling code was changed/fixed and does not need the div64_u64_rem() primitive anymore. It has no other users, so let's remove it.
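Out-of-tree or future callers that still need both values can derive the remainder from the quotient in two lines (a sketch, not code from this patch):

	u64 quot = div64_u64(dividend, divisor);
	u64 rem = dividend - quot * divisor;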
Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Cc: rostedt@goodmis.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1367314507-9728-4-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- include/linux/math64.h | 19 +------------------ lib/div64.c | 19 ++++++------------- 2 files changed, 7 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/include/linux/math64.h b/include/linux/math64.h index 931a619407b..b8ba8554472 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -29,15 +29,6 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) return dividend / divisor; } -/** - * div64_u64_rem - unsigned 64bit divide with 64bit divisor - */ -static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) -{ - *remainder = dividend % divisor; - return dividend / divisor; -} - /** * div64_u64 - unsigned 64bit divide with 64bit divisor */ @@ -70,16 +61,8 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif -#ifndef div64_u64_rem -extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); -#endif - #ifndef div64_u64 -static inline u64 div64_u64(u64 dividend, u64 divisor) -{ - u64 remainder; - return div64_u64_rem(dividend, divisor, &remainder); -} +extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 diff --git a/lib/div64.c b/lib/div64.c index 3af5728d95f..a163b6caef7 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,10 +79,9 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder + * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: 64bit dividend * @divisor: 64bit divisor - * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof @@ -90,33 +89,27 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64_rem -u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) +#ifndef div64_u64 +u64 div64_u64(u64 dividend, u64 divisor) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - u32 rem32; - quot = div_u64_rem(dividend, divisor, &rem32); - *remainder = rem32; + quot = div_u64(dividend, divisor); } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - - *remainder = dividend - quot * divisor; - if (*remainder >= divisor) { + if ((dividend - quot * divisor) >= divisor) quot++; - *remainder -= divisor; - } } return quot; } -EXPORT_SYMBOL(div64_u64_rem); +EXPORT_SYMBOL(div64_u64); #endif /** -- cgit v1.2.3-70-g09d2 From b0d33c2bd77bcf2d7c9427d2361ac57fe5b33aa1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Dec 2012 10:18:50 -0800 Subject: vsprintf: Add extension %pSR - print_symbol replacement print_symbol takes a long and converts it to a function name and offset. %pS does something similar, but doesn't translate the address via __builtin_extract_return_addr. %pSR does the translation. This will enable replacing multiple calls like printk(...); printk_symbol(addr); printk("\n"); with a single non-interleavable in dmesg printk("... %pSR\n", (void *)addr); Update documentation too. 
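A before/after sketch of one such call site (illustrative; `addr' is assumed to hold a saved return address):

	/* before: three calls, interleavable in dmesg */
	printk(KERN_ERR "called from ");
	print_symbol("%s", addr);
	printk("\n");

	/* after: a single non-interleavable line */
	printk(KERN_ERR "called from %pSR\n", (void *)addr);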
Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- Documentation/printk-formats.txt | 2 ++ lib/vsprintf.c | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index e8a6aa473ba..e3d0215013d 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -17,6 +17,8 @@ Symbols/Function Pointers: %pF versatile_init+0x0/0x110 %pf versatile_init %pS versatile_init+0x0/0x110 + %pSR versatile_init+0x9/0x110 + (with __builtin_extract_return_addr() translation) %ps versatile_init %pB prev_fn_of_versatile_init+0x88/0x88 diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0d62fd700f6..e149c641638 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -534,14 +534,21 @@ char *string(char *buf, char *end, const char *s, struct printf_spec spec) static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec, char ext) + struct printf_spec spec, const char *fmt) { - unsigned long value = (unsigned long) ptr; + unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - if (ext == 'B') +#endif + + if (fmt[1] == 'R') + ptr = __builtin_extract_return_addr(ptr); + value = (unsigned long)ptr; + +#ifdef CONFIG_KALLSYMS + if (*fmt == 'B') sprint_backtrace(sym, value); - else if (ext != 'f' && ext != 's') + else if (*fmt != 'f' && *fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); @@ -987,6 +994,7 @@ int kptr_restrict __read_mostly; * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers with offset * - 's' For symbolic direct pointers without offset + * - '[FfSs]R' as above with __builtin_extract_return_addr() translation * - 'B' For backtraced symbolic direct pointers with offset * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] @@ -1060,7 +1068,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, case 'S': case 's': case 'B': - return symbol_string(buf, end, ptr, spec, *fmt); + return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_string(buf, end, ptr, spec, fmt); -- cgit v1.2.3-70-g09d2 From 196779b9b4ce1922afabdc20d0270720603bd46c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 30 Apr 2013 15:27:12 -0700 Subject: dump_stack: consolidate dump_stack() implementations and unify their behaviors Both dump_stack() and show_stack() are currently implemented by each architecture. show_stack(NULL, NULL) dumps the backtrace for the current task as does dump_stack(). On some archs, dump_stack() prints extra information - pid, utsname and so on - in addition to the backtrace while the two are identical on other archs. The usages of the two functions in arch-independent code indicate that show_stack(NULL, NULL) should print out the bare backtrace while dump_stack() is used for debugging purposes when something went wrong, so it does make sense to print additional information about the task which triggered dump_stack(). There's no reason to require archs to implement two separate but mostly identical functions. It leads to unnecessary subtle inconsistencies. This patch expands the dummy fallback dump_stack() implementation in lib/dump_stack.c such that it prints out debug information (taken from x86) and invokes show_stack(NULL, NULL) and drops arch-specific dump_stack() implementations in all archs except blackfin.
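The resulting generic fallback is tiny (see the lib/dump_stack.c hunk at the end of this patch):

	void dump_stack(void)
	{
		dump_stack_print_info(KERN_DEFAULT);
		show_stack(NULL, NULL);
	}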
Blackfin's dump_stack() does something wonky that I don't understand. Debug information can be printed separately by calling dump_stack_print_info() so that an arch-specific dump_stack() implementation can still emit the same debug information. This is used in blackfin. This patch brings the following behavior changes. * On some archs, an extra level in backtrace for show_stack() could be printed. This is because the top frame was determined in dump_stack() on those archs while the generic dump_stack() can't do that reliably. It can be compensated for by inlining dump_stack(), but I'm not sure whether that'd be necessary. * Most archs didn't use to print debug info on dump_stack(). They do now. An example WARN dump follows. WARNING: at kernel/workqueue.c:4841 init_workqueues+0x35/0x505() Hardware name: empty Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0-rc1-work+ #9 0000000000000009 ffff88007c861e08 ffffffff81c614dc ffff88007c861e48 ffffffff8108f50f ffffffff82228240 0000000000000040 ffffffff8234a03c 0000000000000000 0000000000000000 0000000000000000 ffff88007c861e58 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] init_workqueues+0x35/0x505 ... v2: CPU number added to the generic debug info as requested by s390 folks and dropped the s390-specific dump_stack(). This loses %ksp from the debug message, which the maintainers think isn't important enough to keep the s390-specific dump_stack() implementation. dump_stack_print_info() is moved to kernel/printk.c from lib/dump_stack.c. Because linkage is per object file, dump_stack_print_info() living in the same lib file as the generic dump_stack() means that archs which implement a custom dump_stack() - at this point, only blackfin - can't use dump_stack_print_info(), as that would bring in the generic version of dump_stack() too. v1: The v1 patch broke the build on blackfin due to this issue. The build breakage was reported by Fengguang Wu. Signed-off-by: Tejun Heo Acked-by: David S.
Miller Acked-by: Vineet Gupta Acked-by: Jesper Nilsson Acked-by: Vineet Gupta Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Mike Frysinger Cc: Fengguang Wu Cc: Bjorn Helgaas Cc: Sam Ravnborg Acked-by: Richard Kuo [hexagon bits] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/traps.c | 7 ------- arch/arc/kernel/stacktrace.c | 7 ------- arch/arm/kernel/traps.c | 7 ------- arch/arm64/kernel/traps.c | 7 ------- arch/avr32/kernel/process.c | 8 -------- arch/blackfin/kernel/dumpstack.c | 1 + arch/c6x/kernel/traps.c | 9 --------- arch/cris/kernel/traps.c | 7 ------- arch/frv/kernel/traps.c | 11 ----------- arch/h8300/kernel/traps.c | 7 ------- arch/hexagon/kernel/traps.c | 8 -------- arch/ia64/kernel/process.c | 8 -------- arch/m32r/kernel/traps.c | 9 --------- arch/m68k/kernel/traps.c | 12 ------------ arch/metag/kernel/traps.c | 6 ------ arch/microblaze/kernel/traps.c | 6 ------ arch/mips/kernel/traps.c | 13 ------------- arch/mn10300/kernel/traps.c | 11 ----------- arch/openrisc/kernel/traps.c | 11 ----------- arch/parisc/kernel/traps.c | 8 -------- arch/powerpc/kernel/process.c | 6 ------ arch/s390/kernel/dumpstack.c | 17 ----------------- arch/score/kernel/traps.c | 10 ---------- arch/sh/kernel/dumpstack.c | 6 ------ arch/sparc/kernel/process_32.c | 7 ------- arch/sparc/kernel/traps_64.c | 7 ------- arch/um/kernel/sysrq.c | 12 ------------ arch/unicore32/kernel/traps.c | 6 ------ arch/x86/kernel/dumpstack.c | 18 ------------------ arch/xtensa/kernel/traps.c | 8 -------- include/linux/printk.h | 5 +++++ kernel/printk.c | 18 ++++++++++++++++++ lib/dump_stack.c | 11 ++++++++--- 33 files changed, 32 insertions(+), 262 deletions(-) (limited to 'lib') diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 4037461a649..affccb959a9 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -169,13 +169,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) dik_show_trace(sp); } -void dump_stack(void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) { diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index a63ff842564..ca0207b9d5b 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -220,13 +220,6 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) show_stacktrace(tsk, NULL); } -/* Expected by Rest of kernel code */ -void dump_stack(void) -{ - show_stacktrace(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - /* Another API expected by schedular, shows up in "ps" as Wait Channel * Ofcourse just returning schedule( ) would be pointless so unwind until * the function is not in schedular code diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1c089119b2d..18b32e8e449 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -204,13 +204,6 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) } #endif -void dump_stack(void) -{ - dump_backtrace(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *tsk, unsigned long *sp) { dump_backtrace(NULL, tsk); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index b3c5f628bdb..61d7dd29f75 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -167,13 +167,6 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) } } -void dump_stack(void) -{ - dump_backtrace(NULL, NULL); -} - 
-EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *tsk, unsigned long *sp) { dump_backtrace(NULL, tsk); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 073c3c2fa52..89a8017e8ca 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -204,14 +204,6 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) show_stack_log_lvl(tsk, (unsigned long)stack, NULL, ""); } -void dump_stack(void) -{ - unsigned long stack; - - show_trace_log_lvl(current, &stack, NULL, ""); -} -EXPORT_SYMBOL(dump_stack); - static const char *cpu_modes[] = { "Application", "Supervisor", "Interrupt level 0", "Interrupt level 1", "Interrupt level 2", "Interrupt level 3", "Exception", "NMI" diff --git a/arch/blackfin/kernel/dumpstack.c b/arch/blackfin/kernel/dumpstack.c index 5cfbaa29821..95ba6d9e9a3 100644 --- a/arch/blackfin/kernel/dumpstack.c +++ b/arch/blackfin/kernel/dumpstack.c @@ -168,6 +168,7 @@ void dump_stack(void) #endif trace_buffer_save(tflags); dump_bfin_trace_buffer(); + dump_stack_print_info(KERN_DEFAULT); show_stack(current, &stack); trace_buffer_restore(tflags); } diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index 1be74e5b478..d0b96ef25c1 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -67,15 +67,6 @@ void show_regs(struct pt_regs *regs) pr_err("A31: %08lx B31: %08lx\n", regs->a31, regs->b31); } -void dump_stack(void) -{ - unsigned long stack; - - show_stack(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - - void die(char *str, struct pt_regs *fp, int nr) { console_verbose(); diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index a11ad3229f8..0ffda73734f 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -146,13 +146,6 @@ show_stack(void) } #endif -void -dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - void set_nmi_handler(void (*handler)(struct pt_regs *)) { diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index 5cfd1420b09..cfcd802d6f9 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -466,17 +466,6 @@ asmlinkage void compound_exception(unsigned long esfr1, BUG(); } /* end compound_exception() */ -/*****************************************************************************/ -/* - * The architecture-independent backtrace generator - */ -void dump_stack(void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *task, unsigned long *sp) { } diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 7833aa3e7c7..cfe494dbe3d 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -164,10 +164,3 @@ void show_trace_task(struct task_struct *tsk) { show_stack(tsk,(unsigned long *)tsk->thread.esp0); } - -void dump_stack(void) -{ - show_stack(NULL,NULL); -} - -EXPORT_SYMBOL(dump_stack); diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index be5e2dd9c9d..cc2171b2aa0 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -191,14 +191,6 @@ void show_stack(struct task_struct *task, unsigned long *fp) do_show_stack(task, fp, 0); } -void dump_stack(void) -{ - unsigned long *fp; - asm("%0 = r30" : "=r" (fp)); - show_stack(current, fp); -} -EXPORT_SYMBOL(dump_stack); - int die(const char *str, struct pt_regs *regs, long err) { static struct { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a26fc640e4c..182bd64cc72 100644 --- a/arch/ia64/kernel/process.c +++ 
b/arch/ia64/kernel/process.c @@ -95,14 +95,6 @@ show_stack (struct task_struct *task, unsigned long *sp) } } -void -dump_stack (void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - void show_regs (struct pt_regs *regs) { diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index 9fe3467a513..a7a424f852e 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -167,15 +167,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_trace(task, sp); } -void dump_stack(void) -{ - unsigned long stack; - - show_trace(current, &stack); -} - -EXPORT_SYMBOL(dump_stack); - static void show_registers(struct pt_regs *regs) { int i = 0; diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index f32ab22e7ed..88fcd8c70e7 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -991,18 +991,6 @@ void show_stack(struct task_struct *task, unsigned long *stack) show_trace(stack); } -/* - * The architecture-independent backtrace generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_trace(&stack); -} - -EXPORT_SYMBOL(dump_stack); - /* * The vector number returned in the frame pointer may also contain * the "fs" (Fault Status) bits on ColdFire. These are in the bottom diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c index 8961f247b50..2ceeaae5b19 100644 --- a/arch/metag/kernel/traps.c +++ b/arch/metag/kernel/traps.c @@ -987,9 +987,3 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) show_trace(tsk, sp, NULL); } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 30e6b5004a6..cb619533a19 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -75,9 +75,3 @@ void show_stack(struct task_struct *task, unsigned long *sp) debug_show_held_locks(task); } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index c3abb88170f..b512b28cf78 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -206,19 +206,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stacktrace(task, ®s); } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - struct pt_regs regs; - - prepare_frametrace(®s); - show_backtrace(current, ®s); -} - -EXPORT_SYMBOL(dump_stack); - static void show_code(unsigned int __user *pc) { long i; diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index b900e5afa0a..a7a987c7954 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -293,17 +293,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_trace(sp); } -/* - * the architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_stack(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - /* * dump the register file in the specified exception frame */ diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 5cce396016d..3d3f6062f49 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -105,17 +105,6 @@ void show_trace_task(struct task_struct *tsk) */ } -/* - * The architecture-independent backtrace generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_stack(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; 
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index aeb8f8f2c07..e64cf5f09b6 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -158,14 +158,6 @@ void show_regs(struct pt_regs *regs) } } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - static void do_show_stack(struct unwind_frame_info *info) { int i = 1; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 16e77a81ab4..624d44bb44d 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1362,12 +1362,6 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) } while (count++ < kstack_depth_to_print); } -void dump_stack(void) -{ - show_stack(current, NULL); -} -EXPORT_SYMBOL(dump_stack); - #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void __ppc64_runlatch_on(void) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 03dce39d01e..2f1f639d1a3 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -129,23 +129,6 @@ static void show_last_breaking_event(struct pt_regs *regs) #endif } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - printk("CPU: %d %s %s %.*s\n", - task_thread_info(current)->cpu, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk("Process %s (pid: %d, task: %p, ksp: %p)\n", - current->comm, current->pid, current, - (void *) current->thread.ksp); - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - static inline int mask_bits(struct pt_regs *regs, unsigned long bits) { return (regs->psw.mask & bits) / ((~bits + 1) & bits); diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 0e46fb19a84..a38f435fba7 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -149,16 +149,6 @@ static void show_registers(struct pt_regs *regs) printk(KERN_NOTICE "\n"); } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - show_stack(current_thread_info()->task, - (long *) get_irq_regs()->regs[0]); -} -EXPORT_SYMBOL(dump_stack); - void __die(const char *str, struct pt_regs *regs, const char *file, const char *func, unsigned long line) { diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 7617dc4129a..b959f559260 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -158,9 +158,3 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL); } - -void dump_stack(void) -{ - show_stack(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 2be4214b390..dccf5f58d70 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -164,13 +164,6 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) printk("\n"); } -void dump_stack(void) -{ - show_stack(current, NULL); -} - -EXPORT_SYMBOL(dump_stack); - /* * Note: sparc64 has a pretty intricated thread_saved_pc, check it out. 
*/ diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 8d38ca97aa2..b3f833ab90e 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2350,13 +2350,6 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) } while (++count < 16); } -void dump_stack(void) -{ - show_stack(current, NULL); -} - -EXPORT_SYMBOL(dump_stack); - static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index e562ff80409..7d101a2a154 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -35,18 +35,6 @@ void show_trace(struct task_struct *task, unsigned long * stack) } #endif -/* - * stack dumps generator - this is used by arch-independent code. - * And this is identical to i386 currently. - */ -void dump_stack(void) -{ - unsigned long stack; - - show_trace(current, &stack); -} -EXPORT_SYMBOL(dump_stack); - /*Stolen from arch/i386/kernel/traps.c */ static const int kstack_depth_to_print = 24; diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 0870b68d2ad..c54e32410ea 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -170,12 +170,6 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) c_backtrace(fp, mode); } -void dump_stack(void) -{ - dump_backtrace(NULL, NULL); -} -EXPORT_SYMBOL(dump_stack); - void show_stack(struct task_struct *tsk, unsigned long *sp) { dump_backtrace(NULL, tsk); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dd1a7c391c9..deb6421c9e6 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -191,24 +191,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stack_log_lvl(task, NULL, sp, bp, ""); } -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp; - unsigned long stack; - - bp = stack_frame(current, NULL); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 923db5c1527..384b7c7c2f6 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -481,14 +481,6 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_trace(task, stack); } -void dump_stack(void) -{ - show_stack(current, NULL); -} - -EXPORT_SYMBOL(dump_stack); - - void show_code(unsigned int *pc) { long i; diff --git a/include/linux/printk.h b/include/linux/printk.h index 4890fe62c01..7ce1f878cf6 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -145,6 +145,7 @@ extern void wake_up_klogd(void); void log_buf_kexec_setup(void); void __init setup_log_buf(int early); +void dump_stack_print_info(const char *log_lvl); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -182,6 +183,10 @@ static inline void log_buf_kexec_setup(void) static inline void setup_log_buf(int early) { } + +static inline void dump_stack_print_info(const char *log_lvl) +{ +} #endif extern void dump_stack(void) __cold; diff --git a/kernel/printk.c b/kernel/printk.c index 376914e2869..70b4b94a0ec 100644 --- 
a/kernel/printk.c +++ b/kernel/printk.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -2849,4 +2850,21 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack(). + */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + #endif diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 42f4f55c945..53bad099ebd 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -5,11 +5,16 @@ #include #include +#include +/** + * dump_stack - dump the current task information and its stack trace + * + * Architectures can override this implementation by implementing its own. + */ void dump_stack(void) { - printk(KERN_NOTICE - "This architecture does not implement dump_stack()\n"); + dump_stack_print_info(KERN_DEFAULT); + show_stack(NULL, NULL); } - EXPORT_SYMBOL(dump_stack); -- cgit v1.2.3-70-g09d2 From 16c7fa05829e8b91db48e3539c5d6ff3c2b18a23 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:30 -0700 Subject: lib/string_helpers: introduce generic string_unescape There are several places in the kernel where modules unescape input to convert C-style escape sequences into byte codes. The patch provides a generic implementation of this approach. Test cases are also included in the patch.
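A quick illustration of the new helper (a sketch; buffer contents and flags are arbitrary):

	char buf[] = "a\\tb\\n";	/* holds literal backslash-t, backslash-n */

	string_unescape_inplace(buf, UNESCAPE_SPACE);
	/* buf now holds "a<TAB>b<LF>" and the call returns 4 */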
[akpm@linux-foundation.org: clarify comment] [akpm@linux-foundation.org: export get_random_int() to modules] Signed-off-by: Andy Shevchenko Cc: Samuel Thibault Cc: Greg Kroah-Hartman Cc: Jason Baron Cc: Alexander Viro Cc: William Hubbs Cc: Chris Brannon Cc: Kirk Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 1 + include/linux/string_helpers.h | 58 ++++++++++++++++++ lib/Kconfig.debug | 3 + lib/Makefile | 4 +- lib/string_helpers.c | 133 +++++++++++++++++++++++++++++++++++++++++ lib/test-string_helpers.c | 103 +++++++++++++++++++++++++++++++ 6 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 lib/test-string_helpers.c (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index 32a6c576495..cd9a6211dca 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1485,6 +1485,7 @@ unsigned int get_random_int(void) return ret; } +EXPORT_SYMBOL(get_random_int); /* * randomize_range() returns a start address such that diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index a3eb2f65b65..3eeee9672a4 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -13,4 +13,62 @@ enum string_size_units { int string_get_size(u64 size, enum string_size_units units, char *buf, int len); +#define UNESCAPE_SPACE 0x01 +#define UNESCAPE_OCTAL 0x02 +#define UNESCAPE_HEX 0x04 +#define UNESCAPE_SPECIAL 0x08 +#define UNESCAPE_ANY \ + (UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL) + +/** + * string_unescape - unquote characters in the given string + * @src: source buffer (escaped) + * @dst: destination buffer (unescaped) + * @size: size of the destination buffer (0 to unlimit) + * @flags: combination of the flags (bitwise OR): + * %UNESCAPE_SPACE: + * '\f' - form feed + * '\n' - new line + * '\r' - carriage return + * '\t' - horizontal tab + * '\v' - vertical tab + * %UNESCAPE_OCTAL: + * '\NNN' - byte with octal value NNN (1 to 3 digits) + * %UNESCAPE_HEX: + * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) + * %UNESCAPE_SPECIAL: + * '\"' - double quote + * '\\' - backslash + * '\a' - alert (BEL) + * '\e' - escape + * %UNESCAPE_ANY: + * all previous together + * + * Returns amount of characters processed to the destination buffer excluding + * trailing '\0'. + * + * Because the size of the output will be the same as or less than the size of + * the input, the transformation may be performed in place. + * + * Caller must provide valid source and destination pointers. Be aware that + * destination buffer will always be NULL-terminated. Source string must be + * NULL-terminated as well. 
+ */ +int string_unescape(char *src, char *dst, size_t size, unsigned int flags); + +static inline int string_unescape_inplace(char *buf, unsigned int flags) +{ + return string_unescape(buf, buf, 0, flags); +} + +static inline int string_unescape_any(char *src, char *dst, size_t size) +{ + return string_unescape(src, dst, size, UNESCAPE_ANY); +} + +static inline int string_unescape_any_inplace(char *buf) +{ + return string_unescape_any(buf, buf, 0); +} + #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28be08c09ba..77ebaa3dfa1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1463,5 +1463,8 @@ source "lib/Kconfig.kgdb" source "lib/Kconfig.kmemcheck" +config TEST_STRING_HELPERS + tristate "Test functions located in the string_helpers module at runtime" + config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" diff --git a/lib/Makefile b/lib/Makefile index 6e2cc561f76..23c9a0fe74f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,8 +22,10 @@ lib-y += kobject.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o +obj-y += string_helpers.o +obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 1cffc223bff..ed5c1454dd6 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -2,10 +2,12 @@ * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley + * Copyright (C) 2013, Intel Corporation */ #include #include #include +#include #include /** @@ -66,3 +68,134 @@ int string_get_size(u64 size, const enum string_size_units units, return 0; } EXPORT_SYMBOL(string_get_size); + +static bool unescape_space(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case 'n': + *p = '\n'; + break; + case 'r': + *p = '\r'; + break; + case 't': + *p = '\t'; + break; + case 'v': + *p = '\v'; + break; + case 'f': + *p = '\f'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +static bool unescape_octal(char **src, char **dst) +{ + char *p = *dst, *q = *src; + u8 num; + + if (isodigit(*q) == 0) + return false; + + num = (*q++) & 7; + while (num < 32 && isodigit(*q) && (q - *src < 3)) { + num <<= 3; + num += (*q++) & 7; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_hex(char **src, char **dst) +{ + char *p = *dst, *q = *src; + int digit; + u8 num; + + if (*q++ != 'x') + return false; + + num = digit = hex_to_bin(*q++); + if (digit < 0) + return false; + + digit = hex_to_bin(*q); + if (digit >= 0) { + q++; + num = (num << 4) | digit; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_special(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case '\"': + *p = '\"'; + break; + case '\\': + *p = '\\'; + break; + case 'a': + *p = '\a'; + break; + case 'e': + *p = '\e'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +int string_unescape(char *src, char *dst, size_t size, unsigned int flags) +{ + char *out = dst; + + while (*src && --size) { + if (src[0] == '\\' && src[1] != '\0' && size > 1) { + src++; + size--; + + if (flags & UNESCAPE_SPACE && + 
unescape_space(&src, &out)) + continue; + + if (flags & UNESCAPE_OCTAL && + unescape_octal(&src, &out)) + continue; + + if (flags & UNESCAPE_HEX && + unescape_hex(&src, &out)) + continue; + + if (flags & UNESCAPE_SPECIAL && + unescape_special(&src, &out)) + continue; + + *out++ = '\\'; + } + *out++ = *src++; + } + *out = '\0'; + + return out - dst; +} +EXPORT_SYMBOL(string_unescape); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c new file mode 100644 index 00000000000..6ac48de04c0 --- /dev/null +++ b/lib/test-string_helpers.c @@ -0,0 +1,103 @@ +/* + * Test cases for lib/string_helpers.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +struct test_string { + const char *in; + const char *out; + unsigned int flags; +}; + +static const struct test_string strings[] __initconst = { + { + .in = "\\f\\ \\n\\r\\t\\v", + .out = "\f\\ \n\r\t\v", + .flags = UNESCAPE_SPACE, + }, + { + .in = "\\40\\1\\387\\0064\\05\\040\\8a\\110\\777", + .out = " \001\00387\0064\005 \\8aH?7", + .flags = UNESCAPE_OCTAL, + }, + { + .in = "\\xv\\xa\\x2c\\xD\\x6f2", + .out = "\\xv\n,\ro2", + .flags = UNESCAPE_HEX, + }, + { + .in = "\\h\\\\\\\"\\a\\e\\", + .out = "\\h\\\"\a\e\\", + .flags = UNESCAPE_SPECIAL, + }, +}; + +static void __init test_string_unescape(unsigned int flags, bool inplace) +{ + char in[256]; + char out_test[256]; + char out_real[256]; + int i, p = 0, q_test = 0, q_real = sizeof(out_real); + + for (i = 0; i < ARRAY_SIZE(strings); i++) { + const char *s = strings[i].in; + int len = strlen(strings[i].in); + + /* Copy string to in buffer */ + memcpy(&in[p], s, len); + p += len; + + /* Copy expected result for given flags */ + if (flags & strings[i].flags) { + s = strings[i].out; + len = strlen(strings[i].out); + } + memcpy(&out_test[q_test], s, len); + q_test += len; + } + in[p++] = '\0'; + + /* Call string_unescape and compare result */ + if (inplace) { + memcpy(out_real, in, p); + if (flags == UNESCAPE_ANY) + q_real = string_unescape_any_inplace(out_real); + else + q_real = string_unescape_inplace(out_real, flags); + } else if (flags == UNESCAPE_ANY) { + q_real = string_unescape_any(in, out_real, q_real); + } else { + q_real = string_unescape(in, out_real, q_real, flags); + } + + if (q_real != q_test || memcmp(out_test, out_real, q_test)) { + pr_warn("Test failed: flags = %u\n", flags); + print_hex_dump(KERN_WARNING, "Input: ", + DUMP_PREFIX_NONE, 16, 1, in, p - 1, true); + print_hex_dump(KERN_WARNING, "Expected: ", + DUMP_PREFIX_NONE, 16, 1, out_test, q_test, true); + print_hex_dump(KERN_WARNING, "Got: ", + DUMP_PREFIX_NONE, 16, 1, out_real, q_real, true); + } +} + +static int __init test_string_helpers_init(void) +{ + unsigned int i; + + pr_info("Running tests...\n"); + for (i = 0; i < UNESCAPE_ANY + 1; i++) + test_string_unescape(i, false); + test_string_unescape(get_random_int() % (UNESCAPE_ANY + 1), true); + + return -EINVAL; +} +module_init(test_string_helpers_init); +MODULE_LICENSE("Dual BSD/GPL"); -- cgit v1.2.3-70-g09d2 From d338b1379f96b20e63aa65082e19e9a18a93b608 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:32 -0700 Subject: dynamic_debug: reuse generic string_unescape function There is kernel function to do the job in generic way. Let's use it. 
Signed-off-by: Andy Shevchenko Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 48 +++++------------------------------------------- 1 file changed, 5 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 46032453abd..99fec3ae405 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -276,47 +277,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) return 0; } -/* - * Undo octal escaping in a string, inplace. This is useful to - * allow the user to express a query which matches a format - * containing embedded spaces. - */ -static char *unescape(char *str) -{ - char *in = str; - char *out = str; - - while (*in) { - if (*in == '\\') { - if (in[1] == '\\') { - *out++ = '\\'; - in += 2; - continue; - } else if (in[1] == 't') { - *out++ = '\t'; - in += 2; - continue; - } else if (in[1] == 'n') { - *out++ = '\n'; - in += 2; - continue; - } else if (isodigit(in[1]) && - isodigit(in[2]) && - isodigit(in[3])) { - *out++ = (((in[1] - '0') << 6) | - ((in[2] - '0') << 3) | - (in[3] - '0')); - in += 4; - continue; - } - } - *out++ = *in++; - } - *out = '\0'; - - return str; -} - static int check_set(const char **dest, char *src, char *name) { int rc = 0; @@ -370,8 +330,10 @@ static int ddebug_parse_query(char *words[], int nwords, } else if (!strcmp(words[i], "module")) { rc = check_set(&query->module, words[i+1], "module"); } else if (!strcmp(words[i], "format")) { - rc = check_set(&query->format, unescape(words[i+1]), - "format"); + string_unescape_inplace(words[i+1], UNESCAPE_SPACE | + UNESCAPE_OCTAL | + UNESCAPE_SPECIAL); + rc = check_set(&query->format, words[i+1], "format"); } else if (!strcmp(words[i], "line")) { char *first = words[i+1]; char *last = strchr(first, '-'); -- cgit v1.2.3-70-g09d2 From 4130f0efbfe5adb360328b777afa8e45f7e467f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:24 -0700 Subject: rbtree_test: add extra rbtree integrity check Account for the rbtree having 2**bh(v)-1 internal nodes. While this can be seen as a consequence of other checks, Michel states that it nicely sums up what the other properties are for. 
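For reference, the bound being checked: every root-to-leaf path in a red-black tree passes through bh black nodes, so the tree contains a complete black subtree of height bh and therefore at least 2^bh - 1 internal nodes (e.g. bh = 3 implies at least 7 nodes). The added WARN_ON_ONCE() below encodes exactly this, with bh taken from black_path_count(rb_last(&root)).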
Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index af38aedbd87..99515038ff6 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -117,8 +117,7 @@ static int black_path_count(struct rb_node *rb) static void check(int nr_nodes) { struct rb_node *rb; - int count = 0; - int blacks = 0; + int count = 0, blacks = 0; u32 prev_key = 0; for (rb = rb_first(&root); rb; rb = rb_next(rb)) { @@ -134,7 +133,9 @@ static void check(int nr_nodes) prev_key = node->key; count++; } + WARN_ON_ONCE(count != nr_nodes); + WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); } static void check_augmented(int nr_nodes) -- cgit v1.2.3-70-g09d2 From c75aaa8ed03eb1312ed2990f1a716b2b9cc0df42 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:25 -0700 Subject: rbtree_test: add __init/__exit annotations Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 99515038ff6..122f02f9941 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -149,7 +149,7 @@ static void check_augmented(int nr_nodes) } } -static int rbtree_test_init(void) +static int __init rbtree_test_init(void) { int i, j; cycles_t time1, time2, time; @@ -222,7 +222,7 @@ static int rbtree_test_init(void) return -EAGAIN; /* Fail will directly unload the module */ } -static void rbtree_test_exit(void) +static void __exit rbtree_test_exit(void) { printk(KERN_ALERT "test exit\n"); } -- cgit v1.2.3-70-g09d2 From 446f24d1199e8a546ba7c97da3fbb9a505a94795 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Apr 2013 15:28:42 -0700 Subject: Kconfig: consolidate CONFIG_DEBUG_STRICT_USER_COPY_CHECKS The help text for this config is duplicated across the x86, parisc, and s390 Kconfig.debug files. Arnd Bergman noted that the help text was slightly misleading and should be fixed to state that enabling this option isn't a problem when using pre 4.4 gcc. To simplify the rewording, consolidate the text into lib/Kconfig.debug and modify it there to be more explicit about when you should say N to this config. Also, make the text a bit more generic by stating that this option enables compile time checks so we can cover architectures which emit warnings vs. ones which emit errors. The details of how an architecture decided to implement the checks isn't as important as the concept of compile time checking of copy_from_user() calls. While we're doing this, remove all the copy_from_user_overflow() code that's duplicated many times and place it into lib/ so that any architecture supporting this option can get the function for free. Signed-off-by: Stephen Boyd Acked-by: Arnd Bergmann Acked-by: Ingo Molnar Acked-by: H. 
Peter Anvin Cc: Arjan van de Ven Acked-by: Helge Deller Cc: Heiko Carstens Cc: Stephen Rothwell Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/Kconfig | 1 + arch/parisc/Kconfig.debug | 14 -------------- arch/s390/Kconfig | 1 + arch/s390/Kconfig.debug | 14 -------------- arch/s390/lib/Makefile | 1 - arch/s390/lib/usercopy.c | 8 -------- arch/sparc/lib/Makefile | 1 - arch/sparc/lib/usercopy.c | 9 --------- arch/tile/Kconfig | 8 +------- arch/tile/include/asm/uaccess.h | 7 ++++++- arch/tile/lib/uaccess.c | 8 -------- arch/x86/Kconfig | 1 + arch/x86/Kconfig.debug | 14 -------------- arch/x86/lib/usercopy_32.c | 6 ------ lib/Kconfig.debug | 18 ++++++++++++++++++ lib/Makefile | 1 + lib/usercopy.c | 9 +++++++++ 17 files changed, 38 insertions(+), 83 deletions(-) delete mode 100644 arch/s390/lib/usercopy.c delete mode 100644 arch/sparc/lib/usercopy.c create mode 100644 lib/usercopy.c (limited to 'lib') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 0339181bf3a..433e75a2ee9 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -1,5 +1,6 @@ config PARISC def_bool y + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_IDE select HAVE_OPROFILE select HAVE_FUNCTION_TRACER if 64BIT diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index 7305ac8f7f5..bc989e522a0 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -12,18 +12,4 @@ config DEBUG_RODATA portion of the kernel code won't be covered by a TLB anymore. If in doubt, say "N". -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict copy size checks" - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. - endmenu diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bda6ba6f3cf..ce640aff61a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -91,6 +91,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index fc32a2df497..c56878e1245 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -17,20 +17,6 @@ config STRICT_DEVMEM If you are unsure, say Y. -config DEBUG_STRICT_USER_COPY_CHECKS - def_bool n - prompt "Strict user copy size checks" - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time warnings. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. 
- config S390_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 6ab0d0b5cec..20b0e97a7df 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,7 +3,6 @@ # lib-y += delay.o string.o uaccess_std.o uaccess_pt.o -obj-y += usercopy.o obj-$(CONFIG_32BIT) += div64.o qrnnd.o ucmpdi2.o mem32.o obj-$(CONFIG_64BIT) += mem64.o lib-$(CONFIG_64BIT) += uaccess_mvcos.o diff --git a/arch/s390/lib/usercopy.c b/arch/s390/lib/usercopy.c deleted file mode 100644 index 14b363fec8a..00000000000 --- a/arch/s390/lib/usercopy.c +++ /dev/null @@ -1,8 +0,0 @@ -#include -#include - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 8410065f286..dbe119b63b4 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -45,4 +45,3 @@ obj-y += iomap.o obj-$(CONFIG_SPARC32) += atomic32.o ucmpdi2.o obj-y += ksyms.o obj-$(CONFIG_SPARC64) += PeeCeeI.o -obj-y += usercopy.o diff --git a/arch/sparc/lib/usercopy.c b/arch/sparc/lib/usercopy.c deleted file mode 100644 index 5c4284ce1c0..00000000000 --- a/arch/sparc/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include -#include -#include - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 25877aebc68..0f712f4e1b3 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -19,6 +19,7 @@ config TILE select HAVE_SYSCALL_WRAPPERS if TILEGX select VIRT_TO_BUS select SYS_HYPERVISOR + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA @@ -114,13 +115,6 @@ config STRICT_DEVMEM config SMP def_bool y -# Allow checking for compile-time determined overflow errors in -# copy_from_user(). There are still unprovable places in the -# generic code as of 2.6.34, so this option is not really compatible -# with -Werror, which is more useful in general. -config DEBUG_COPY_FROM_USER - def_bool n - config HVC_TILE depends on TTY select HVC_DRIVER diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index 9ab078a4605..8a082bc6bca 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -395,7 +395,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -#ifdef CONFIG_DEBUG_COPY_FROM_USER +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS +/* + * There are still unprovable places in the generic code as of 2.6.34, so this + * option is not really compatible with -Werror, which is more useful in + * general. 
+ */ extern void copy_from_user_overflow(void) __compiletime_warning("copy_from_user() size is not provably correct"); diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c index f8d398c9ee7..030abe3ee4f 100644 --- a/arch/tile/lib/uaccess.c +++ b/arch/tile/lib/uaccess.c @@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size) is_arch_mappable_range(addr, size)); } EXPORT_SYMBOL(__range_ok); - -#ifdef CONFIG_DEBUG_COPY_FROM_USER -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 05b057dca4a..5db2117ae28 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -20,6 +20,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 16f738385dc..c198b7e13e7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -292,20 +292,6 @@ config OPTIMIZE_INLINING If unsure, say N. -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict copy size checks" - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. - config DEBUG_NMI_SELFTEST bool "NMI Selftest" depends on DEBUG_KERNEL && X86_LOCAL_APIC diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index f0312d74640..3eb18acd0e4 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -689,9 +689,3 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } EXPORT_SYMBOL(_copy_from_user); - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 77ebaa3dfa1..770a422a42e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1292,6 +1292,24 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + bool + +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict user copy size checks" + depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + help + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, say N. 
+ source mm/Kconfig.debug source kernel/trace/Kconfig diff --git a/lib/Makefile b/lib/Makefile index 23c9a0fe74f..e9c52e1b853 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,6 +15,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o +obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/usercopy.c b/lib/usercopy.c new file mode 100644 index 00000000000..4f5b1ddbcd2 --- /dev/null +++ b/lib/usercopy.c @@ -0,0 +1,9 @@ +#include +#include +#include + +void copy_from_user_overflow(void) +{ + WARN(1, "Buffer overflow detected!\n"); +} +EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.3-70-g09d2 From e12a95f40ac78f223deab743a5228f9eb3df8ed1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 30 Apr 2013 15:28:49 -0700 Subject: notifier-error-inject: fix module names in Kconfig The Kconfig help text for MEMORY_NOTIFIER_ERROR_INJECT and OF_RECONFIG_NOTIFIER_ERROR_INJECT has mismatched module names. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 770a422a42e..566cf2bc08e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1192,7 +1192,7 @@ config MEMORY_NOTIFIER_ERROR_INJECT bash: echo: write error: Cannot allocate memory To compile this code as a module, choose M here: the module will - be called pSeries-reconfig-notifier-error-inject. + be called memory-notifier-error-inject. If unsure, say N. @@ -1209,7 +1209,7 @@ config OF_RECONFIG_NOTIFIER_ERROR_INJECT notified, write the error code to "actions//error". To compile this code as a module, choose M here: the module will - be called memory-notifier-error-inject. + be called of-reconfig-notifier-error-inject. If unsure, say N. -- cgit v1.2.3-70-g09d2 From 6f9982bdde99d4e6f248613ca780c63de26ba086 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 30 Apr 2013 15:28:50 -0700 Subject: lib/decompress.c: fix initconst Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/decompress.c b/lib/decompress.c index 31a80427728..f8fdedaf7b3 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -38,7 +38,7 @@ struct compress_format { decompress_fn decompressor; }; -static const struct compress_format compressed_formats[] __initdata = { +static const struct compress_format compressed_formats[] __initconst = { { {037, 0213}, "gzip", gunzip }, { {037, 0236}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, -- cgit v1.2.3-70-g09d2 From 9e6879460c8edb0cd3c24c09b83d06541b5af0dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 4 May 2013 08:48:27 +0100 Subject: Give the OID registry file module info to avoid kernel tainting Give the OID registry file module information so that it doesn't taint the kernel when compiled as a module and loaded. 
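For illustration, the fix applies the standard module-metadata trio. A minimal hedged sketch of that pattern (the example module, its strings and its init/exit functions are hypothetical; only the MODULE_*() macros mirror what the patch adds):

#include <linux/module.h>

MODULE_DESCRIPTION("Example module");	/* hypothetical description */
MODULE_AUTHOR("Example Author");	/* hypothetical author */
MODULE_LICENSE("GPL");	/* without this, loading sets TAINT_PROPRIETARY_MODULE */

static int __init example_init(void)
{
	return 0;	/* nothing to do; the metadata is the point */
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);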
Reported-by: Dros Adamson Signed-off-by: David Howells cc: Trond Myklebust cc: stable@vger.kernel.org cc: linux-nfs@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/oid_registry.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/oid_registry.c b/lib/oid_registry.c index d8de11f4590..318f382a010 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -9,6 +9,7 @@ * 2 of the Licence, or (at your option) any later version. */ +#include #include #include #include @@ -16,6 +17,10 @@ #include #include "oid_registry_data.c" +MODULE_DESCRIPTION("OID Registry"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + /** * look_up_OID - Find an OID registration for the specified data * @data: Binary representation of the OID -- cgit v1.2.3-70-g09d2 From e2d57f782c8a906f80d5b5f54da803c1638929d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:49 -0700 Subject: rwsem: make the waiter type an enumeration rather than a bitmask We are not planning to add some new waiter flags, so we can convert the waiter type into an enumeration. Background: David Howells suggested I do this back when I tried adding a new waiter type for unfair readers. However, I believe the cleanup applies regardless of that use case. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 19 +++++++++++-------- lib/rwsem.c | 23 +++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 7542afbb22b..5f117f37ac0 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -9,12 +9,15 @@ #include #include +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; int rwsem_is_locked(struct rw_semaphore *sem) @@ -68,7 +71,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!wakewrite) { - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) goto out; goto dont_wake_writers; } @@ -78,7 +81,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) * to -1 here to indicate we get the lock. Instead, we wake it up * to let it go get it again. 
*/ - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { wake_up_process(waiter->task); goto out; } @@ -86,7 +89,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) /* grant an infinite number of read locks to the front of the queue */ dont_wake_writers: woken = 0; - while (waiter->flags & RWSEM_WAITING_FOR_READ) { + while (waiter->type == RWSEM_WAITING_FOR_READ) { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -144,7 +147,7 @@ void __sched __down_read(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_READ; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); list_add_tail(&waiter.list, &sem->wait_list); @@ -201,7 +204,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) /* set up my own style of waitqueue */ tsk = current; waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_WRITE; + waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); /* wait for someone to release the lock */ diff --git a/lib/rwsem.c b/lib/rwsem.c index ad5e0df16ab..672eb33218a 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -30,12 +30,15 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, EXPORT_SYMBOL(__init_rwsem); +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; /* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and @@ -65,7 +68,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) goto readers_only; if (wake_type == RWSEM_WAKE_READ_OWNED) @@ -112,10 +115,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter = list_entry(waiter->list.next, struct rwsem_waiter, list); - } while (waiter->flags & RWSEM_WAITING_FOR_READ); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); adjustment = woken * RWSEM_ACTIVE_READ_BIAS; - if (waiter->flags & RWSEM_WAITING_FOR_READ) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -148,7 +151,7 @@ static int try_get_writer_sem(struct rw_semaphore *sem, /* only steal when first waiter is writing */ fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) return 0; adjustment = RWSEM_ACTIVE_WRITE_BIAS; @@ -179,7 +182,7 @@ try_again_write: */ static struct rw_semaphore __sched * rwsem_down_failed_common(struct rw_semaphore *sem, - unsigned int flags, signed long adjustment) + enum rwsem_waiter_type type, signed long adjustment) { struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +193,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; - waiter.flags = flags; + waiter.type = type; get_task_struct(tsk); if (list_empty(&sem->wait_list)) @@ -221,7 +224,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_lock_irq(&sem->wait_lock); /* Try to get the writer sem, may steal from the head 
writer: */ - if (flags == RWSEM_WAITING_FOR_WRITE) + if (type == RWSEM_WAITING_FOR_WRITE) if (try_get_writer_sem(sem, &waiter)) { raw_spin_unlock_irq(&sem->wait_lock); return sem; -- cgit v1.2.3-70-g09d2 From f7dd1cee9a4e2b1450e4a3732636dfbf28562ee4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:50 -0700 Subject: rwsem: shorter spinlocked section in rwsem_down_failed_common() This change reduces the size of the spinlocked and TASK_UNINTERRUPTIBLE sections in rwsem_down_failed_common(): - We only need the sem->wait_lock to insert ourselves on the wait_list; the waiter node can be prepared outside of the wait_lock. - The task state only needs to be set to TASK_UNINTERRUPTIBLE immediately before checking if we actually need to sleep; it doesn't need to protect the entire function. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 672eb33218a..40636454cf3 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -188,14 +188,12 @@ rwsem_down_failed_common(struct rw_semaphore *sem, struct task_struct *tsk = current; signed long count; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ - raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; waiter.type = type; get_task_struct(tsk); + raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) adjustment += RWSEM_WAITING_BIAS; list_add_tail(&waiter.list, &sem->wait_list); @@ -218,7 +216,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ - for (;;) { + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -231,7 +230,6 @@ rwsem_down_failed_common(struct rw_semaphore *sem, } raw_spin_unlock_irq(&sem->wait_lock); schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); } tsk->state = TASK_RUNNING; -- cgit v1.2.3-70-g09d2 From 1e78277ccbbb48af32a618d1ef0e8534e0b648d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:51 -0700 Subject: rwsem: move rwsem_down_failed_common code into rwsem_down_{read,write}_failed Remove the rwsem_down_failed_common function and replace it with two identical copies of its code in rwsem_down_{read,write}_failed. This is because we want to make different optimizations in rwsem_down_{read,write}_failed; we are adding this pure-duplication step as a separate commit in order to make it easier to check the following steps. 
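Before this commit, the two entry points were thin wrappers that differed only in the arguments passed to the common helper, as the diff below shows:

struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
{
	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
					-RWSEM_ACTIVE_READ_BIAS);
}

struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
					-RWSEM_ACTIVE_WRITE_BIAS);
}

After it, each function carries a private copy of the helper's body, so the read and write slow paths can be specialized independently in the commits that follow.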
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 40636454cf3..fb658af1c12 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -178,12 +178,12 @@ try_again_write: } /* - * wait for a lock to be granted + * wait for the read lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, - enum rwsem_waiter_type type, signed long adjustment) +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; + signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; signed long count; @@ -237,22 +237,64 @@ rwsem_down_failed_common(struct rw_semaphore *sem, return sem; } -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, - -RWSEM_ACTIVE_READ_BIAS); -} - /* * wait for the write lock to be granted */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, - -RWSEM_ACTIVE_WRITE_BIAS); + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; + signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + signed long count; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = type; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es) up. + * + * Alternatively, if we're called from a failed down_write(), there + * were already threads queued before us and there are no active + * writers, the lock must be read owned; so we try to wake any read + * locks that were queued ahead of us. 
*/ + if (count == RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + else if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + + raw_spin_lock_irq(&sem->wait_lock); + /* Try to get the writer sem, may steal from the head writer: */ + if (type == RWSEM_WAITING_FOR_WRITE) + if (try_get_writer_sem(sem, &waiter)) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + raw_spin_unlock_irq(&sem->wait_lock); + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; } /* -- cgit v1.2.3-70-g09d2 From da16922cc031b9c0221c836994276ab193b31de8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:52 -0700 Subject: rwsem: simplify rwsem_down_read_failed When trying to acquire a read lock, the RWSEM_ACTIVE_READ_BIAS adjustment doesn't cause other readers to block, so we never have to worry about waking them back after canceling this adjustment in rwsem_down_read_failed(). We also never want to steal the lock in rwsem_down_read_failed(), so we don't have to grab the wait_lock either. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index fb658af1c12..66f307e9076 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -182,7 +182,6 @@ try_again_write: */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +189,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); @@ -201,17 +200,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ + /* If there are no active locks, wake the front queued process(es). 
*/ if (count == RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); - else if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); raw_spin_unlock_irq(&sem->wait_lock); @@ -220,15 +211,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; - - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - raw_spin_unlock_irq(&sem->wait_lock); schedule(); } -- cgit v1.2.3-70-g09d2 From 023fe4f712028d25b42d31984abae1f3d3f0e3e2 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:53 -0700 Subject: rwsem: simplify rwsem_down_write_failed When waking writers, we never grant them the lock - instead, they have to acquire it themselves when they run, and remove themselves from the wait_list when they succeed. As a result, we can do a few simplifications in rwsem_down_write_failed(): - We don't need to check for !waiter.task since __rwsem_do_wake() doesn't remove writers from the wait_list. - There is no point releasing the wait_lock before entering the wait loop, as we will need to reacquire it immediately. We can change the loop so that the lock is always held at the start of each loop iteration. - We don't need to get a reference on the task structure, since the task is responsible for removing itself from the wait_list. There is no risk, like in the rwsem_down_read_failed() case, that a task would wake up and exit (thus destroying its task structure) while __rwsem_do_wake() is still running - wait_lock protects against that.
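Condensed from the diff that follows — a sketch, not the full function — the resulting writer wait loop holds wait_lock at the top of every iteration and drops it only around schedule():

	raw_spin_lock_irq(&sem->wait_lock);
	/* waiter has already been queued on sem->wait_list */
	while (true) {
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (try_get_writer_sem(sem, &waiter))
			break;			/* acquired; wait_lock still held */
		raw_spin_unlock_irq(&sem->wait_lock);
		schedule();			/* sleep without the lock */
		raw_spin_lock_irq(&sem->wait_lock);
	}
	list_del(&waiter.list);			/* we own the rwsem now */
	raw_spin_unlock_irq(&sem->wait_lock);
	tsk->state = TASK_RUNNING;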
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 66f307e9076..c73bd96dc30 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -161,16 +161,8 @@ static int try_get_writer_sem(struct rw_semaphore *sem, try_again_write: oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) { - /* No active lock: */ - struct task_struct *tsk = waiter->task; - - list_del(&waiter->list); - smp_mb(); - put_task_struct(tsk); - tsk->state = TASK_RUNNING; + if (!(oldcount & RWSEM_ACTIVE_MASK)) return 1; - } /* some one grabbed the sem already */ if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) return 0; @@ -220,11 +212,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } /* - * wait for the write lock to be granted + * wait until we successfully acquire the write lock */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -232,8 +223,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; - get_task_struct(tsk); + waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -255,25 +245,20 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ + /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) + + if (try_get_writer_sem(sem, &waiter)) break; - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } raw_spin_unlock_irq(&sem->wait_lock); schedule(); + raw_spin_lock_irq(&sem->wait_lock); } + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); tsk->state = TASK_RUNNING; return sem; -- cgit v1.2.3-70-g09d2 From ed00f64346631dff035adfb9b0240daaa8b46c4e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:54 -0700 Subject: rwsem: more aggressive lock stealing in rwsem_down_write_failed Some small code simplifications can be achieved by doing more aggressive lock stealing: - When rwsem_down_write_failed() notices that there are no active locks (and thus no thread to wake us if we decided to sleep), it used to wake the first queued process. However, stealing the lock is also sufficient to deal with this case, so we don't need this check anymore. - In try_get_writer_sem(), we can steal the lock even when the first waiter is a reader. This is correct because the code path that wakes readers is protected by the wait_lock.
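With both simplifications applied, try_get_writer_sem() loses its waiter argument and the first-waiter check; reassembled from the diff below as a sketch:

/* Try to get write sem, caller holds sem->wait_lock: */
static int try_get_writer_sem(struct rw_semaphore *sem)
{
	long oldcount, adjustment;

	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
	if (list_is_singular(&sem->wait_list))
		adjustment -= RWSEM_WAITING_BIAS;

try_again_write:
	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
	if (!(oldcount & RWSEM_ACTIVE_MASK))
		return 1;	/* no active lockers: lock stolen */
	/* someone grabbed the sem already; undo and maybe retry */
	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
		return 0;
	goto try_again_write;
}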
As to the performance effects of this change, they are expected to be minimal: readers are still granted the lock (rather than having to acquire it themselves) when they reach the front of the wait queue, so we have essentially the same behavior as in rwsem-spinlock. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index c73bd96dc30..2360bf20409 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -143,20 +143,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } /* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem, - struct rwsem_waiter *waiter) +static int try_get_writer_sem(struct rw_semaphore *sem) { - struct rwsem_waiter *fwaiter; long oldcount, adjustment; - /* only steal when first waiter is writing */ - fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) - return 0; - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - /* Only one waiter in the queue: */ - if (fwaiter == waiter && waiter->list.next == &sem->wait_list) + if (list_is_singular(&sem->wait_list)) adjustment -= RWSEM_WAITING_BIAS; try_again_write: @@ -233,23 +225,18 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); - else if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem, &waiter)) + if (try_get_writer_sem(sem)) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3-70-g09d2 From 5ede972df1cd9294c82e9515949fd2103be81d7b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:55 -0700 Subject: rwsem: use cmpxchg for trying to steal write lock Using rwsem_atomic_update to try stealing the write lock forced us to undo the adjustment in the failure path. We can have simpler and faster code by using cmpxchg instead. 
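In condensed form, the steal becomes a single compare-and-exchange at the head of the wait loop — nothing to undo on failure, since the count is only changed when it is exactly RWSEM_WAITING_BIAS (waiters queued, no active lockers):

	/* Try acquiring the write lock. */
	count = RWSEM_ACTIVE_WRITE_BIAS;
	if (!list_is_singular(&sem->wait_list))
		count += RWSEM_WAITING_BIAS;
	if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
						RWSEM_WAITING_BIAS)
		break;	/* write lock acquired */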
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 2360bf20409..64c2dc007be 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -142,25 +142,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) return sem; } -/* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem) -{ - long oldcount, adjustment; - - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - if (list_is_singular(&sem->wait_list)) - adjustment -= RWSEM_WAITING_BIAS; - -try_again_write: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) - return 1; - /* some one grabbed the sem already */ - if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) - return 0; - goto try_again_write; -} - /* * wait for the read lock to be granted */ @@ -236,7 +217,12 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem)) + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3-70-g09d2 From a7d2c573ae7fad1b2c877d1a1342fa5bb0d6478c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:56 -0700 Subject: rwsem: avoid taking wait_lock in rwsem_down_write_failed In rwsem_down_write_failed(), if there are active locks after we wake up (i.e. the lock got stolen from us), skip taking the wait_lock and go back to sleep immediately. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 64c2dc007be..edf3d9ca670 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -214,8 +214,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* Try acquiring the write lock. */ count = RWSEM_ACTIVE_WRITE_BIAS; @@ -226,7 +226,13 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) break; raw_spin_unlock_irq(&sem->wait_lock); - schedule(); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while (sem->count & RWSEM_ACTIVE_MASK); + raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3-70-g09d2 From 9b0fc9c09f1b262b7fe697eba6b05095d78850e5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:57 -0700 Subject: rwsem: skip initial trylock in rwsem_down_write_failed We can skip the initial trylock in rwsem_down_write_failed() if there are known active lockers already, thus saving one likely-to-fail cmpxchg. 
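Condensed from the diff below, the trylock is now guarded by a plain read of the counter, and the sleep loop refreshes that snapshot before retrying:

	while (true) {
		if (!(count & RWSEM_ACTIVE_MASK)) {
			/* No known active lockers: the cmpxchg can succeed. */
			count = RWSEM_ACTIVE_WRITE_BIAS;
			if (!list_is_singular(&sem->wait_list))
				count += RWSEM_WAITING_BIAS;
			if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
						RWSEM_WAITING_BIAS)
				break;
		}
		raw_spin_unlock_irq(&sem->wait_lock);
		/* Block until there are no active lockers. */
		do {
			schedule();
			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
		raw_spin_lock_irq(&sem->wait_lock);
	}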
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index edf3d9ca670..0d50e46d5b0 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -216,14 +216,15 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) - break; + break; + } raw_spin_unlock_irq(&sem->wait_lock); @@ -231,7 +232,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) do { schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while (sem->count & RWSEM_ACTIVE_MASK); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3-70-g09d2 From 8cf5322ce69afea1fab6a6270db24d057d664798 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:58 -0700 Subject: rwsem: simplify __rwsem_do_wake This is mostly for cleanup value: - We don't need several gotos to handle the case where the first waiter is a writer. Two simple tests will do (and generate very similar code). - In the remainder of the function, we know the first waiter is a reader, so we don't have to double check that. We can use do..while loops to iterate over the readers to wake (generates slightly better code). Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 23 +++++++---------------- lib/rwsem.c | 26 ++++++++++++-------------- 2 files changed, 19 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 5f117f37ac0..9be8a914497 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -70,26 +70,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!wakewrite) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - goto out; - goto dont_wake_writers; - } - - /* - * as we support write lock stealing, we can't set sem->activity - * to -1 here to indicate we get the lock. Instead, we wake it up - * to let it go get it again. - */ if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - wake_up_process(waiter->task); + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. 
*/ + wake_up_process(waiter->task); goto out; } /* grant an infinite number of read locks to the front of the queue */ - dont_wake_writers: woken = 0; - while (waiter->type == RWSEM_WAITING_FOR_READ) { + do { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -99,10 +90,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) wake_up_process(tsk); put_task_struct(tsk); woken++; - if (list_empty(&sem->wait_list)) + if (next == &sem->wait_list) break; waiter = list_entry(next, struct rwsem_waiter, list); - } + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); sem->activity += woken; diff --git a/lib/rwsem.c b/lib/rwsem.c index 0d50e46d5b0..9a675fa9d78 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -68,20 +68,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - goto readers_only; - - if (wake_type == RWSEM_WAKE_READ_OWNED) - /* Another active reader was observed, so wakeup is not - * likely to succeed. Save the atomic op. - */ + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type != RWSEM_WAKE_READ_OWNED) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); goto out; + } - /* Wake up the writing waiter and let the task grab the sem: */ - wake_up_process(waiter->task); - goto out; - - readers_only: /* If we come here from up_xxxx(), another thread might have reached * rwsem_down_failed_common() before we acquired the spinlock and * woken up a waiter, making it now active. We prefer to check for @@ -125,7 +122,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; - for (loop = woken; loop > 0; loop--) { + loop = woken; + do { waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; @@ -133,7 +131,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter->task = NULL; wake_up_process(tsk); put_task_struct(tsk); - } + } while (--loop); sem->wait_list.next = next; next->prev = &sem->wait_list; -- cgit v1.2.3-70-g09d2 From fe6e674c6187d4f452a679ced7e95262bd517936 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:59 -0700 Subject: rwsem: implement support for write lock stealing on the fastpath When we decide to wake up readers, we must first grant them as many read locks as necessary, and then actually wake up all these readers. But in order to know how many read shares to grant, we must first count the readers at the head of the queue. This might take a while if there are many readers, and we want to be protected against a writer stealing the lock while we're counting. To that end, we grant the first reader lock before counting how many more readers are queued. We also require some adjustments to the wake_type semantics. RWSEM_WAKE_NO_ACTIVE used to mean that we had found the count to be RWSEM_WAITING_BIAS, in which case the rwsem was known to be free as nobody could steal it while we hold the wait_lock. This doesn't make sense once we implement fastpath write lock stealing, so we now use RWSEM_WAKE_ANY in that case. 
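The grant-first-then-count approach can be reassembled from the __rwsem_do_wake() hunk further below as a sketch: the waker grants one read lock up front, and backs the grant out again if it observes that a writer got there first:

	adjustment = 0;
	if (wake_type != RWSEM_WAKE_READ_OWNED) {
		adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
			/* A writer stole the lock. Undo our reader grant. */
			if (rwsem_atomic_update(-adjustment, sem) &
						RWSEM_ACTIVE_MASK)
				goto out;	/* writer still active */
			/* Last active locker left. Retry waking readers. */
			goto try_reader_grant;
		}
	}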
Similarly, when rwsem_down_write_failed found that a read lock was active, it would use RWSEM_WAKE_READ_OWNED which signalled that new readers could be woken without checking first that the rwsem was available. We can't do that anymore since the existing readers might release their read locks, and a writer could steal the lock before we wake up additional readers. So, we have to use a new RWSEM_WAKE_READERS value to indicate we only want to wake readers, but we don't currently hold any read lock. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 64 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 9a675fa9d78..bbe48c04f36 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -4,6 +4,7 @@ * Derived from arch/i386/kernel/semaphore.c * * Writer lock-stealing by Alex Shi + * and Michel Lespinasse */ #include #include @@ -41,13 +42,11 @@ struct rwsem_waiter { enum rwsem_waiter_type type; }; -/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and - * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held - * since the rwsem value was observed. - */ -#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ -#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */ -#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */ +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; /* * handle the lock release when processes blocked on it that can now run @@ -60,16 +59,16 @@ struct rwsem_waiter { * - writers are only woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long woken, loop, adjustment; + signed long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type != RWSEM_WAKE_READ_OWNED) + if (wake_type == RWSEM_WAKE_ANY) /* Wake writer at the front of the queue, but do not * grant it the lock yet as we want other writers * to be able to steal it. Readers, on the other hand, @@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) goto out; } - /* If we come here from up_xxxx(), another thread might have reached - * rwsem_down_failed_common() before we acquired the spinlock and - * woken up a waiter, making it now active. We prefer to check for - * this first in order to not spend too much time with the spinlock - * held if we're not going to be able to wake up readers in the end. - * - * Note that we do not need to update the rwsem count: any writer - * trying to acquire rwsem will run rwsem_down_write_failed() due - * to the waiting threads and block trying to acquire the spinlock. - * - * We use a dummy atomic update in order to acquire the cache line - * exclusively since we expect to succeed and run the final rwsem - * count adjustment pretty soon. + /* Writers might steal the lock before we grant it to the next reader. 
+ * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. */ - if (wake_type == RWSEM_WAKE_ANY && - rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) - /* Someone grabbed the sem for write already */ - goto out; + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } /* Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by @@ -114,12 +113,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - adjustment = woken * RWSEM_ACTIVE_READ_BIAS; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; - rwsem_atomic_add(adjustment, sem); + if (adjustment) + rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; loop = woken; @@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(adjustment, sem); /* If there are no active locks, wake the front queued process(es). */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + if (!(count & RWSEM_ACTIVE_MASK)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); @@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS && adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); -- cgit v1.2.3-70-g09d2 From 25c39325968bbcebe6cd2a1991228c9dfb48d655 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:46:00 -0700 Subject: rwsem: do not block readers at head of queue if other readers are active This change fixes a race condition where a reader might determine it needs to block, but by the time it acquires the wait_lock the rwsem has active readers and no queued waiters. In this situation the reader can run in parallel with the existing active readers; it does not need to block until the active readers complete. Thanks to Peter Hurley for noticing this possible race. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index bbe48c04f36..61f91ca75e4 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -163,8 +163,14 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es). */ - if (!(count & RWSEM_ACTIVE_MASK)) + /* If there are no active locks, wake the front queued process(es). 
+ * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3-70-g09d2 From b5f541810ea9fb98d93c0ee0e00e07a22874856f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 06:46:02 -0700 Subject: rwsem: no need for explicit signed longs Change explicit "signed long" declarations into plain "long" as suggested by Peter Hurley. Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Michel Lespinasse Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 61f91ca75e4..cf0ad2ad19f 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -64,7 +64,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop, adjustment; + long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { @@ -145,10 +145,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; @@ -193,10 +192,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; -- cgit v1.2.3-70-g09d2 From 2d864e41710f1d2ba406fb62018ab0487152e6f2 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 7 May 2013 15:37:48 -0700 Subject: kref: minor cleanup - make warning smp-safe - result of atomic _unless_zero functions should be checked by caller to avoid use-after-free error - trivial whitespace fix. Link: https://lkml.org/lkml/2013/4/12/391 Tested: compile x86, boot machine and run xfstests Signed-off-by: Anatol Pomozov [ Removed line-break, changed to use WARN_ON_ONCE() - Linus ] Signed-off-by: Linus Torvalds --- include/linux/kref.h | 9 ++++++--- lib/kobject.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/kref.h b/include/linux/kref.h index 4972e6e9ca9..e15828fd71f 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -39,8 +39,11 @@ static inline void kref_init(struct kref *kref) */ static inline void kref_get(struct kref *kref) { - WARN_ON(!atomic_read(&kref->refcount)); - atomic_inc(&kref->refcount); + /* If refcount was 0 before incrementing then we have a race + * condition when this kref is freeing by some other thread right now. 
+ * In this case one should use kref_get_unless_zero() + */ + WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2); } /** @@ -100,7 +103,7 @@ static inline int kref_put_mutex(struct kref *kref, struct mutex *lock) { WARN_ON(release == NULL); - if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { + if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { mutex_lock(lock); if (unlikely(!atomic_dec_and_test(&kref->refcount))) { mutex_unlock(lock); diff --git a/lib/kobject.c b/lib/kobject.c index a65486613d7..b7e29a6056d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,7 +529,7 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } -static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; -- cgit v1.2.3-70-g09d2 From 9607a85b67a9714290a78c1a56630ab1c9fa2c23 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 15:39:03 -0700 Subject: rwsem: check counter to avoid cmpxchg calls This patch tries to reduce the number of cmpxchg calls in the writer failed path by checking the counter value first before issuing the instruction. If ->count is not set to RWSEM_WAITING_BIAS then there is no point wasting a cmpxchg call. Furthermore, Michel states "I suppose it helps due to the case where someone else steals the lock while we're trying to acquire sem->wait_lock." Two very different workloads and machines were used to see how this patch improves throughput: pgbench on a quad-core laptop and aim7 on a large 8 socket box with 80 cores. Some results comparing Michel's fast-path write lock stealing (tps-rwsem) on a quad-core laptop running pgbench:

| db_size | clients  |  tps-rwsem     |   tps-patch  |
+---------+----------+----------------+--------------+
| 160 MB  |        1 |           6906 |         9153 | + 32.5%
| 160 MB  |        2 |          15931 |        22487 | + 41.1%
| 160 MB  |        4 |          33021 |        32503 |
| 160 MB  |        8 |          34626 |        34695 |
| 160 MB  |       16 |          33098 |        34003 |
| 160 MB  |       20 |          31343 |        31440 |
| 160 MB  |       30 |          28961 |        28987 |
| 160 MB  |       40 |          26902 |        26970 |
| 160 MB  |       50 |          25760 |        25810 |
------------------------------------------------------
| 1.6 GB  |        1 |           7729 |         7537 |
| 1.6 GB  |        2 |          19009 |        23508 | + 23.7%
| 1.6 GB  |        4 |          33185 |        32666 |
| 1.6 GB  |        8 |          34550 |        34318 |
| 1.6 GB  |       16 |          33079 |        32689 |
| 1.6 GB  |       20 |          31494 |        31702 |
| 1.6 GB  |       30 |          28535 |        28755 |
| 1.6 GB  |       40 |          27054 |        27017 |
| 1.6 GB  |       50 |          25591 |        25560 |
------------------------------------------------------
| 7.6 GB  |        1 |           6224 |         7469 | + 20.0%
| 7.6 GB  |        2 |          13611 |        12778 |
| 7.6 GB  |        4 |          33108 |        32927 |
| 7.6 GB  |        8 |          34712 |        34878 |
| 7.6 GB  |       16 |          32895 |        33003 |
| 7.6 GB  |       20 |          31689 |        31974 |
| 7.6 GB  |       30 |          29003 |        28806 |
| 7.6 GB  |       40 |          26683 |        26976 |
| 7.6 GB  |       50 |          25925 |        25652 |
------------------------------------------------------

For the aim7 workloads, the results overall improved on top of Michel's patchset.
For full graphs on how the rwsem series plus this patch behaves on a large 8 socket machine against a vanilla kernel: http://stgolabs.net/rwsem-aim7-results.tar.gz Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index cf0ad2ad19f..19c5fa95e0b 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -223,7 +223,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = RWSEM_ACTIVE_WRITE_BIAS; if (!list_is_singular(&sem->wait_list)) count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) break; } -- cgit v1.2.3-70-g09d2
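Condensed, the guard added by this final patch issues the atomic instruction only when a plain load of the counter says it can succeed — a sketch of the resulting acquisition attempt:

	/* Try acquiring the write lock. */
	count = RWSEM_ACTIVE_WRITE_BIAS;
	if (!list_is_singular(&sem->wait_list))
		count += RWSEM_WAITING_BIAS;

	/* If ->count is not RWSEM_WAITING_BIAS, the cmpxchg would
	 * fail anyway; skip it and go back to waiting. */
	if (sem->count == RWSEM_WAITING_BIAS &&
	    cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
						RWSEM_WAITING_BIAS)
		break;	/* write lock acquired */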