From 408af87a397a8ddef56ad39a79481f592aa1ac1a Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 4 Nov 2010 21:44:41 +0100 Subject: Clean up relay_alloc_page_array() slightly by using vzalloc rather than vmalloc and memset We can optimize kernel/relay.c::relay_alloc_page_array() slightly by using vzalloc. The patch makes these changes: - use vzalloc instead of vmalloc+memset. - remove redundant local variable 'array'. - declare local 'pa_size' as const. Cuts down nicely on both source and object-code size. Signed-off-by: Jesper Juhl Acked-by: Pekka Enberg Acked-by: Mathieu Desnoyers Signed-off-by: Linus Torvalds --- kernel/relay.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb92..859ea5a9605 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - struct page **array; - size_t pa_size = n_pages * sizeof(struct page *); - - if (pa_size > PAGE_SIZE) { - array = vmalloc(pa_size); - if (array) - memset(array, 0, pa_size); - } else { - array = kzalloc(pa_size, GFP_KERNEL); - } - return array; + const size_t pa_size = n_pages * sizeof(struct page *); + if (pa_size > PAGE_SIZE) + return vzalloc(pa_size); + return kzalloc(pa_size, GFP_KERNEL); } /* -- cgit v1.2.3-70-g09d2 From e0a70217107e6f9844628120412cb27bb4cea194 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 5 Nov 2010 16:53:42 +0100 Subject: posix-cpu-timers: workaround to suppress the problems with mt exec posix-cpu-timers.c correctly assumes that the dying process does posix_cpu_timers_exit_group() and removes all !CPUCLOCK_PERTHREAD timers from signal->cpu_timers list. But, it also assumes that timer->it.cpu.task is always the group leader, and thus the dead ->task means the dead thread group. This is obviously not true after de_thread() changes the leader. After that almost every posix_cpu_timer_ method has problems. It is not simple to fix this bug correctly. First of all, I think that timer->it.cpu should use struct pid instead of task_struct. Also, the locking should be reworked completely. In particular, tasklist_lock should not be used at all. This all needs a lot of nontrivial and hard-to-test changes. Change __exit_signal() to do posix_cpu_timers_exit_group() when the old leader dies during exec. This is not the fix, just the temporary hack to hide the problem for 2.6.37 and stable. IOW, this is obviously wrong but this is what we currently have anyway: cpu timers do not work after mt exec. In theory this change adds another race. The exiting leader can detach the timers which were attached to the new leader. However, the window between de_thread() and release_task() is small, we can pretend that sys_timer_create() was called before de_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b194febf579..21aa7b3001f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk) tty = sig->tty; sig->tty = NULL; } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + /* * If there is any task waiting for the group exit * then notify it: -- cgit v1.2.3-70-g09d2 From 433039e97f672b81e6c8f6daef385dcf035c6e29 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 5 Nov 2010 16:17:39 -0700 Subject: watchdog: Fix section mismatch and potential undefined behavior. Commit d9ca07a05ce1 ("watchdog: Avoid kernel crash when disabling watchdog") introduces a section mismatch. Now that we reference no_watchdog from non-__init code it can no longer be __initdata. Signed-off-by: David Daney Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d..6e3c41a4024 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __initdata no_watchdog; +static int no_watchdog; /* boot commands */ -- cgit v1.2.3-70-g09d2 From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ------- block/elevator.c | 4 ++-- drivers/block/aoe/aoeblk.c | 3 --- drivers/block/loop.c | 6 ------ drivers/block/xen-blkfront.c | 2 -- drivers/scsi/scsi_error.c | 18 +++++------------- drivers/usb/storage/uas.c | 5 +---- include/linux/bio.h | 4 ---- include/linux/blk_types.h | 6 ++---- include/linux/blkdev.h | 3 +-- kernel/trace/blktrace.c | 4 ---- 11 files changed, 11 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 17fcb83670c..4ce953f1b39 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1194,13 +1194,6 @@ static int __make_request(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; int rw_flags; - /* REQ_HARDBARRIER is no more */ - if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, - "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/elevator.c b/block/elevator.c index 282e8308f7e..2569512830d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -429,7 +429,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; + stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -691,7 +691,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 541e1887996..528f6318ded 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -180,9 +180,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) BUG(); bio_endio(bio, -ENXIO); return 0; - } else if (bio->bi_rw & REQ_HARDBARRIER) { - bio_endio(bio, -EOPNOTSUPP); - return 0; } else if (bio->bi_io_vec == NULL) { printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); BUG(); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1e5284ef65f..7ea0bea2f7e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -481,12 +481,6 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) if (bio_rw(bio) == WRITE) { struct file *file = lo->lo_backing_file; - /* REQ_HARDBARRIER is deprecated */ - if (bio->bi_rw & REQ_HARDBARRIER) { - ret = -EOPNOTSUPP; - goto out; - } - if (bio->bi_rw & REQ_FLUSH) { ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 06e2812ba12..255035cfc88 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -289,8 +289,6 @@ static int blkif_queue_request(struct request *req) ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & REQ_HARDBARRIER) - ring_req->operation = BLKIF_OP_WRITE_BARRIER; ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 1de30eb83bb..f3cf924a2cd 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -320,19 +320,11 @@ static int scsi_check_sense(struct scsi_cmnd *scmd) "changed. The Linux SCSI layer does not " "automatically adjust these parameters.\n"); - if (scmd->request->cmd_flags & REQ_HARDBARRIER) - /* - * barrier requests should always retry on UA - * otherwise block will get a spurious error - */ - return NEEDS_RETRY; - else - /* - * for normal (non barrier) commands, pass the - * UA upwards for a determination in the - * completion functions - */ - return SUCCESS; + /* + * Pass the UA upwards for a determination in the completion + * functions. + */ + return SUCCESS; /* these three are not supported */ case COPY_ABORTED: diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 2054b1e25a6..d1268191acb 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -331,10 +331,7 @@ static struct urb *uas_alloc_cmd_urb(struct uas_dev_info *devinfo, gfp_t gfp, iu->iu_id = IU_ID_COMMAND; iu->tag = cpu_to_be16(stream_id); - if (sdev->ordered_tags && (cmnd->request->cmd_flags & REQ_HARDBARRIER)) - iu->prio_attr = UAS_ORDERED_TAG; - else - iu->prio_attr = UAS_SIMPLE_TAG; + iu->prio_attr = UAS_SIMPLE_TAG; iu->len = len; int_to_scsilun(sdev->lun, &iu->lun); memcpy(iu->cdb, cmnd->cmnd, cmnd->cmd_len); diff --git a/include/linux/bio.h b/include/linux/bio.h index ba679992d39..35dcdb3589b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -66,10 +66,6 @@ #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_empty_barrier(bio) \ - ((bio->bi_rw & REQ_HARDBARRIER) && \ - !bio_has_data(bio) && \ - !(bio->bi_rw & REQ_DISCARD)) static inline unsigned int bio_cur_bytes(struct bio *bio) { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0437ab6bb54..46ad5197537 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -122,7 +122,6 @@ enum rq_flag_bits { __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_HARDBARRIER, /* may not be passed by drive either */ __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_DISCARD, /* request to discard sectors */ @@ -159,7 +158,6 @@ enum rq_flag_bits { #define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) #define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) #define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) -#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) #define REQ_SYNC (1 << __REQ_SYNC) #define REQ_META (1 << __REQ_META) #define REQ_DISCARD (1 << __REQ_DISCARD) @@ -168,8 +166,8 @@ enum rq_flag_bits { #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ - (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \ - REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) + (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_DISCARD | \ + REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define REQ_CLONE_MASK REQ_COMMON_MASK #define REQ_UNPLUG (1 << __REQ_UNPLUG) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5027a599077..aae86fd10c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -552,8 +552,7 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync) * it already be started by driver. */ #define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \ - REQ_FLUSH | REQ_FUA) + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) #define rq_mergeable(rq) \ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ (((rq)->cmd_flags & REQ_DISCARD) || \ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) -- cgit v1.2.3-70-g09d2 From eed01528a45dc4138e9a08064b4b6cc1a9426899 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 26 Oct 2010 16:08:01 +0200 Subject: perf_events: Fix time tracking in samples This patch corrects time tracking in samples. Without this patch both time_enabled and time_running are bogus when user asks for PERF_SAMPLE_READ. One uses PERF_SAMPLE_READ to sample the values of other counters in each sample. Because of multiplexing, it is necessary to know both time_enabled, time_running to be able to scale counts correctly. In this second version of the patch, we maintain a shadow copy of ctx->time which allows us to compute ctx->time without calling update_context_time() from NMI context. We avoid the issue that update_context_time() must always be called with ctx->lock held. We do not keep shadow copies of the other event timings because if the lead event is overflowing then it is active and thus it's been scheduled in via event_sched_in() in which case neither tstamp_stopped, tstamp_running can be modified. This timing logic only applies to samples when PERF_SAMPLE_READ is used. Note that this patch does not address timing issues related to sampling inheritance between tasks. This will be addressed in a future patch. With this patch, the libpfm4 example task_smpl now reports correct counts (shown on 2.4GHz Core 2): $ task_smpl -p 2400000000 -e unhalted_core_cycles:u,instructions_retired:u,baclears noploop 5 noploop for 5 seconds IIP:0x000000004006d6 PID:5596 TID:5596 TIME:466,210,211,430 STREAM_ID:33 PERIOD:2,400,000,000 ENA=1,010,157,814 RUN=1,010,157,814 NR=3 2,400,000,254 unhalted_core_cycles:u (33) 2,399,273,744 instructions_retired:u (34) 53,340 baclears (35) Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cc6e14b.1e07e30a.256e.5190@mx.google.com> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/perf_event.c | 42 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 057bf22a832..40150f34598 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -747,6 +747,16 @@ struct perf_event { u64 tstamp_running; u64 tstamp_stopped; + /* + * timestamp shadows the actual context timing but it can + * be safely used in NMI interrupt context. It reflects the + * context time as it was when the event was last scheduled in. + * + * ctx_time already accounts for ctx->timestamp. Therefore to + * compute ctx_time for a sample, simply add perf_clock(). + */ + u64 shadow_ctx_time; + struct perf_event_attr attr; struct hw_perf_event hw; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f498..cb6c0d2af68 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -674,6 +674,8 @@ event_sched_in(struct perf_event *event, event->tstamp_running += ctx->time - event->tstamp_stopped; + event->shadow_ctx_time = ctx->time - ctx->timestamp; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -3396,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[4]; @@ -3404,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + + values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + + values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) @@ -3421,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. */ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; @@ -3431,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; + values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; + values[n++] = running; if (leader != event) leader->pmu->read(leader); @@ -3459,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, } } +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event); + perf_output_read_group(handle, event, enabled, running); else - perf_output_read_one(handle, event); + perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, -- cgit v1.2.3-70-g09d2 From 834b40380e93e36f1c9b48ec1d280cebe3d7bd8c Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Thu, 11 Nov 2010 14:05:14 -0800 Subject: kernel/range.c: fix clean_sort_range() for the case of full array clean_sort_range() should return a number of nonempty elements of range array, but if the array is full clean_sort_range() returns 0. The problem is that the number of nonempty elements is evaluated by finding the first empty element of the array. If there is no such element it returns an initial value of local variable nr_range that is zero. The fix is trivial: it changes initial value of nr_range to size of the array. The bug can lead to loss of information regarding all ranges, since typically returned value of clean_sort_range() is considered as an actual number of ranges in the array after a series of add/subtract operations. Found by Analytical Verification project of Linux Verification Center (linuxtesting.org), thanks to Alexander Kolosov. Signed-off-by: Alexey Khoroshilov Cc: Yinghai Lu Cc: "H. Peter Anvin" Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb..37fa9b99ad5 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2) int clean_sort_range(struct range *range, int az) { - int i, j, k = az - 1, nr_range = 0; + int i, j, k = az - 1, nr_range = az; for (i = 0; i < k; i++) { if (range[i].end) -- cgit v1.2.3-70-g09d2 From 38715258aa2e8cd94bd4aafadc544e5104efd551 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 11 Nov 2010 14:05:16 -0800 Subject: latencytop: fix per task accumulator Per task latencytop accumulator prematurely terminates due to erroneous placement of latency_record_count. It should be incremented whenever a new record is allocated instead of increment on every latencytop event. Also fix search iterator to only search known record events instead of blindly searching all pre-allocated space. Signed-off-by: Ken Chen Reviewed-by: Arjan van de Ven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d41..17110a4a4fc 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) account_global_scheduler_latency(tsk, &lat); - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT; i++) { + for (i = 0; i < tsk->latency_record_count; i++) { struct latency_record *mylat; int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) } } + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + /* Allocated a new one: */ - i = tsk->latency_record_count; + i = tsk->latency_record_count++; memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: -- cgit v1.2.3-70-g09d2 From eaf06b241b091357e72b76863ba16e89610d31bd Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Thu, 11 Nov 2010 14:05:18 -0800 Subject: Restrict unprivileged access to kernel syslog The kernel syslog contains debugging information that is often useful during exploitation of other vulnerabilities, such as kernel heap addresses. Rather than futilely attempt to sanitize hundreds (or thousands) of printk statements and simultaneously cripple useful debugging functionality, it is far simpler to create an option that prevents unprivileged users from reading the syslog. This patch, loosely based on grsecurity's GRKERNSEC_DMESG, creates the dmesg_restrict sysctl. When set to "0", the default, no restrictions are enforced. When set to "1", only users with CAP_SYS_ADMIN can read the kernel syslog via dmesg(8) or other mechanisms. [akpm@linux-foundation.org: explain the config option in kernel.txt] Signed-off-by: Dan Rosenberg Acked-by: Ingo Molnar Acked-by: Eugene Teo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ include/linux/kernel.h | 1 + kernel/printk.c | 6 ++++++ kernel/sysctl.c | 9 +++++++++ security/Kconfig | 12 ++++++++++++ security/commoncap.c | 2 ++ 6 files changed, 44 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 3894eaa2348..209e1584c3d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -28,6 +28,7 @@ show up in /proc/sys/kernel: - core_uses_pid - ctrl-alt-del - dentry-state +- dmesg_restrict - domainname - hostname - hotplug @@ -213,6 +214,19 @@ to decide what to do with it. ============================================================== +dmesg_restrict: + +This toggle indicates whether unprivileged users are prevented from using +dmesg(8) to view messages from the kernel's log buffer. When +dmesg_restrict is set to (0) there are no restrictions. When +dmesg_restrict is set set to (1), users must have CAP_SYS_ADMIN to use +dmesg(8). + +The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the default +value of dmesg_restrict. + +============================================================== + domainname & hostname: These files can be used to set the NIS/YP domainname and the diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b526947bdf4..fc3da9e4da1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -293,6 +293,7 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); extern int printk_delay_msec; +extern int dmesg_restrict; /* * Print a one-time message (analogous to WARN_ONCE() et al): diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c37..38e7d5868d6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void) } #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799..b65bf634035 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -703,6 +703,15 @@ static struct ctl_table kern_table[] = { .extra2 = &ten_thousand, }, #endif + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ngroups_max", .data = &ngroups_max, diff --git a/security/Kconfig b/security/Kconfig index bd72ae62349..e80da955e68 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -39,6 +39,18 @@ config KEYS_DEBUG_PROC_KEYS If you are unsure as to whether this is required, answer N. +config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" + default n + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). + + If this option is not selected, no restrictions will be enforced + unless the dmesg_restrict sysctl is explicitly set to (1). + + If you are unsure how to answer this question, answer N. + config SECURITY bool "Enable different security models" depends on SYSFS diff --git a/security/commoncap.c b/security/commoncap.c index 5e632b4857e..04b80f9912b 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -895,6 +895,8 @@ int cap_syslog(int type, bool from_file) { if (type != SYSLOG_ACTION_OPEN && from_file) return 0; + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; if ((type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3-70-g09d2 From 12b3052c3ee8f508b2c7ee4ddd63ed03423409d8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 15 Nov 2010 18:36:29 -0500 Subject: capabilities/syslog: open code cap_syslog logic to fix build failure The addition of CONFIG_SECURITY_DMESG_RESTRICT resulted in a build failure when CONFIG_PRINTK=n. This is because the capabilities code which used the new option was built even though the variable in question didn't exist. The patch here fixes this by moving the capabilities checks out of the LSM and into the caller. All (known) LSMs should have been calling the capabilities hook already so it actually makes the code organization better to eliminate the hook altogether. Signed-off-by: Eric Paris Acked-by: James Morris Signed-off-by: Linus Torvalds --- include/linux/security.h | 9 ++++----- kernel/printk.c | 15 ++++++++++++++- security/capability.c | 5 +++++ security/commoncap.c | 21 --------------------- security/security.c | 4 ++-- security/selinux/hooks.c | 6 +----- security/smack/smack_lsm.c | 8 ++------ 7 files changed, 28 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index b8246a8df7d..fd4d55fb884 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -77,7 +77,6 @@ extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, extern int cap_task_setscheduler(struct task_struct *p); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); -extern int cap_syslog(int type, bool from_file); extern int cap_vm_enough_memory(struct mm_struct *mm, long pages); struct msghdr; @@ -1388,7 +1387,7 @@ struct security_operations { int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type, bool from_file); + int (*syslog) (int type); int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); @@ -1671,7 +1670,7 @@ int security_real_capable_noaudit(struct task_struct *tsk, int cap); int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); -int security_syslog(int type, bool from_file); +int security_syslog(int type); int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); @@ -1901,9 +1900,9 @@ static inline int security_quota_on(struct dentry *dentry) return 0; } -static inline int security_syslog(int type, bool from_file) +static inline int security_syslog(int type) { - return cap_syslog(type, from_file); + return 0; } static inline int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/kernel/printk.c b/kernel/printk.c index 38e7d5868d6..9a2264fc42c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -274,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) char c; int error = 0; - error = security_syslog(type, from_file); + /* + * If this is from /proc/kmsg we only do the capabilities checks + * at open time. + */ + if (type == SYSLOG_ACTION_OPEN || !from_file) { + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + error = security_syslog(type); if (error) return error; diff --git a/security/capability.c b/security/capability.c index 30ae00fbecd..c773635ca3a 100644 --- a/security/capability.c +++ b/security/capability.c @@ -17,6 +17,11 @@ static int cap_sysctl(ctl_table *table, int op) return 0; } +static int cap_syslog(int type) +{ + return 0; +} + static int cap_quotactl(int cmds, int type, int id, struct super_block *sb) { return 0; diff --git a/security/commoncap.c b/security/commoncap.c index 04b80f9912b..64c2ed9c901 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -27,7 +27,6 @@ #include #include #include -#include /* * If a non-root user executes a setuid-root binary in @@ -883,26 +882,6 @@ error: return error; } -/** - * cap_syslog - Determine whether syslog function is permitted - * @type: Function requested - * @from_file: Whether this request came from an open file (i.e. /proc) - * - * Determine whether the current process is permitted to use a particular - * syslog function, returning 0 if permission is granted, -ve if not. - */ -int cap_syslog(int type, bool from_file) -{ - if (type != SYSLOG_ACTION_OPEN && from_file) - return 0; - if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) - return -EPERM; - if ((type != SYSLOG_ACTION_READ_ALL && - type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - return 0; -} - /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made diff --git a/security/security.c b/security/security.c index 3ef5e2a7a74..1b798d3df71 100644 --- a/security/security.c +++ b/security/security.c @@ -197,9 +197,9 @@ int security_quota_on(struct dentry *dentry) return security_ops->quota_on(dentry); } -int security_syslog(int type, bool from_file) +int security_syslog(int type) { - return security_ops->syslog(type, from_file); + return security_ops->syslog(type); } int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d9154cf90ae..65fa8bf596f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1973,14 +1973,10 @@ static int selinux_quota_on(struct dentry *dentry) return dentry_has_perm(cred, NULL, dentry, FILE__QUOTAON); } -static int selinux_syslog(int type, bool from_file) +static int selinux_syslog(int type) { int rc; - rc = cap_syslog(type, from_file); - if (rc) - return rc; - switch (type) { case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index bc39f4067af..489a85afa47 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -157,15 +157,11 @@ static int smack_ptrace_traceme(struct task_struct *ptp) * * Returns 0 on success, error code otherwise. */ -static int smack_syslog(int type, bool from_file) +static int smack_syslog(int typefrom_file) { - int rc; + int rc = 0; char *sp = current_security(); - rc = cap_syslog(type, from_file); - if (rc != 0) - return rc; - if (capable(CAP_MAC_OVERRIDE)) return 0; -- cgit v1.2.3-70-g09d2