From f6d87f4bd259cf33e092cd1a8fde05f291c47af1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 13:18:30 +0100 Subject: genirq: keep affinities set from userspace across free/request_irq() Impact: preserve user-modified affinities on interrupts Kumar Galak noticed that commit 18404756765c713a0be4eb1082920c04822ce588 (genirq: Expose default irq affinity mask (take 3)) overrides an already set affinity setting across a free / request_irq(). Happens e.g. with ifdown/ifup of a network device. Change the logic to mark the affinities as set and keep them intact. This also fixes the unlocked access to irq_desc in irq_select_affinity() when called from irq_affinity_proc_write() Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 58 +++++++++++++++++++++++++++++++++++++++++--------- kernel/irq/migration.c | 11 ---------- kernel/irq/proc.c | 2 +- 4 files changed, 51 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c9767e64198..64c1c7253da 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern int irq_select_affinity_usr(unsigned int irq); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c498a1b8c62..634a2a95510 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq) int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; if (!desc->chip->set_affinity) return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); + #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); - spin_unlock_irqrestore(&desc->lock, flags); - } else - set_pending_irq(irq, cpumask); + } else { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = cpumask; + } #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); #endif + desc->status |= IRQ_AFFINITY_SET; + spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) /* * Generic version of the affinity autoselector. */ -int irq_select_affinity(unsigned int irq) +int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { cpumask_t mask; - struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - desc = irq_to_desc(irq); + /* + * Preserve an userspace affinity setup, but make sure that + * one of the targets is online. + */ + if (desc->status & IRQ_AFFINITY_SET) { + if (cpus_intersects(desc->affinity, cpu_online_map)) + mask = desc->affinity; + else + desc->status &= ~IRQ_AFFINITY_SET; + } + desc->affinity = mask; desc->chip->set_affinity(irq, mask); return 0; } +#else +static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) +{ + return irq_select_affinity(irq); +} #endif +/* + * Called when affinity is set via /proc/irq + */ +int irq_select_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + spin_lock_irqsave(&desc->lock, flags); + ret = do_irq_select_affinity(irq, desc); + spin_unlock_irqrestore(&desc->lock, flags); + + return ret; +} + +#else +static inline int do_select_irq_affinity(int irq, struct irq_desc *desc) +{ + return 0; +} #endif /** @@ -446,7 +484,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) desc->depth = 1; /* Set default affinity mask once everything is setup */ - irq_select_affinity(irq); + do_irq_select_affinity(irq, desc); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 90b920d3f52..9db681d9581 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,17 +1,6 @@ #include -void set_pending_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - spin_unlock_irqrestore(&desc->lock, flags); -} - void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4d161c70ba5..d257e7d6a8a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity(irq) ? -EINVAL : count; + return irq_select_affinity_usr(irq) ? -EINVAL : count; irq_set_affinity(irq, new_value); -- cgit v1.2.3-70-g09d2 From 612e3684c1b7752d2890510e4f90115fd1eb2afb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 13:58:46 +0100 Subject: genirq: fix the affinity setting in setup_irq The affinity setting in setup irq is called before the NO_BALANCING flag is checked and might therefore override affinity settings from the calling code with the default setting. Move the NO_BALANCING flag check before the call to the affinity setting. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 634a2a95510..948a22a2c01 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -123,7 +123,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & IRQ_AFFINITY_SET) { + if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { if (cpus_intersects(desc->affinity, cpu_online_map)) mask = desc->affinity; else @@ -483,6 +483,10 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) /* Undo nested disables: */ desc->depth = 1; + /* Exclude IRQ from balancing if requested */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + /* Set default affinity mask once everything is setup */ do_irq_select_affinity(irq, desc); @@ -497,10 +501,6 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) *p = new; - /* Exclude IRQ from balancing */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; - /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; -- cgit v1.2.3-70-g09d2 From f131e2436ddbac2527bb2d6297a823aae4b024f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 8 Nov 2008 09:57:40 +0100 Subject: irq: fix typo Impact: build fix fix build failure on UP. Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 948a22a2c01..435861284e4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -159,7 +159,7 @@ int irq_select_affinity_usr(unsigned int irq) } #else -static inline int do_select_irq_affinity(int irq, struct irq_desc *desc) +static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) { return 0; } -- cgit v1.2.3-70-g09d2 From 3ff68a6a106c362a6811d3e51bced58e6fc87de7 Mon Sep 17 00:00:00 2001 From: Mark Nelson Date: Thu, 13 Nov 2008 21:37:41 +1100 Subject: genirq: __irq_set_trigger: change pr_warning to pr_debug Commit 0c5d1eb77a8be917b638344a22afe1398236482b (genirq: record trigger type) caused powerpc platforms that had no set_type() function in their struct irq_chip to spew out warnings about "No set_type function for IRQ...". This warning isn't necessarily justified though because the generic powerpc platform code calls set_irq_type() (which in turn calls __irq_set_trigger) with information from the device tree to establish the interrupt mappings, regardless of whether the PIC can actually set a type. A platform's irq_chip might not have a set_type function for a variety of reasons, for example: the platform may have the type essentially hard-coded, or as in the case for Cell interrupts are just messages past around that have no real concept of type, or the platform could even have a virtual PIC as on the PS3. Signed-off-by: Mark Nelson Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 435861284e4..801addda3c4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -365,7 +365,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_warning("No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", irq, chip ? (chip->name ? : "unknown") : "unknown"); return 0; } -- cgit v1.2.3-70-g09d2 From 522a110b42b306d696cf84e34c677ed0e7080194 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 21 Nov 2008 11:00:18 +0800 Subject: function tracing: fix wrong position computing of stack_trace Impact: make output of stack_trace complete if buffer overruns When read buffer overruns, the output of stack_trace isn't complete. When printing records with seq_printf in t_show, if the read buffer has overruned by the current record, then this record won't be printed to user space through read buffer, it will just be dropped in this printing. When next printing, t_start should return the "*pos"th record, which is the one dropped by previous printing, but it just returns (m->private + *pos)th record. Here we use a more sane method to implement seq_operations which can be found in kernel code. Thus we needn't initialize m->private. About testing, it's not easy to overrun read buffer, but we can use seq_printf to print more padding bytes in t_show, then it's easy to check whether or not records are lost. This commit has been tested on both condition of overrun and non overrun. Signed-off-by: Liming Wang Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index be682b62fe5..3bdb44bde4b 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -184,11 +184,16 @@ static struct file_operations stack_max_size_fops = { static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - long i = (long)m->private; + long i; (*pos)++; - i++; + if (v == SEQ_START_TOKEN) + i = 0; + else { + i = *(long *)v; + i++; + } if (i >= max_stack_trace.nr_entries || stack_dump_trace[i] == ULONG_MAX) @@ -201,12 +206,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - void *t = &m->private; + void *t = SEQ_START_TOKEN; loff_t l = 0; local_irq_disable(); __raw_spin_lock(&max_stack_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + for (; t && l < *pos; t = t_next(m, t, &l)) ; @@ -235,10 +243,10 @@ static int trace_lookup_stack(struct seq_file *m, long i) static int t_show(struct seq_file *m, void *v) { - long i = *(long *)v; + long i; int size; - if (i < 0) { + if (v == SEQ_START_TOKEN) { seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", @@ -246,6 +254,8 @@ static int t_show(struct seq_file *m, void *v) return 0; } + i = *(long *)v; + if (i >= max_stack_trace.nr_entries || stack_dump_trace[i] == ULONG_MAX) return 0; @@ -275,10 +285,6 @@ static int stack_trace_open(struct inode *inode, struct file *file) int ret; ret = seq_open(file, &stack_trace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)-1; - } return ret; } -- cgit v1.2.3-70-g09d2 From b0788caf7af773b6c2374590dabd3a205f0918a8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 21 Nov 2008 15:57:32 +0800 Subject: lockdep: consistent alignement for lockdep info Impact: prettify /proc/lockdep_info Just feel odd that not all lines of lockdep info are aligned. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 06e157119d2..46a404173db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3276,10 +3276,10 @@ void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); -- cgit v1.2.3-70-g09d2 From 7ee1768ddb3075ae3a0801cc2d0ea4195530a7db Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 23 Nov 2008 21:24:30 +0200 Subject: x86, mmiotrace: fix buffer overrun detection Impact: fix mmiotrace overrun tracing When ftrace framework moved to use the ring buffer facility, the buffer overrun detection was broken after 2.6.27 by commit | commit 3928a8a2d98081d1bc3c0a84a2d70e29b90ecf1c | Author: Steven Rostedt | Date: Mon Sep 29 23:02:41 2008 -0400 | | ftrace: make work with new ring buffer | | This patch ports ftrace over to the new ring buffer. The detection is now fixed by using the ring buffer API. When mmiotrace detects a buffer overrun, it will report the number of lost events. People reading an mmiotrace log must know if something was missed, otherwise the data may not make sense. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f28484618ff..e62cbf78eab 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -18,12 +18,14 @@ struct header_iter { static struct trace_array *mmio_trace_array; static bool overrun_detected; +static unsigned long prev_overruns; static void mmio_reset_data(struct trace_array *tr) { int cpu; overrun_detected = false; + prev_overruns = 0; tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) @@ -128,16 +130,12 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { - int cpu; unsigned long cnt = 0; -/* FIXME: */ -#if 0 - for_each_online_cpu(cpu) { - cnt += iter->overrun[cpu]; - iter->overrun[cpu] = 0; - } -#endif - (void)cpu; + unsigned long over = ring_buffer_overruns(iter->tr->buffer); + + if (over > prev_overruns) + cnt = over - prev_overruns; + prev_overruns = over; return cnt; } -- cgit v1.2.3-70-g09d2 From 4f5a7f40ddbae98569acbb99118a98570315579c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 27 Nov 2008 10:21:46 +0800 Subject: ftrace: prevent recursion Impact: prevent unnecessary stack recursion if the resched flag was set before we entered, then don't reschedule. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f780e9552f9..668bbb5ef2b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1215,7 +1215,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, out: if (resched) - preempt_enable_notrace(); + preempt_enable_no_resched_notrace(); else preempt_enable_notrace(); return NULL; -- cgit v1.2.3-70-g09d2 From 4cd4262034849da01eb88659af677b69f8169f06 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 21:04:24 -0500 Subject: sched: prevent divide by zero error in cpu_avg_load_per_task Impact: fix divide by zero crash in scheduler rebalance irq While testing the branch profiler, I hit this crash: divide error: 0000 [#1] PREEMPT SMP [...] RIP: 0010:[] [] cpu_avg_load_per_task+0x50/0x7f [...] Call Trace: <0> [] find_busiest_group+0x3e5/0xcaa [] rebalance_domains+0x2da/0xa21 [] ? find_next_bit+0x1b2/0x1e6 [] run_rebalance_domains+0x112/0x19f [] __do_softirq+0xa8/0x232 [] call_softirq+0x1c/0x3e [] do_softirq+0x94/0x1cd [] irq_exit+0x6b/0x10e [] smp_apic_timer_interrupt+0xd3/0xff [] apic_timer_interrupt+0x13/0x20 The code for cpu_avg_load_per_task has: if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; The runqueue lock is not held here, and there is nothing that prevents the rq->nr_running from going to zero after it passes the if condition. The branch profiler simply made the race window bigger. This patch saves off the rq->nr_running to a local variable and uses that for both the condition and the division. Signed-off-by: Steven Rostedt Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9b1e79371c2..700aa9a1413 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,9 +1453,10 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = rq->nr_running; - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; + if (nr_running) + rq->avg_load_per_task = rq->load.weight / nr_running; else rq->avg_load_per_task = 0; -- cgit v1.2.3-70-g09d2 From 1583715ddb61f822041807a0f18b3b4845e88c76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 10:27:49 +0100 Subject: sched, cpusets: fix warning in kernel/cpuset.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/cpuset.c: In function ‘generate_sched_domains’: kernel/cpuset.c:588: warning: ‘ndoms’ may be used uninitialized in this function triggers because GCC does not recognize that ndoms stays uninitialized only if doms is NULL - but that flow is covered at the end of generate_sched_domains(). Help out GCC by initializing this variable to 0. (that's prudent anyway) Also, this function needs a splitup and code flow simplification: with 160 lines length it's clearly too long. Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index da7ff6137f3..96c0ba13b8c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -585,7 +585,7 @@ static int generate_sched_domains(cpumask_t **domains, int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ - int ndoms; /* number of sched domains in result */ + int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ doms = NULL; -- cgit v1.2.3-70-g09d2 From af6d596fd603219b054c1c90fb16672a9fd441bd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Nov 2008 20:45:15 +0100 Subject: sched: prevent divide by zero error in cpu_avg_load_per_task, update Regarding the bug addressed in: 4cd4262: sched: prevent divide by zero error in cpu_avg_load_per_task Linus points out that the fix is not complete: > There's nothing that keeps gcc from deciding not to reload > rq->nr_running. > > Of course, in _practice_, I don't think gcc ever will (if it decides > that it will spill, gcc is likely going to decide that it will > literally spill the local variable to the stack rather than decide to > reload off the pointer), but it's a valid compiler optimization, and > it even has a name (rematerialization). > > So I suspect that your patch does fix the bug, but it still leaves the > fairly unlikely _potential_ for it to re-appear at some point. > > We have ACCESS_ONCE() as a macro to guarantee that the compiler > doesn't rematerialize a pointer access. That also would clarify > the fact that we access something unsafe outside a lock. So make sure our nr_running value is immutable and cannot change after we check it for nonzero. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 700aa9a1413..b7480fb5c3d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,7 +1453,7 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = rq->nr_running; + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); if (nr_running) rq->avg_load_per_task = rq->load.weight / nr_running; -- cgit v1.2.3-70-g09d2 From 8419641450edc838a6ce7cdf0f99d262bf0af2d5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:36:44 +0000 Subject: cpuinit fixes in kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- kernel/profile.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5a732c5ef08..8ea32e8d68b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,7 @@ out: * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -void notify_cpu_starting(unsigned int cpu) +void __cpuinit notify_cpu_starting(unsigned int cpu) { unsigned long val = CPU_STARTING; diff --git a/kernel/profile.c b/kernel/profile.c index 5b7d1ac7124..dc41827fbfe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -351,7 +351,7 @@ out: put_cpu(); } -static int __devinit profile_cpu_callback(struct notifier_block *info, +static int __cpuinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; @@ -596,7 +596,7 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -int create_proc_profile(void) +int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; -- cgit v1.2.3-70-g09d2 From 96b8936a9ed08746e47081458a5eb9e43a751e24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Nov 2008 08:10:03 +0100 Subject: remove __ARCH_WANT_COMPAT_SYS_PTRACE All architectures now use the generic compat_sys_ptrace, as should every new architecture that needs 32bit compat (if we'll ever get another). Remove the now superflous __ARCH_WANT_COMPAT_SYS_PTRACE define, and also kill a comment about __ARCH_SYS_PTRACE that was added after __ARCH_SYS_PTRACE was already gone. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- arch/Kconfig | 2 -- arch/ia64/include/asm/ptrace.h | 2 -- arch/mips/include/asm/ptrace.h | 4 ---- arch/parisc/include/asm/ptrace.h | 2 -- arch/powerpc/include/asm/ptrace.h | 2 -- arch/s390/include/asm/ptrace.h | 2 -- arch/sparc/include/asm/ptrace_64.h | 2 -- arch/x86/include/asm/ptrace.h | 2 -- include/linux/compat.h | 2 -- kernel/ptrace.c | 4 ++-- 10 files changed, 2 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 8977d99987c..471e72dbaf8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -79,8 +79,6 @@ config HAVE_KRETPROBES # task_pt_regs() in asm/processor.h or asm/ptrace.h # arch_has_single_step() if there is hardware single-step support # arch_has_block_step() if there is hardware block-step support -# arch_ptrace() and not #define __ARCH_SYS_PTRACE -# compat_arch_ptrace() and #define __ARCH_WANT_COMPAT_SYS_PTRACE # asm/syscall.h supplying asm-generic/syscall.h interface # linux/regset.h user_regset interfaces # CORE_DUMP_USE_REGSET #define'd in linux/elf.h diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index 6417c1ecb44..14055c636ad 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -325,8 +325,6 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) #define arch_has_block_step() (1) extern void user_enable_block_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #endif /* !__KERNEL__ */ /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */ diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 813abd16255..c2c8bac4330 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -9,10 +9,6 @@ #ifndef _ASM_PTRACE_H #define _ASM_PTRACE_H -#ifdef CONFIG_64BIT -#define __ARCH_WANT_COMPAT_SYS_PTRACE -#endif - /* 0 - 31 are integer registers, 32 - 63 are fp registers. */ #define FPR_BASE 32 #define PC 64 diff --git a/arch/parisc/include/asm/ptrace.h b/arch/parisc/include/asm/ptrace.h index afa5333187b..302f68dc889 100644 --- a/arch/parisc/include/asm/ptrace.h +++ b/arch/parisc/include/asm/ptrace.h @@ -47,8 +47,6 @@ struct pt_regs { #define task_regs(task) ((struct pt_regs *) ((char *)(task) + TASK_REGS)) -#define __ARCH_WANT_COMPAT_SYS_PTRACE - struct task_struct; #define arch_has_single_step() 1 void user_disable_single_step(struct task_struct *task); diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 280a90cc989..c9c678fb253 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -55,8 +55,6 @@ struct pt_regs { #ifdef __powerpc64__ -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ #define STACK_FRAME_REGS_MARKER ASM_CONST(0x7265677368657265) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index a7226f8143f..560ce8561df 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -486,8 +486,6 @@ struct task_struct; extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) diff --git a/arch/sparc/include/asm/ptrace_64.h b/arch/sparc/include/asm/ptrace_64.h index 3d3e9c161d8..84e969f06af 100644 --- a/arch/sparc/include/asm/ptrace_64.h +++ b/arch/sparc/include/asm/ptrace_64.h @@ -142,8 +142,6 @@ struct global_reg_snapshot { }; extern struct global_reg_snapshot global_reg_snapshot[NR_CPUS]; -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define force_successful_syscall_return() \ do { current_thread_info()->syscall_noerror = 1; \ } while (0) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d1531c8480b..eefb0594b05 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -271,8 +271,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/compat.h b/include/linux/compat.h index f061a1ea1b7..e88f3ecf38b 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -252,12 +252,10 @@ extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ /* * epoll (fs/eventpoll.c) compat bits follow ... diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1e68e4c39e2..4c8bcd7dd8e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 0 : -EIO; } -#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE +#if defined CONFIG_COMPAT #include int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ +#endif /* CONFIG_COMPAT */ -- cgit v1.2.3-70-g09d2 From 7ef9964e6d1b911b78709f144000aacadd0ebc21 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 1 Dec 2008 13:13:55 -0800 Subject: epoll: introduce resource usage limits It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: Cyrill Gorcunov Reported-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 27 ++++++++++++ fs/eventpoll.c | 85 ++++++++++++++++++++++++++++++++++---- include/linux/sched.h | 4 ++ kernel/sysctl.c | 10 +++++ 4 files changed, 118 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index bcceb99b81d..bb1b0dd3bfc 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -44,6 +44,7 @@ Table of Contents 2.14 /proc//io - Display the IO accounting fields 2.15 /proc//coredump_filter - Core dump filtering settings 2.16 /proc//mountinfo - Information about mounts + 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface ------------------------------------------------------------------------------ Preface @@ -2483,4 +2484,30 @@ For more information on mount propagation see: Documentation/filesystems/sharedsubtree.txt +2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface +-------------------------------------------------------- + +This directory contains configuration options for the epoll(7) interface. + +max_user_instances +------------------ + +This is the maximum number of epoll file descriptors that a single user can +have open at a given time. The default value is 128, and should be enough +for normal users. + +max_user_watches +---------------- + +Every epoll file descriptor can store a number of files to be monitored +for event readiness. Each one of these monitored files constitutes a "watch". +This configuration option sets the maximum number of "watches" that are +allowed for each user. +Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes +on a 64bit one. +The current default value for max_user_watches is the 1/32 of the available +low memory, divided for the "watch" cost in bytes. + + ------------------------------------------------------------------------------ + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aec5c13f634..96355d50534 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -102,6 +102,8 @@ #define EP_UNACTIVE_PTR ((void *) -1L) +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + struct epoll_filefd { struct file *file; int fd; @@ -200,6 +202,9 @@ struct eventpoll { * holding ->lock. */ struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; }; /* Wait structure used by the poll hooks */ @@ -226,10 +231,18 @@ struct ep_pqueue { struct epitem *epi; }; +/* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll devices, per user */ +static int max_user_instances __read_mostly; +/* Maximum number of epoll watched descriptors, per user */ +static int max_user_watches __read_mostly; + /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ -static struct mutex epmutex; +static DEFINE_MUTEX(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table epoll_table[] = { + { + .procname = "max_user_instances", + .data = &max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); + atomic_dec(&ep->user->epoll_watches); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", current, ep, file)); @@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); + atomic_dec(&ep->user->epoll_devs); + free_uid(ep->user); kfree(ep); } @@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file) static int ep_alloc(struct eventpoll **pep) { - struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); + int error; + struct user_struct *user; + struct eventpoll *ep; - if (!ep) - return -ENOMEM; + user = get_current_user(); + error = -EMFILE; + if (unlikely(atomic_read(&user->epoll_devs) >= + max_user_instances)) + goto free_uid; + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; spin_lock_init(&ep->lock); mutex_init(&ep->mtx); @@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT; ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; *pep = ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", current, ep)); return 0; + +free_uid: + free_uid(user); + return error; } /* @@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct epitem *epi; struct ep_pqueue epq; - error = -ENOMEM; + if (unlikely(atomic_read(&ep->user->epoll_watches) >= + max_user_watches)) + return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) - goto error_return; + return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); @@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ + error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; @@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); + atomic_inc(&ep->user->epoll_watches); + /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); @@ -789,7 +852,7 @@ error_unregister: spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); -error_return: + return error; } @@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); + atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, static int __init eventpoll_init(void) { - mutex_init(&epmutex); + struct sysinfo si; + + si_meminfo(&si); + max_user_instances = 128; + max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ ep_poll_safewake_init(&psw); diff --git a/include/linux/sched.h b/include/linux/sched.h index 644ffbda17c..55e30d11447 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -630,6 +630,10 @@ struct user_struct { atomic_t inotify_watches; /* How many inotify watches does this user have? */ atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif +#ifdef CONFIG_EPOLL + atomic_t epoll_devs; /* The number of epoll descriptors currently open */ + atomic_t epoll_watches; /* The number of file descriptors currently watched */ +#endif #ifdef CONFIG_POSIX_MQUEUE /* protected by mq_lock */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d90..3d56fe7570d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -176,6 +176,9 @@ extern struct ctl_table random_table[]; #ifdef CONFIG_INOTIFY_USER extern struct ctl_table inotify_table[]; #endif +#ifdef CONFIG_EPOLL +extern struct ctl_table epoll_table[]; +#endif #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; @@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = { .child = inotify_table, }, #endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif #endif { .ctl_name = KERN_SETUID_DUMPABLE, -- cgit v1.2.3-70-g09d2 From a8005992836434cab6182c6147993d21442184c1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Dec 2008 13:14:00 -0800 Subject: taint: add missing comment The description for 'D' was missing in the comment... (causing me a minute of WTF followed by looking at more of the code) Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6513aac8e99..4d5088355bf 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -167,6 +167,7 @@ static const struct tnt tnts[] = { * 'M' - System experienced a machine check exception. * 'B' - System has hit bad_page. * 'U' - Userspace-defined naughtiness. + * 'D' - Kernel has oopsed before * 'A' - ACPI table overridden. * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. -- cgit v1.2.3-70-g09d2