diff options
-rw-r--r-- | Documentation/cputopology.txt | 23 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 4 | ||||
-rw-r--r-- | arch/ia64/include/asm/system.h | 4 | ||||
-rw-r--r-- | arch/powerpc/include/asm/system.h | 4 | ||||
-rw-r--r-- | arch/s390/Kconfig | 7 | ||||
-rw-r--r-- | arch/s390/include/asm/system.h | 1 | ||||
-rw-r--r-- | arch/s390/include/asm/topology.h | 27 | ||||
-rw-r--r-- | arch/s390/kernel/topology.c | 150 | ||||
-rw-r--r-- | arch/x86/Kconfig | 11 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 8 | ||||
-rw-r--r-- | drivers/base/topology.c | 16 | ||||
-rw-r--r-- | include/linux/hardirq.h | 9 | ||||
-rw-r--r-- | include/linux/sched.h | 23 | ||||
-rw-r--r-- | include/linux/topology.h | 6 | ||||
-rw-r--r-- | include/trace/events/sched.h | 29 | ||||
-rw-r--r-- | kernel/sched.c | 291 | ||||
-rw-r--r-- | kernel/sched_fair.c | 76 | ||||
-rw-r--r-- | kernel/sched_features.h | 5 | ||||
-rw-r--r-- | kernel/sched_rt.c | 40 | ||||
-rw-r--r-- | kernel/sched_stoptask.c | 108 | ||||
-rw-r--r-- | kernel/softirq.c | 64 | ||||
-rw-r--r-- | kernel/stop_machine.c | 8 | ||||
-rw-r--r-- | net/sched/cls_cgroup.c | 2 |
23 files changed, 732 insertions, 184 deletions
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index f1c5c4bccd3..902d3151f52 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -14,25 +14,39 @@ to /proc/cpuinfo. identifier (rather than the kernel's). The actual value is architecture and platform dependent. -3) /sys/devices/system/cpu/cpuX/topology/thread_siblings: +3) /sys/devices/system/cpu/cpuX/topology/book_id: + + the book ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +4) /sys/devices/system/cpu/cpuX/topology/thread_siblings: internel kernel map of cpuX's hardware threads within the same core as cpuX -4) /sys/devices/system/cpu/cpuX/topology/core_siblings: +5) /sys/devices/system/cpu/cpuX/topology/core_siblings: internal kernel map of cpuX's hardware threads within the same physical_package_id. +6) /sys/devices/system/cpu/cpuX/topology/book_siblings: + + internal kernel map of cpuX's hardware threads within the same + book_id. + To implement it in an architecture-neutral way, a new source file, -drivers/base/topology.c, is to export the 4 attributes. +drivers/base/topology.c, is to export the 4 or 6 attributes. The two book +related sysfs files will only be created if CONFIG_SCHED_BOOK is selected. For an architecture to support this feature, it must define some of these macros in include/asm-XXX/topology.h: #define topology_physical_package_id(cpu) #define topology_core_id(cpu) +#define topology_book_id(cpu) #define topology_thread_cpumask(cpu) #define topology_core_cpumask(cpu) +#define topology_book_cpumask(cpu) The type of **_id is int. The type of siblings is (const) struct cpumask *. @@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h: 3) thread_siblings: just the given CPU 4) core_siblings: just the given CPU +For architectures that don't support books (CONFIG_SCHED_BOOK) there are no +default definitions for topology_book_id() and topology_book_cpumask(). + Additionally, CPU topology information is provided under /sys/devices/system/cpu and includes these files. The internal source for the output is in brackets ("[]"). diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8dd7248508a..ed05a4a0d24 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2435,6 +2435,10 @@ and is between 256 and 4096 characters. It is defined in the file disables clocksource verification at runtime. Used to enable high-resolution timer mode on older hardware, and in virtualized environment. + [x86] noirqtime: Do not use TSC to do irq accounting. + Used to run time disable IRQ_TIME_ACCOUNTING on any + platforms where RDTSC is slow and this accounting + can add overhead. turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h index 9f342a574ce..dd028f2b13b 100644 --- a/arch/ia64/include/asm/system.h +++ b/arch/ia64/include/asm/system.h @@ -272,10 +272,6 @@ void cpu_idle_wait(void); void default_idle(void); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void account_system_vtime(struct task_struct *); -#endif - #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h index 6c294acac84..9c3d160670b 100644 --- a/arch/powerpc/include/asm/system.h +++ b/arch/powerpc/include/asm/system.h @@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long); #define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x))) -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void account_system_vtime(struct task_struct *); -#endif - extern struct dentry *powerpc_debugfs_root; #endif /* __KERNEL__ */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 958f0dadead..75976a14194 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -199,6 +199,13 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config SCHED_BOOK + bool "Book scheduler support" + depends on SMP + help + Book scheduler support improves the CPU scheduler's decision making + when dealing with machines that have several books. + config MATHEMU bool "IEEE FPU emulation" depends on MARCH_G5 diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h index cef66210c84..38ddd8a9a9e 100644 --- a/arch/s390/include/asm/system.h +++ b/arch/s390/include/asm/system.h @@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs) extern void account_vtime(struct task_struct *, struct task_struct *); extern void account_tick_vtime(struct task_struct *); -extern void account_system_vtime(struct task_struct *); #ifdef CONFIG_PFAULT extern void pfault_irq_init(void); diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 831bd033ea7..051107a2c5e 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -3,15 +3,32 @@ #include <linux/cpumask.h> -#define mc_capable() (1) - -const struct cpumask *cpu_coregroup_mask(unsigned int cpu); - extern unsigned char cpu_core_id[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; +static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu) +{ + return &cpu_core_map[cpu]; +} + #define topology_core_id(cpu) (cpu_core_id[cpu]) #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) +#define mc_capable() (1) + +#ifdef CONFIG_SCHED_BOOK + +extern unsigned char cpu_book_id[NR_CPUS]; +extern cpumask_t cpu_book_map[NR_CPUS]; + +static inline const struct cpumask *cpu_book_mask(unsigned int cpu) +{ + return &cpu_book_map[cpu]; +} + +#define topology_book_id(cpu) (cpu_book_id[cpu]) +#define topology_book_cpumask(cpu) (&cpu_book_map[cpu]) + +#endif /* CONFIG_SCHED_BOOK */ int topology_set_cpu_management(int fc); void topology_schedule_update(void); @@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void) }; #endif +#define SD_BOOK_INIT SD_CPU_INIT + #include <asm-generic/topology.h> #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index bcef00766a6..13559c99384 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -57,8 +57,8 @@ struct tl_info { union tl_entry tle[0]; }; -struct core_info { - struct core_info *next; +struct mask_info { + struct mask_info *next; unsigned char id; cpumask_t mask; }; @@ -66,7 +66,6 @@ struct core_info { static int topology_enabled; static void topology_work_fn(struct work_struct *work); static struct tl_info *tl_info; -static struct core_info core_info; static int machine_has_topology; static struct timer_list topology_timer; static void set_topology_timer(void); @@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn); /* topology_lock protects the core linked list */ static DEFINE_SPINLOCK(topology_lock); +static struct mask_info core_info; cpumask_t cpu_core_map[NR_CPUS]; unsigned char cpu_core_id[NR_CPUS]; -static cpumask_t cpu_coregroup_map(unsigned int cpu) +#ifdef CONFIG_SCHED_BOOK +static struct mask_info book_info; +cpumask_t cpu_book_map[NR_CPUS]; +unsigned char cpu_book_id[NR_CPUS]; +#endif + +static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) { - struct core_info *core = &core_info; - unsigned long flags; cpumask_t mask; cpus_clear(mask); if (!topology_enabled || !machine_has_topology) return cpu_possible_map; - spin_lock_irqsave(&topology_lock, flags); - while (core) { - if (cpu_isset(cpu, core->mask)) { - mask = core->mask; + while (info) { + if (cpu_isset(cpu, info->mask)) { + mask = info->mask; break; } - core = core->next; + info = info->next; } - spin_unlock_irqrestore(&topology_lock, flags); if (cpus_empty(mask)) mask = cpumask_of_cpu(cpu); return mask; } -const struct cpumask *cpu_coregroup_mask(unsigned int cpu) -{ - return &cpu_core_map[cpu]; -} - -static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core) +static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book, + struct mask_info *core) { unsigned int cpu; @@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core) rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin; for_each_present_cpu(lcpu) { - if (cpu_logical_map(lcpu) == rcpu) { - cpu_set(lcpu, core->mask); - cpu_core_id[lcpu] = core->id; - smp_cpu_polarization[lcpu] = tl_cpu->pp; - } + if (cpu_logical_map(lcpu) != rcpu) + continue; +#ifdef CONFIG_SCHED_BOOK + cpu_set(lcpu, book->mask); + cpu_book_id[lcpu] = book->id; +#endif + cpu_set(lcpu, core->mask); + cpu_core_id[lcpu] = core->id; + smp_cpu_polarization[lcpu] = tl_cpu->pp; } } } -static void clear_cores(void) +static void clear_masks(void) { - struct core_info *core = &core_info; + struct mask_info *info; - while (core) { - cpus_clear(core->mask); - core = core->next; + info = &core_info; + while (info) { + cpus_clear(info->mask); + info = info->next; + } +#ifdef CONFIG_SCHED_BOOK + info = &book_info; + while (info) { + cpus_clear(info->mask); + info = info->next; } +#endif } static union tl_entry *next_tle(union tl_entry *tle) @@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle) static void tl_to_cores(struct tl_info *info) { +#ifdef CONFIG_SCHED_BOOK + struct mask_info *book = &book_info; +#else + struct mask_info *book = NULL; +#endif + struct mask_info *core = &core_info; union tl_entry *tle, *end; - struct core_info *core = &core_info; + spin_lock_irq(&topology_lock); - clear_cores(); + clear_masks(); tle = info->tle; end = (union tl_entry *)((unsigned long)info + info->length); while (tle < end) { switch (tle->nl) { - case 5: - case 4: - case 3: +#ifdef CONFIG_SCHED_BOOK case 2: + book = book->next; + book->id = tle->container.id; break; +#endif case 1: core = core->next; core->id = tle->container.id; break; case 0: - add_cpus_to_core(&tle->cpu, core); + add_cpus_to_mask(&tle->cpu, book, core); break; default: - clear_cores(); + clear_masks(); machine_has_topology = 0; goto out; } @@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc) static void update_cpu_core_map(void) { + unsigned long flags; int cpu; - for_each_possible_cpu(cpu) - cpu_core_map[cpu] = cpu_coregroup_map(cpu); + spin_lock_irqsave(&topology_lock, flags); + for_each_possible_cpu(cpu) { + cpu_core_map[cpu] = cpu_group_map(&core_info, cpu); +#ifdef CONFIG_SCHED_BOOK + cpu_book_map[cpu] = cpu_group_map(&book_info, cpu); +#endif + } + spin_unlock_irqrestore(&topology_lock, flags); +} + +static void store_topology(struct tl_info *info) +{ +#ifdef CONFIG_SCHED_BOOK + int rc; + + rc = stsi(info, 15, 1, 3); + if (rc != -ENOSYS) + return; +#endif + stsi(info, 15, 1, 2); } int arch_update_cpu_topology(void) @@ -238,7 +274,7 @@ int arch_update_cpu_topology(void) topology_update_polarization_simple(); return 0; } - stsi(info, 15, 1, 2); + store_topology(info); tl_to_cores(info); update_cpu_core_map(); for_each_online_cpu(cpu) { @@ -299,12 +335,24 @@ out: } __initcall(init_topology_update); +static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset) +{ + int i, nr_masks; + + nr_masks = info->mag[NR_MAG - offset]; + for (i = 0; i < info->mnest - offset; i++) + nr_masks *= info->mag[NR_MAG - offset - 1 - i]; + nr_masks = max(nr_masks, 1); + for (i = 0; i < nr_masks; i++) { + mask->next = alloc_bootmem(sizeof(struct mask_info)); + mask = mask->next; + } +} + void __init s390_init_cpu_topology(void) { unsigned long long facility_bits; struct tl_info *info; - struct core_info *core; - int nr_cores; int i; if (stfle(&facility_bits, 1) <= 0) @@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void) tl_info = alloc_bootmem_pages(PAGE_SIZE); info = tl_info; - stsi(info, 15, 1, 2); - - nr_cores = info->mag[NR_MAG - 2]; - for (i = 0; i < info->mnest - 2; i++) - nr_cores *= info->mag[NR_MAG - 3 - i]; - + store_topology(info); pr_info("The CPU configuration topology of the machine is:"); for (i = 0; i < NR_MAG; i++) printk(" %d", info->mag[i]); printk(" / %d\n", info->mnest); - - core = &core_info; - for (i = 0; i < nr_cores; i++) { - core->next = alloc_bootmem(sizeof(struct core_info)); - core = core->next; - if (!core) - goto error; - } - return; -error: - machine_has_topology = 0; + alloc_masks(info, &core_info, 2); +#ifdef CONFIG_SCHED_BOOK + alloc_masks(info, &book_info, 3); +#endif } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd227d6b8d9..89b88e3a56e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -799,6 +799,17 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + default n + ---help--- + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 26a863a9c2a..a1c2cd76853 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,10 +104,14 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int no_sched_irq_time; + static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; return 1; } @@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason) if (!tsc_unstable) { tsc_unstable = 1; sched_clock_stable = 0; + disable_sched_clock_irqtime(); printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -987,6 +992,9 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); lpj_fine = lpj; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 9fc630ce1dd..f6f37a05a0c 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \ return sprintf(buf, "%d\n", topology_##name(cpu)); \ } -#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) +#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \ + defined(topology_book_cpumask) static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf) { ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; @@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask); define_one_ro_named(core_siblings, show_core_cpumask); define_one_ro_named(core_siblings_list, show_core_cpumask_list); +#ifdef CONFIG_SCHED_BOOK +define_id_show_func(book_id); +define_one_ro(book_id); +define_siblings_show_func(book_cpumask); +define_one_ro_named(book_siblings, show_book_cpumask); +define_one_ro_named(book_siblings_list, show_book_cpumask_list); +#endif + static struct attribute *default_attrs[] = { &attr_physical_package_id.attr, &attr_core_id.attr, @@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = { &attr_thread_siblings_list.attr, &attr_core_siblings.attr, &attr_core_siblings_list.attr, +#ifdef CONFIG_SCHED_BOOK + &attr_book_id.attr, + &attr_book_siblings.attr, + &attr_book_siblings_list.attr, +#endif NULL }; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 1f4517d55b1..96c323ac44d 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -64,6 +64,8 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + #ifndef PREEMPT_ACTIVE #define PREEMPT_ACTIVE_BITS 1 #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) @@ -82,10 +84,13 @@ /* * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) /* * Are we in NMI context? @@ -132,10 +137,12 @@ extern void synchronize_irq(unsigned int irq); struct task_struct; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) static inline void account_system_vtime(struct task_struct *tsk) { } +#else +extern void account_system_vtime(struct task_struct *tsk); #endif #if defined(CONFIG_NO_HZ) diff --git a/include/linux/sched.h b/include/linux/sched.h index 61b4ecf1da5..0383601a927 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -875,6 +875,7 @@ enum sched_domain_level { SD_LV_NONE = 0, SD_LV_SIBLING, SD_LV_MC, + SD_LV_BOOK, SD_LV_CPU, SD_LV_NODE, SD_LV_ALLNODES, @@ -1690,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ - /* Not implemented yet, only for 486*/ +#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ @@ -1837,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * An i/f to runtime opt-in for irq time accounting based off of sched_clock. + * The reason for this explicit opt-in is not to have perf penalty with + * slow sched_clocks. + */ +extern void enable_sched_clock_irqtime(void); +extern void disable_sched_clock_irqtime(void); +#else +static inline void enable_sched_clock_irqtime(void) {} +static inline void disable_sched_clock_irqtime(void) {} +#endif + extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); @@ -2378,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_softirq(void); -#define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ - __cond_resched_softirq(); \ +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + __cond_resched_softirq(); \ }) /* diff --git a/include/linux/topology.h b/include/linux/topology.h index 64e084ff5e5..b91a40e847d 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -201,6 +201,12 @@ int arch_update_cpu_topology(void); .balance_interval = 64, \ } +#ifdef CONFIG_SCHED_BOOK +#ifndef SD_BOOK_INIT +#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! +#endif +#endif /* CONFIG_SCHED_BOOK */ + #ifdef CONFIG_NUMA #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9208c92aeab..f6334782a59 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); +/* + * Tracepoint for showing priority inheritance modifying a tasks + * priority. + */ +TRACE_EVENT(sched_pi_setprio, + + TP_PROTO(struct task_struct *tsk, int newprio), + + TP_ARGS(tsk, newprio), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, oldprio ) + __field( int, newprio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->oldprio = tsk->prio; + __entry->newprio = newprio; + ), + + TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", + __entry->comm, __entry->pid, + __entry->oldprio, __entry->newprio) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched.c b/kernel/sched.c index 5a5cc33e499..d42992bccdf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -426,9 +426,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -437,7 +435,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. @@ -488,11 +486,12 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -520,6 +519,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ +static u64 irq_time_cpu(int cpu); +static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); + inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) { + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; + + sched_irq_time_avg_update(rq, irq_time); + } } /* @@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else @@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ @@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +{ + if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { + u64 delta_irq = curr_irq_time - rq->prev_irq_time; + rq->prev_irq_time = curr_irq_time; + sched_rt_avg_update(rq, delta_irq); + } +} + +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } + +#endif + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sc |