diff options
Diffstat (limited to 'kernel')
64 files changed, 3336 insertions, 1678 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 248e1c396f8..4af15802ccd 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -7,7 +7,7 @@ choice default HZ_250 help Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 HZ but 100 HZ may be more + to have the timer interrupt run at 1000 Hz but 100 Hz may be more beneficial for servers and NUMA systems that do not need to have a fast response for user interaction and that may experience bus contention and cacheline bounces as a result of timer interrupts. @@ -19,21 +19,30 @@ choice config HZ_100 bool "100 HZ" help - 100 HZ is a typical choice for servers, SMP and NUMA systems + 100 Hz is a typical choice for servers, SMP and NUMA systems with lots of processors that may show reduced performance if too many timer interrupts are occurring. config HZ_250 bool "250 HZ" help - 250 HZ is a good compromise choice allowing server performance + 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even - on SMP and NUMA systems. + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + + config HZ_300 + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. config HZ_1000 bool "1000 HZ" help - 1000 HZ is the preferred choice for desktop systems and other + 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. endchoice @@ -42,5 +51,6 @@ config HZ int default 100 if HZ_100 default 250 if HZ_250 + default 300 if HZ_300 default 1000 if HZ_1000 diff --git a/kernel/acct.c b/kernel/acct.c index 0aad5ca36a8..70d0d88e555 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -89,7 +89,8 @@ struct acct_glbs { struct timer_list timer; }; -static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; +static struct acct_glbs acct_globals __cacheline_aligned = + {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; /* * Called whenever the timer says to check the free space. @@ -117,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry, &sbuf)) + if (vfs_statfs(file->f_path.dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file) add_timer(&acct_globals.timer); } if (old_acct) { - mnt_unpin(old_acct->f_vfsmnt); + mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); do_acct_process(old_acct); filp_close(old_acct, NULL); @@ -211,7 +212,7 @@ static int acct_on(char *name) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -228,11 +229,11 @@ static int acct_on(char *name) } spin_lock(&acct_globals.lock); - mnt_pin(file->f_vfsmnt); + mnt_pin(file->f_path.mnt); acct_file_reopen(file); spin_unlock(&acct_globals.lock); - mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ + mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ return 0; } @@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) void acct_auto_close_mnt(struct vfsmount *m) { spin_lock(&acct_globals.lock); - if (acct_globals.file && acct_globals.file->f_vfsmnt == m) + if (acct_globals.file && acct_globals.file->f_path.mnt == m) acct_file_reopen(NULL); spin_unlock(&acct_globals.lock); } @@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb) { spin_lock(&acct_globals.lock); if (acct_globals.file && - acct_globals.file->f_vfsmnt->mnt_sb == sb) { + acct_globals.file->f_path.mnt->mnt_sb == sb) { acct_file_reopen(NULL); } spin_unlock(&acct_globals.lock); @@ -427,6 +428,7 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; /* * First check to see if there is enough free_space to continue @@ -483,16 +485,9 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - mutex_lock(&tty_mutex); - /* FIXME: Whoever is responsible for current->signal locking needs - to use the same locking all over the kernel and document it */ - read_lock(&tasklist_lock); - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); - mutex_unlock(&tty_mutex); - spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; + ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; diff --git a/kernel/audit.c b/kernel/audit.c index 98106f6078b..d9b690ac684 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -57,6 +57,7 @@ #include <linux/netlink.h> #include <linux/selinux.h> #include <linux/inotify.h> +#include <linux/freezer.h> #include "audit.h" diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4f40d923af8..2e896f8ae29 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) struct audit_rule *rule; int i; - rule = kmalloc(sizeof(*rule), GFP_KERNEL); + rule = kzalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) return NULL; - memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; rule->action = krule->action; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 42f2f117971..298897559ca 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@ #include <linux/tty.h> #include <linux/selinux.h> #include <linux/binfmts.h> +#include <linux/highmem.h> #include <linux/syscalls.h> #include "audit.h" @@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_context(struct audit_buffer *ab) +void audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; ssize_t len = 0; @@ -759,6 +760,8 @@ error_path: return; } +EXPORT_SYMBOL(audit_log_task_context); + static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { char name[sizeof(tsk->comm)]; @@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); + vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); break; } vma = vma->vm_next; @@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->return_code); mutex_lock(&tty_mutex); + read_lock(&tasklist_lock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; + read_unlock(&tasklist_lock); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" @@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } +EXPORT_SYMBOL(audit_get_loginuid); + /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag diff --git a/kernel/compat.c b/kernel/compat.c index d4898aad6cf..6952dd05730 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } + +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, + const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return sys_migrate_pages(pid, nr_bits + 1, old, new); +} #endif diff --git a/kernel/configs.c b/kernel/configs.c index f9e31974f4a..8fa1fb28f8a 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf, return count; } -static struct file_operations ikconfig_file_ops = { +static const struct file_operations ikconfig_file_ops = { .owner = THIS_MODULE, .read = ikconfig_read_current, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 663c920b223..9124669f458 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void) recursive_depth--; return; } - mutex_unlock(&cpu_bitmask_lock); recursive = NULL; + mutex_unlock(&cpu_bitmask_lock); } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); @@ -270,11 +270,7 @@ int disable_nonboot_cpus(void) goto out; } } - error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); - if (error) { - printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); - goto out; - } + /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6313c38c930..232aed2b10f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { * * * When reading/writing to a file: - * - the cpuset to use in file->f_dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_dentry->d_fsdata + * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { @@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* Remaining checks don't apply to root cpuset */ - if ((par = cur->parent) == NULL) + if (cur == &top_cpuset) return 0; + par = cur->parent; + /* We must be a subset of our parent cpuset */ if (!is_cpuset_subset(trial, par)) return -EACCES; @@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); mutex_lock(&callback_mutex); - if (turning_on) - set_bit(bit, &cs->flags); - else - clear_bit(bit, &cs->flags); + cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); if (cpu_exclusive_changed) @@ -1281,18 +1280,19 @@ typedef enum { FILE_TASKLIST, } cpuset_filetype_t; -static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, +static ssize_t cpuset_common_file_write(struct file *file, + const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); - struct cftype *cft = __d_cft(file->f_dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); cpuset_filetype_t type = cft->private; char *buffer; char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ - if (nbytes > 100 + 6 * NR_CPUS) + if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) return -E2BIG; /* +1 for nul-terminator */ @@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct cftype *cft = __d_cft(file->f_dentry); - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) if (err) return err; - cft = __d_cft(file->f_dentry); + cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; if (cft->open) @@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) static int cpuset_file_release(struct inode *inode, struct file *file) { - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (cft->release) return cft->release(inode, file); return 0; @@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cpuset_file_operations = { +static const struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, .llseek = generic_file_llseek, @@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); struct ctr_struct *ctr; pid_t *pidarray; int npids; @@ -2045,7 +2045,6 @@ out: return err; } -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void) mutex_unlock(&callback_mutex); mutex_unlock(&manage_mutex); } -#endif -#ifdef CONFIG_HOTPLUG_CPU /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent @@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, common_cpu_mem_hotplug_unplug(); return 0; } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* @@ -2346,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) } /** - * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? * @z: is this zone on an allowed node? - * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) + * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If zone + * If we're in interrupt, yes, we can always allocate. If + * __GFP_THISNODE is set, yes, we can always allocate. If zone * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest * mem_exclusive cpuset ancestor to this tasks cpuset, yes. * Otherwise, no. * + * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() + * reduces to cpuset_zone_allowed_hardwall(). Otherwise, + * cpuset_zone_allowed_softwall() might sleep, and might allow a zone + * from an enclosing cpuset. + * + * cpuset_zone_allowed_hardwall() only handles the simpler case of + * hardwall cpusets, and never sleeps. + * + * The __GFP_THISNODE placement logic is really handled elsewhere, + * by forcibly using a zonelist starting at a specified node, and by + * (in get_page_from_freelist()) refusing to consider the zones for + * any node on the zonelist except the first. By the time any such + * calls get to this routine, we should just shut up and say 'yes'. + * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset. * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest mem_exclusive ancestor cpuset. + * nearest enclosing mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() - * routine only calls here with __GFP_HARDWALL bit _not_ set if - * it's a GFP_KERNEL allocation, and all nodes in the current tasks - * mems_allowed came up empty on the first pass over the |