diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'kernel')
67 files changed, 40718 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile new file mode 100644 index 00000000000..eb88b446c2c --- /dev/null +++ b/kernel/Makefile @@ -0,0 +1,53 @@ +# +# Makefile for the linux kernel. +# + +obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ + exit.o itimer.o time.o softirq.o resource.o \ + sysctl.o capability.o ptrace.o timer.o user.o \ + signal.o sys.o kmod.o workqueue.o pid.o \ + rcupdate.o intermodule.o extable.o params.o posix-timers.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + +obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o +obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_PM) += power/ +obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKCONFIG_PROC) += configs.o +obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_SECCOMP) += seccomp.o + +ifneq ($(CONFIG_IA64),y) +# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is +# needed for x86 only. Why this used to be enabled for all architectures is beyond +# me. I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer +endif + +$(obj)/configs.o: $(obj)/config_data.h + +# config_data.h contains the same information as ikconfig.h but gzipped. +# Info from config_data can be extracted from /proc/config* +targets += config_data.gz +$(obj)/config_data.gz: .config FORCE + $(call if_changed,gzip) + +quiet_cmd_ikconfiggz = IKCFG $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ +targets += config_data.h +$(obj)/config_data.h: $(obj)/config_data.gz FORCE + $(call if_changed,ikconfiggz) diff --git a/kernel/acct.c b/kernel/acct.c new file mode 100644 index 00000000000..4168f631868 --- /dev/null +++ b/kernel/acct.c @@ -0,0 +1,561 @@ +/* + * linux/kernel/acct.c + * + * BSD Process Accounting for Linux + * + * Author: Marco van Wieringen <mvw@planets.elm.net> + * + * Some code based on ideas and code from: + * Thomas K. Dyas <tdyas@eden.rutgers.edu> + * + * This file implements BSD-style process accounting. Whenever any + * process exits, an accounting record of type "struct acct" is + * written to the file specified with the acct() system call. It is + * up to user-level programs to do useful things with the accounting + * log. The kernel just provides the raw accounting information. + * + * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. + * + * Plugged two leaks. 1) It didn't return acct_file into the free_filps if + * the file happened to be read-only. 2) If the accounting was suspended + * due to the lack of space it happily allowed to reopen it and completely + * lost the old acct_file. 3/10/98, Al Viro. + * + * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). + * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. + * + * Fixed a nasty interaction with with sys_umount(). If the accointing + * was suspeneded we failed to stop it on umount(). Messy. + * Another one: remount to readonly didn't stop accounting. + * Question: what should we do if we have CAP_SYS_ADMIN but not + * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY + * unless we are messing with the root. In that case we are getting a + * real mess with do_remount_sb(). 9/11/98, AV. + * + * Fixed a bunch of races (and pair of leaks). Probably not the best way, + * but this one obviously doesn't introduce deadlocks. Later. BTW, found + * one race (and leak) in BSD implementation. + * OK, that's better. ANOTHER race and leak in BSD variant. There always + * is one more bug... 10/11/98, AV. + * + * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold + * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks + * a struct file opened for write. Fixed. 2/6/2000, AV. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/acct.h> +#include <linux/file.h> +#include <linux/tty.h> +#include <linux/security.h> +#include <linux/vfs.h> +#include <linux/jiffies.h> +#include <linux/times.h> +#include <linux/syscalls.h> +#include <asm/uaccess.h> +#include <asm/div64.h> +#include <linux/blkdev.h> /* sector_div */ + +/* + * These constants control the amount of freespace that suspend and + * resume the process accounting system, and the time delay between + * each check. + * Turned into sysctl-controllable parameters. AV, 12/11/98 + */ + +int acct_parm[3] = {4, 2, 30}; +#define RESUME (acct_parm[0]) /* >foo% free space - resume */ +#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */ +#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ + +/* + * External references and all of the globals. + */ +static void do_acct_process(long, struct file *); + +/* + * This structure is used so that all the data protected by lock + * can be placed in the same cache line as the lock. This primes + * the cache line to have the data after getting the lock. + */ +struct acct_glbs { + spinlock_t lock; + volatile int active; + volatile int needcheck; + struct file *file; + struct timer_list timer; +}; + +static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; + +/* + * Called whenever the timer says to check the free space. + */ +static void acct_timeout(unsigned long unused) +{ + acct_globals.needcheck = 1; +} + +/* + * Check the amount of free space and suspend/resume accordingly. + */ +static int check_free_space(struct file *file) +{ + struct kstatfs sbuf; + int res; + int act; + sector_t resume; + sector_t suspend; + + spin_lock(&acct_globals.lock); + res = acct_globals.active; + if (!file || !acct_globals.needcheck) + goto out; + spin_unlock(&acct_globals.lock); + + /* May block */ + if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + return res; + suspend = sbuf.f_blocks * SUSPEND; + resume = sbuf.f_blocks * RESUME; + + sector_div(suspend, 100); + sector_div(resume, 100); + + if (sbuf.f_bavail <= suspend) + act = -1; + else if (sbuf.f_bavail >= resume) + act = 1; + else + act = 0; + + /* + * If some joker switched acct_globals.file under us we'ld better be + * silent and _not_ touch anything. + */ + spin_lock(&acct_globals.lock); + if (file != acct_globals.file) { + if (act) + res = act>0; + goto out; + } + + if (acct_globals.active) { + if (act < 0) { + acct_globals.active = 0; + printk(KERN_INFO "Process accounting paused\n"); + } + } else { + if (act > 0) { + acct_globals.active = 1; + printk(KERN_INFO "Process accounting resumed\n"); + } + } + + del_timer(&acct_globals.timer); + acct_globals.needcheck = 0; + acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct_globals.timer); + res = acct_globals.active; +out: + spin_unlock(&acct_globals.lock); + return res; +} + +/* + * Close the old accouting file (if currently open) and then replace + * it with file (if non-NULL). + * + * NOTE: acct_globals.lock MUST be held on entry and exit. + */ +static void acct_file_reopen(struct file *file) +{ + struct file *old_acct = NULL; + + if (acct_globals.file) { + old_acct = acct_globals.file; + del_timer(&acct_globals.timer); + acct_globals.active = 0; + acct_globals.needcheck = 0; + acct_globals.file = NULL; + } + if (file) { + acct_globals.file = file; + acct_globals.needcheck = 0; + acct_globals.active = 1; + /* It's been deleted if it was used before so this is safe */ + init_timer(&acct_globals.timer); + acct_globals.timer.function = acct_timeout; + acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct_globals.timer); + } + if (old_acct) { + spin_unlock(&acct_globals.lock); + do_acct_process(0, old_acct); + filp_close(old_acct, NULL); + spin_lock(&acct_globals.lock); + } +} + +/* + * sys_acct() is the only system call needed to implement process + * accounting. It takes the name of the file where accounting records + * should be written. If the filename is NULL, accounting will be + * shutdown. + */ +asmlinkage long sys_acct(const char __user *name) +{ + struct file *file = NULL; + char *tmp; + int error; + + if (!capable(CAP_SYS_PACCT)) + return -EPERM; + + if (name) { + tmp = getname(name); + if (IS_ERR(tmp)) { + return (PTR_ERR(tmp)); + } + /* Difference from BSD - they don't do O_APPEND */ + file = filp_open(tmp, O_WRONLY|O_APPEND, 0); + putname(tmp); + if (IS_ERR(file)) { + return (PTR_ERR(file)); + } + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + filp_close(file, NULL); + return (-EACCES); + } + + if (!file->f_op->write) { + filp_close(file, NULL); + return (-EIO); + } + } + + error = security_acct(file); + if (error) { + if (file) + filp_close(file, NULL); + return error; + } + + spin_lock(&acct_globals.lock); + acct_file_reopen(file); + spin_unlock(&acct_globals.lock); + + return (0); +} + +/* + * If the accouting is turned on for a file in the filesystem pointed + * to by sb, turn accouting off. + */ +void acct_auto_close(struct super_block *sb) +{ + spin_lock(&acct_globals.lock); + if (acct_globals.file && + acct_globals.file->f_dentry->d_inode->i_sb == sb) { + acct_file_reopen((struct file *)NULL); + } + spin_unlock(&acct_globals.lock); +} + +/* + * encode an unsigned long into a comp_t + * + * This routine has been adopted from the encode_comp_t() function in + * the kern_acct.c file of the FreeBSD operating system. The encoding + * is a 13-bit fraction with a 3-bit (base 8) exponent. + */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t encode_comp_t(unsigned long value) +{ + int exp, rnd; + + exp = rnd = 0; + while (value > MAXFRACT) { + rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ + value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* + * If we need to round up, do it (and handle overflow correctly). + */ + if (rnd && (++value > MAXFRACT)) { + value >>= EXPSIZE; + exp++; + } + + /* + * Clean it up and polish it off. + */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += value; /* and add on the mantissa. */ + return exp; +} + +#if ACCT_VERSION==1 || ACCT_VERSION==2 +/* + * encode an u64 into a comp2_t (24 bits) + * + * Format: 5 bit base 2 exponent, 20 bits mantissa. + * The leading bit of the mantissa is not stored, but implied for + * non-zero exponents. + * Largest encodable value is 50 bits. + */ + +#define MANTSIZE2 20 /* 20 bit mantissa. */ +#define EXPSIZE2 5 /* 5 bit base 2 exponent. */ +#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ +#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ + +static comp2_t encode_comp2_t(u64 value) +{ + int exp, rnd; + + exp = (value > (MAXFRACT2>>1)); + rnd = 0; + while (value > MAXFRACT2) { + rnd = value & 1; + value >>= 1; + exp++; + } + + /* + * If we need to round up, do it (and handle overflow correctly). + */ + if (rnd && (++value > MAXFRACT2)) { + value >>= 1; + exp++; + } + + if (exp > MAXEXP2) { + /* Overflow. Return largest representable number instead. */ + return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; + } else { + return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); + } +} +#endif + +#if ACCT_VERSION==3 +/* + * encode an u64 into a 32 bit IEEE float + */ +static u32 encode_float(u64 value) +{ + unsigned exp = 190; + unsigned u; + + if (value==0) return 0; + while ((s64)value > 0){ + value <<= 1; + exp--; + } + u = (u32)(value >> 40) & 0x7fffffu; + return u | (exp << 23); +} +#endif + +/* + * Write an accounting entry for an exiting process + * + * The acct_process() call is the workhorse of the process + * accounting system. The struct acct is built here and then written + * into the accounting file. This function should only be called from + * do_exit(). + */ + +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(long exitcode, struct file *file) +{ + acct_t ac; + mm_segment_t fs; + unsigned long vsize; + unsigned long flim; + u64 elapsed; + u64 run_time; + struct timespec uptime; + + /* + * First check to see if there is enough free_space to continue + * the process accounting system. + */ + if (!check_free_space(file)) + return; + + /* + * Fill the accounting struct with the needed info as recorded + * by the different kernel functions. + */ + memset((caddr_t)&ac, 0, sizeof(acct_t)); + + ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + + /* calculate run_time in nsec*/ + do_posix_clock_monotonic_gettime(&uptime); + run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; + run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC + + current->start_time.tv_nsec; + /* convert nsec -> AHZ */ + elapsed = nsec_to_AHZ(run_time); +#if ACCT_VERSION==3 + ac.ac_etime = encode_float(elapsed); +#else + ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + (unsigned long) elapsed : (unsigned long) -1l); +#endif +#if ACCT_VERSION==1 || ACCT_VERSION==2 + { + /* new enlarged etime field */ + comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; + ac.ac_etime_lo = (u16) etime; + } +#endif + do_div(elapsed, AHZ); + ac.ac_btime = xtime.tv_sec - elapsed; + ac.ac_utime = encode_comp_t(jiffies_to_AHZ( + current->signal->utime + + current->group_leader->utime)); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ( + current->signal->stime + + current->group_leader->stime)); + /* we really need to bite the bullet and change layout */ + ac.ac_uid = current->uid; + ac.ac_gid = current->gid; +#if ACCT_VERSION==2 + ac.ac_ahz = AHZ; +#endif +#if ACCT_VERSION==1 || ACCT_VERSION==2 + /* backward-compatible 16 bit fields */ + ac.ac_uid16 = current->uid; + ac.ac_gid16 = current->gid; +#endif +#if ACCT_VERSION==3 + ac.ac_pid = current->tgid; + ac.ac_ppid = current->parent->tgid; +#endif + + read_lock(&tasklist_lock); /* pin current->signal */ + ac.ac_tty = current->signal->tty ? + old_encode_dev(tty_devnum(current->signal->tty)) : 0; + read_unlock(&tasklist_lock); + + ac.ac_flag = 0; + if (current->flags & PF_FORKNOEXEC) + ac.ac_flag |= AFORK; + if (current->flags & PF_SUPERPRIV) + ac.ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + ac.ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + ac.ac_flag |= AXSIG; + + vsize = 0; + if (current->mm) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(¤t->mm->mmap_sem); + } + vsize = vsize / 1024; + ac.ac_mem = encode_comp_t(vsize); + ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ + ac.ac_rw = encode_comp_t(ac.ac_io / 1024); + ac.ac_minflt = encode_comp_t(current->signal->min_flt + + current->group_leader->min_flt); + ac.ac_majflt = encode_comp_t(current->signal->maj_flt + + current->group_leader->maj_flt); + ac.ac_swaps = encode_comp_t(0); + ac.ac_exitcode = exitcode; + + /* + * Kernel segment override to datasegment and write it + * to the accounting file. + */ + fs = get_fs(); + set_fs(KERNEL_DS); + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + file->f_op->write(file, (char *)&ac, + sizeof(acct_t), &file->f_pos); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; + set_fs(fs); +} + +/* + * acct_process - now just a wrapper around do_acct_process + */ +void acct_process(long exitcode) +{ + struct file *file = NULL; + + /* + * accelerate the common fastpath: + */ + if (!acct_globals.file) + return; + + spin_lock(&acct_globals.lock); + file = acct_globals.file; + if (unlikely(!file)) { + spin_unlock(&acct_globals.lock); + return; + } + get_file(file); + spin_unlock(&acct_globals.lock); + + do_acct_process(exitcode, file); + fput(file); +} + + +/* + * acct_update_integrals + * - update mm integral fields in task_struct + */ +void acct_update_integrals(struct task_struct *tsk) +{ + if (likely(tsk->mm)) { + long delta = tsk->stime - tsk->acct_stimexpd; + + if (delta == 0) + return; + tsk->acct_stimexpd = tsk->stime; + tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + } +} + +/* + * acct_clear_integrals + * - clear the mm integral fields in task_struct + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + if (tsk) { + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; + } +} diff --git a/kernel/audit.c b/kernel/audit.c new file mode 100644 index 00000000000..0f84dd7af2c --- /dev/null +++ b/kernel/audit.c @@ -0,0 +1,839 @@ +/* audit.c -- Auditing support -*- linux-c -*- + * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. + * System-call specific features have moved to auditsc.c + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith <faith@redhat.com> + * + * Goals: 1) Integrate fully with SELinux. + * 2) Minimal run-time overhead: + * a) Minimal when syscall auditing is disabled (audit_enable=0). + * b) Small when syscall auditing is enabled and no audit record + * is generated (defer as much work as possible to record + * generation time): + * i) context is allocated, + * ii) names from getname are stored without a copy, and + * iii) inode information stored from path_lookup. + * 3) Ability to disable syscall auditing at boot time (audit=0). + * 4) Usable by other parts of the kernel (if audit_log* is called, + * then a syscall record will be generated automatically for the + * current syscall). + * 5) Netlink interface to user-space. + * 6) Support low-overhead kernel-based filtering to minimize the + * information that must be passed to user-space. + * + * Example user-space utilities: http://people.redhat.com/faith/audit/ + */ + +#include <linux/init.h> +#include <asm/atomic.h> +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <linux/audit.h> + +#include <net/sock.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> + +/* No auditing will take place until audit_initialized != 0. + * (Initialization happens after skb_init is called.) */ +static int audit_initialized; + +/* No syscall auditing will take place unless audit_enabled != 0. */ +int audit_enabled; + +/* Default state when kernel boots without any parameters. */ +static int audit_default; + +/* If auditing cannot proceed, audit_failure selects what happens. */ +static int audit_failure = AUDIT_FAIL_PRINTK; + +/* If audit records are to be written to the netlink socket, audit_pid + * contains the (non-zero) pid. */ +static int audit_pid; + +/* If audit_limit is non-zero, limit the rate of sending audit records + * to that number per second. This prevents DoS attacks, but results in + * audit records being dropped. */ +static int audit_rate_limit; + +/* Number of outstanding audit_buffers allowed. */ +static int audit_backlog_limit = 64; +static atomic_t audit_backlog = ATOMIC_INIT(0); + +/* Records can be lost in several ways: + 0) [suppressed in audit_alloc] + 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] + 2) out of memory in audit_log_move [alloc_skb] + 3) suppressed due to audit_rate_limit + 4) suppressed due to audit_backlog_limit +*/ +static atomic_t audit_lost = ATOMIC_INIT(0); + +/* The netlink socket. */ +static struct sock *audit_sock; + +/* There are two lists of audit buffers. The txlist contains audit + * buffers that cannot be sent immediately to the netlink device because + * we are in an irq context (these are sent later in a tasklet). + * + * The second list is a list of pre-allocated audit buffers (if more + * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of + * being placed on the freelist). */ +static DEFINE_SPINLOCK(audit_txlist_lock); +static DEFINE_SPINLOCK(audit_freelist_lock); +static int audit_freelist_count = 0; +static LIST_HEAD(audit_txlist); +static LIST_HEAD(audit_freelist); + +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +static LIST_HEAD(audit_tsklist); +static LIST_HEAD(audit_entlist); +static LIST_HEAD(audit_extlist); + +/* The netlink socket is only to be read by 1 CPU, which lets us assume + * that list additions and deletions never happen simultaneiously in + * auditsc.c */ +static DECLARE_MUTEX(audit_netlink_sem); + +/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting + * audit records. Since printk uses a 1024 byte buffer, this buffer + * should be at least that large. */ +#define AUDIT_BUFSIZ 1024 + +/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the + * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ +#define AUDIT_MAXFREE (2*NR_CPUS) + +/* The audit_buffer is used when formatting an audit record. The caller + * locks briefly to get the record off the freelist or to allocate the + * buffer, and locks briefly to send the buffer to the netlink layer or + * to place it on a transmit queue. Multiple audit_buffers can be in + * use simultaneously. */ +struct audit_buffer { + struct list_head list; + struct sk_buff_head sklist; /* formatted skbs ready to send */ + struct audit_context *ctx; /* NULL or associated context */ + int len; /* used area of tmp */ + char tmp[AUDIT_BUFSIZ]; + + /* Pointer to header and contents */ + struct nlmsghdr *nlh; + int total; + int type; + int pid; + int count; /* Times requeued */ +}; + +void audit_set_type(struct audit_buffer *ab, int type) +{ + ab->type = type; +} + +struct audit_entry { + struct list_head list; + struct audit_rule rule; +}; + +static void audit_log_end_irq(struct audit_buffer *ab); +static void audit_log_end_fast(struct audit_buffer *ab); + +static void audit_panic(const char *message) +{ + switch (audit_failure) + { + case AUDIT_FAIL_SILENT: + break; + case AUDIT_FAIL_PRINTK: + printk(KERN_ERR "audit: %s\n", message); + break; + case AUDIT_FAIL_PANIC: + panic("audit: %s\n", message); + break; + } +} + +static inline int audit_rate_check(void) +{ + static unsigned long last_check = 0; + static int messages = 0; + static DEFINE_SPINLOCK(lock); + unsigned long flags; + unsigned long now; + unsigned long elapsed; + int retval = 0; + + if (!audit_rate_limit) return 1; + + spin_lock_irqsave(&lock, flags); + if (++messages < audit_rate_limit) { + retval = 1; + } else { + now = jiffies; + elapsed = now - last_check; + if (elapsed > HZ) { + last_check = now; + messages = 0; + retval = 1; + } + } + spin_unlock_irqrestore(&lock, flags); + + return retval; +} + +/* Emit at least 1 message per second, even if audit_rate_check is + * throttling. */ +void audit_log_lost(const char *message) +{ + static unsigned long last_msg = 0; + static DEFINE_SPINLOCK(lock); + unsigned long flags; + unsigned long now; + int print; + + atomic_inc(&audit_lost); + + print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); + + if (!print) { + spin_lock_irqsave(&lock, flags); + now = jiffies; + if (now - last_msg > HZ) { + print = 1; + last_msg = now; + } + spin_unlock_irqrestore(&lock, flags); + } + + if (print) { + printk(KERN_WARNING + "audit: audit_lost=%d audit_backlog=%d" + " audit_rate_limit=%d audit_backlog_limit=%d\n", + atomic_read(&audit_lost), + atomic_read(&audit_backlog), + audit_rate_limit, + audit_backlog_limit); + audit_panic(message); + } + +} + +static int audit_set_rate_limit(int limit) +{ + int old = audit_rate_limit; + audit_rate_limit = limit; + audit_log(current->audit_context, "audit_rate_limit=%d old=%d", + audit_rate_limit, old); + return old; +} + +static int audit_set_backlog_limit(int limit) +{ + int old = audit_backlog_limit; + audit_backlog_limit = limit; + audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", + audit_backlog_limit, old); + return old; +} + +static int audit_set_enabled(int state) +{ + int old = audit_enabled; + if (state != 0 && state != 1) + return -EINVAL; + audit_enabled = state; + audit_log(current->audit_context, "audit_enabled=%d old=%d", + audit_enabled, old); + return old; +} + +static int audit_set_failure(int state) +{ + int old = audit_failure; + if (state != AUDIT_FAIL_SILENT + && state != AUDIT_FAIL_PRINTK + && state != AUDIT_FAIL_PANIC) + return -EINVAL; + audit_failure = state; + audit_log(current->audit_context, "audit_failure=%d old=%d", + audit_failure, old); + return old; +} + +#ifdef CONFIG_NET +void audit_send_reply(int pid, int seq, int type, int done, int multi, + void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + goto nlmsg_failure; + + nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); + return; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); +} + +/* + * Check for appropriate CAP_AUDIT_ capabilities on incoming audit + * control messages. + */ +static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) +{ + int err = 0; + + switch (msg_type) { + case AUDIT_GET: + case AUDIT_LIST: + case AUDIT_SET: + case AUDIT_ADD: + case AUDIT_DEL: + if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) + err = -EPERM; + break; + case AUDIT_USER: + if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) + err = -EPERM; + break; + default: /* bad msg */ + err = -EINVAL; + } + + return err; +} + +static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + u32 uid, pid, seq; + void *data; + struct audit_status *status_get, status_set; + int err; + struct audit_buffer *ab; + u16 msg_type = nlh->nlmsg_type; + + err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); + if (err) + return err; + + pid = NETLINK_CREDS(skb)->pid; + uid = NETLINK_CREDS(skb)->uid; + seq = nlh->nlmsg_seq; + data = NLMSG_DATA(nlh); + + switch (msg_type) { + case AUDIT_GET: + status_set.enabled = audit_enabled; + status_set.failure = audit_failure; + status_set.pid = audit_pid; + status_set.rate_limit = audit_rate_limit; + status_set.backlog_limit = audit_backlog_limit; + status_set.lost = atomic_read(&audit_lost); + status_set.backlog = atomic_read(&audit_backlog); + audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, + &status_set, sizeof(status_set)); + break; + case AUDIT_SET: + if (nlh->nlmsg_len < sizeof(struct audit_status)) + return -EINVAL; + status_get = (struct audit_status *)data; + if (status_get->mask & AUDIT_STATUS_ENABLED) { + err = audit_set_enabled(status_get->enabled); + if (err < 0) return err; + } + if (status_get->mask & AUDIT_STATUS_FAILURE) { + err = audit_set_failure(status_get->failure); + if (err < 0) return err; + } + if (status_get->mask & AUDIT_STATUS_PID) { + int old = audit_pid; + audit_pid = status_get->pid; + audit_log(current->audit_context, + "audit_pid=%d old=%d", audit_pid, old); + } + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + audit_set_rate_limit(status_get->rate_limit); + if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) + audit_set_backlog_limit(status_get->backlog_limit); + break; + case AUDIT_USER: + ab = audit_log_start(NULL); + if (!ab) + break; /* audit_panic has been called */ + audit_log_format(ab, + "user pid=%d uid=%d length=%d msg='%.1024s'", + pid, uid, + (int)(nlh->nlmsg_len + - ((char *)data - (char *)nlh)), + (char *)data); + ab->type = AUDIT_USER; + ab->pid = pid; + audit_log_end(ab); + break; + case AUDIT_ADD: + case AUDIT_DEL: + if (nlh->nlmsg_len < sizeof(struct audit_rule)) + return -EINVAL; + /* fallthrough */ + case AUDIT_LIST: +#ifdef CONFIG_AUDITSYSCALL + err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + uid, seq, data); +#else + err = -EOPNOTSUPP; +#endif + break; + default: + err = -EINVAL; + break; + } + + return err < 0 ? err : 0; +} + +/* Get message from skb (based on rtnetlink_rcv_skb). Each message is + * processed by audit_receive_msg. Malformed skbs with wrong length are + * discarded silently. */ +static int audit_receive_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + u32 rlen; + + while (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if ((err = audit_receive_msg(skb, nlh))) { + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + return 0; +} + +/* Receive messages from netlink socket. */ +static void audit_receive(struct sock *sk, int length) +{ + struct sk_buff *skb; + + if (down_trylock(&audit_netlink_sem)) + return; + + /* FIXME: this must not cause starvation */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + if (audit_receive_skb(skb) && skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else + kfree_skb(skb); + } + up(&audit_netlink_sem); +} + +/* Move data from tmp buffer into an skb. This is an extra copy, and + * that is unfortunate. However, the copy will only occur when a record + * is being written to user space, which is already a high-overhead + * operation. (Elimination of the copy is possible, for example, by + * writing directly into a pre-allocated skb, at the cost of wasting + * memory. */ +static void audit_log_move(struct audit_buffer *ab) +{ + struct sk_buff *skb; + char *start; + int extra = ab->nlh ? 0 : NLMSG_SPACE(0); + + /* possible resubmission */ + if (ab->len == 0) + return; + + skb = skb_peek(&ab->sklist); + if (!skb || skb_tailroom(skb) <= ab->len + extra) { + skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); + if (!skb) { + ab->len = 0; /* Lose information in ab->tmp */ + audit_log_lost("out of memory in audit_log_move"); + return; + } + __skb_queue_tail(&ab->sklist, skb); + if (!ab->nlh) + ab->nlh = (struct nlmsghdr *)skb_put(skb, + NLMSG_SPACE(0)); + } + start = skb_put(skb, ab->len); + memcpy(start, ab->tmp, ab->len); + ab->len = 0; +} + +/* Iterate over the skbuff in the audit_buffer, sending their contents + * to user space. */ +static inline int audit_log_drain(struct audit_buffer *ab) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&ab->sklist))) { + int retval = 0; + + if (audit_pid) { + if (ab->nlh) { + ab->nlh->nlmsg_len = ab->total; + ab->nlh->nlmsg_type = ab->type; + ab->nlh->nlmsg_flags = 0; + ab->nlh->nlmsg_seq = 0; + ab->nlh->nlmsg_pid = ab->pid; + } + skb_get(skb); /* because netlink_* frees */ + retval = netlink_unicast(audit_sock, skb, audit_pid, + MSG_DONTWAIT); + } + if (retval == -EAGAIN && ab->count < 5) { + ++ab->count; + skb_queue_tail(&ab->sklist, skb); + audit_log_end_irq(ab); + return 1; + } + if (retval < 0) { + if (retval == -ECONNREFUSED) { + printk(KERN_ERR + "audit: *NO* daemon at audit_pid=%d\n", + audit_pid); + audit_pid = 0; + } else + audit_log_lost("netlink socket too busy"); + } + if (!audit_pid) { /* No daemon */ + int offset = ab->nlh ? NLMSG_SPACE(0) : 0; + int len = skb->len - offset; + printk(KERN_ERR "%*.*s\n", + len, len, skb->data + offset); + } + kfree_skb(skb); + ab->nlh = NULL; + } + return 0; +} + +/* Initialize audit support at boot time. */ +static int __init audit_init(void) +{ + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", + audit_default ? "enabled" : "disabled"); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + if (!audit_sock) + audit_panic("cannot initialize netlink socket"); + + audit_initialized = 1; + audit_enabled = audit_default; + audit_log(NULL, "initialized"); + return 0; +} + +#else +/* Without CONFIG_NET, we have no skbuffs. For now, print what we have + * in the buffer. */ +static void audit_log_move(struct audit_buffer *ab) +{ + printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); + ab->len = 0; +} + +static inline int audit_log_drain(struct audit_buffer *ab) +{ + return 0; +} + +/* Initialize audit support at boot time. */ +int __init audit_init(void) +{ + printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); + audit_sock = NULL; + audit_pid = 0; + + audit_initialized = 1; + audit_enabled = audit_default; + audit_log(NULL, "initialized"); + return 0; +} +#endif + +__initcall(audit_init); + +/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ +static int __init audit_enable(char *str) +{ + audit_default = !!simple_strtol(str, NULL, 0); + printk(KERN_INFO "audit: %s%s\n", + audit_default ? "enabled" : "disabled", + audit_initialized ? "" : " (after initialization)"); + if (audit_initialized) + audit_enabled = audit_default; + return 0; +} + +__setup("audit=", audit_enable); + + +/* Obtain an audit buffer. This routine does locking to obtain the + * audit buffer, but then no locking is required for calls to + * audit_log_*format. If the tsk is a task that is currently in a + * syscall, then the syscall is marked as auditable and an audit record + * will be written at syscall exit. If there is no associated task, tsk + * should be NULL. */ +struct audit_buffer *audit_log_start(struct audit_context *ctx) +{ + struct audit_buffer *ab = NULL; + unsigned long flags; + struct timespec t; + int serial = 0; + + if (!audit_initialized) + return NULL; + + if (audit_backlog_limit + && atomic_read(&audit_backlog) > audit_backlog_limit) { + if (audit_rate_check()) + printk(KERN_WARNING + "audit: audit_backlog=%d > " + "audit_backlog_limit=%d\n", + atomic_read(&audit_backlog), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; + } + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (!list_empty(&audit_freelist)) { + ab = list_entry(audit_freelist.next, + struct audit_buffer, list); + list_del(&ab->list); + --audit_freelist_count; + } + spin_unlock_irqrestore(&audit_freelist_lock, flags); + + if (!ab) + ab = kmalloc(sizeof(*ab), GFP_ATOMIC); + if (!ab) { + audit_log_lost("out of memory in audit_log_start"); + return NULL; + } + + atomic_inc(&audit_backlog); + skb_queue_head_init(&ab->sklist); + + ab->ctx = ctx; + ab->len = 0; + ab->nlh = NULL; + ab->total = 0; + ab->type = AUDIT_KERNEL; + ab->pid = 0; + ab->count = 0; + +#ifdef CONFIG_AUDITSYSCALL + if (ab->ctx) + audit_get_stamp(ab->ctx, &t, &serial); + else +#endif + t = CURRENT_TIME; + + audit_log_format(ab, "audit(%lu.%03lu:%u): ", + t.tv_sec, t.tv_nsec/1000000, serial); + return ab; +} + + +/* Format an audit message into the audit buffer. If there isn't enough + * room in the audit buffer, more room will be allocated and vsnprint + * will be called a second time. Currently, we assume that a printk + * can't format message larger than 1024 bytes, so we don't either. */ +static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, + va_list args) +{ + int len, avail; + + if (!ab) + return; + + avail = sizeof(ab->tmp) - ab->len; + if (avail <= 0) { + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + } + len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + if (len >= avail) { + /* The printk buffer is 1024 bytes long, so if we get + * here and AUDIT_BUFSIZ is at least 1024, then we can + * log everything that printk could have logged. */ + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + } + ab->len += (len < avail) ? len : avail; + ab->total += (len < avail) ? len : avail; +} + +/* Format a message into the audit buffer. All the work is done in + * audit_log_vformat. */ +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) +{ + va_list args; + + if (!ab) + return; + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); +} + +/* This is a helper-function to print the d_path without using a static + * buffer or allocating another buffer in addition to the one in + * audit_buffer. */ +void audit_log_d_path(struct audit_buffer *ab, const char *prefix, + struct dentry *dentry, struct vfsmount *vfsmnt) +{ + char *p; + int len, avail; + + if (prefix) audit_log_format(ab, " %s", prefix); + + if (ab->len > 128) + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); + if (IS_ERR(p)) { + /* FIXME: can we save some information here? */ + audit_log_format(ab, "<toolong>"); + } else { + /* path isn't at start of buffer */ + len = (ab->tmp + sizeof(ab->tmp) - 1) - p; + memmove(ab->tmp + ab->len, p, len); + ab->len += len; + ab->total += len; + } +} + +/* Remove queued messages from the audit_txlist and send them to userspace. */ +static void audit_tasklet_handler(unsigned long arg) +{ + LIST_HEAD(list); + struct audit_buffer *ab; + unsigned long flags; + + spin_lock_irqsave(&audit_txlist_lock, flags); + list_splice_init(&audit_txlist, &list); + spin_unlock_irqrestore(&audit_txlist_lock, flags); + + while (!list_empty(&list)) { + ab = list_entry(list.next, struct audit_buffer, list); + list_del(&ab->list); + audit_log_end_fast(ab); + } +} + +static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); + +/* The netlink_* functions cannot be called inside an irq context, so + * the audit buffer is places on a queue and a tasklet is scheduled to + * remove them from the queue outside the irq context. May be called in + * any context. */ +static void audit_log_end_irq(struct audit_buffer *ab) +{ + unsigned long flags; + + if (!ab) + return; + spin_lock_irqsave(&audit_txlist_lock, flags); + list_add_tail(&ab->list, &audit_txlist); + spin_unlock_irqrestore(&audit_txlist_lock, flags); + + tasklet_schedule(&audit_tasklet); +} + +/* Send the message in the audit buffer directly to user space. May not + * be called in an irq context. */ +static void audit_log_end_fast(struct audit_buffer *ab) +{ + unsigned long flags; + + BUG_ON(in_irq()); + if (!ab) + return; + if (!audit_rate_check()) { + audit_log_lost("rate limit exceeded"); + } else { + audit_log_move(ab); + if (audit_log_drain(ab)) + return; + } + + atomic_dec(&audit_backlog); + spin_lock_irqsave(&audit_freelist_lock, flags); + if (++audit_freelist_count > AUDIT_MAXFREE) + kfree(ab); + else + list_add(&ab->list, &audit_freelist); + spin_unlock_irqrestore(&audit_freelist_lock, flags); +} + +/* Send or queue the message in the audit buffer, depending on the + * current context. (A convenience function that may be called in any + * context.) */ +void audit_log_end(struct audit_buffer *ab) +{ + if (in_irq()) + audit_log_end_irq(ab); + else + audit_log_end_fast(ab); +} + +/* Log an audit record. This is a convenience function that calls + * audit_log_start, audit_log_vformat, and audit_log_end. It may be + * called in any context. */ +void audit_log(struct audit_context *ctx, const char *fmt, ...) +{ + struct audit_buffer *ab; + va_list args; + + ab = audit_log_start(ctx); + if (ab) { + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); + audit_log_end(ab); + } +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c new file mode 100644 index 00000000000..8c454852d6a --- /dev/null +++ b/kernel/auditsc.c @@ -0,0 +1,1015 @@ +/* auditsc.c -- System-call auditing support -*- linux-c -*- + * Handles all system-call specific auditing features. + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith <faith@redhat.com> + * + * Many of the ideas implemented here are from Stephen C. Tweedie, + * especially the idea of avoiding a copy by using getname. + * + * The method for actual interception of syscall entry and exit (not in + * this file -- see entry.S) is based on a GPL'd patch written by + * okir@suse.de and Copyright 2003 SuSE Linux AG. + * + */ + +#include <linux/init.h> +#include <asm/atomic.h> +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <linux/audit.h> +#include <linux/personality.h> +#include <linux/time.h> +#include <asm/unistd.h> + +/* 0 = no checking + 1 = put_count checking + 2 = verbose put_count checking +*/ +#define AUDIT_DEBUG 0 + +/* No syscall auditing will take place unless audit_enabled != 0. */ +extern int audit_enabled; + +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). */ +#define AUDIT_NAMES 20 + +/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the + * audit_context from being used for nameless inodes from + * path_lookup. */ +#define AUDIT_NAMES_RESERVED 7 + +/* At task start time, the audit_state is set in the audit_context using + a per-task filter. At syscall entry, the audit_state is augmented by + the syscall filter. */ +enum audit_state { + AUDIT_DISABLED, /* Do not create per-task audit_context. + * No syscall-specific audit records can + * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ + AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, + * and always fill it in at syscall + * entry time. This makes a full + * syscall record available if some + * other part of the kernel decides it + * should be recorded. */ + AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, + * always fill it in at syscall entry + * time, and always write out the audit + * record at syscall exit time. */ +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. */ +struct audit_names { + const char *name; + unsigned long ino; + dev_t dev; + umode_t mode; + uid_t uid; + gid_t gid; + dev_t rdev; +}; + +struct audit_aux_data { + struct audit_aux_data *next; + int type; +}; + +#define AUDIT_AUX_IPCPERM 0 + +struct audit_aux_data_ipcctl { + struct audit_aux_data d; + struct ipc_perm p; + unsigned long qbytes; + uid_t uid; + gid_t gid; + mode_t mode; +}; + + +/* The per-task audit context. */ +struct audit_context { + int in_syscall; /* 1 if task is in a syscall */ + enum audit_state state; + unsigned int serial; /* serial number for record */ + struct timespec ctime; /* time of syscall entry */ + uid_t loginuid; /* login uid (identity) */ + int major; /* syscall number */ + unsigned long argv[4]; /* syscall arguments */ + int return_valid; /* return code is valid */ + int return_code;/* syscall return code */ + int auditable; /* 1 if record should be written */ + int name_count; + struct audit_names names[AUDIT_NAMES]; + struct audit_context *previous; /* For nested syscalls */ + struct audit_aux_data *aux; + + /* Save things to print about task_struct */ + pid_t pid; + uid_t uid, euid, suid, fsuid; + gid_t gid, egid, sgid, fsgid; + unsigned long personality; + +#if AUDIT_DEBUG + int put_count; + int ino_count; +#endif +}; + + /* Public API */ +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +static LIST_HEAD(audit_tsklist); +static LIST_HEAD(audit_entlist); +static LIST_HEAD(audit_extlist); + +struct audit_entry { + struct list_head list; + struct rcu_head rcu; + struct audit_rule rule; +}; + +/* Check to see if two rules are identical. It is called from + * audit_del_rule during AUDIT_DEL. */ +static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +{ + int i; + + if (a->flags != b->flags) + return 1; + + if (a->action != b->action) + return 1; + + if (a->field_count != b->field_count) + return 1; + + for (i = 0; i < a->field_count; i++) { + if (a->fields[i] != b->fields[i] + || a->values[i] != b->values[i]) + return 1; + } + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (a->mask[i] != b->mask[i]) + return 1; + + return 0; +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_add_rule(struct audit_entry *entry, + struct list_head *list) +{ + if (entry->rule.flags & AUDIT_PREPEND) { + entry->rule.flags &= ~AUDIT_PREPEND; + list_add_rcu(&entry->list, list); + } else { + list_add_tail_rcu(&entry->list, list); + } + return 0; +} + +static void audit_free_rule(struct rcu_head *head) +{ + struct audit_entry *e = container_of(head, struct audit_entry, rcu); + kfree(e); +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_del_rule(struct audit_rule *rule, + struct list_head *list) +{ + struct audit_entry *e; + + /* Do not use the _rcu iterator here, since this is the only + * deletion routine. */ + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(rule, &e->rule)) { + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule); + return 0; + } + } + return -EFAULT; /* No matching rule */ +} + +#ifdef CONFIG_NET +/* Copy rule from user-space to kernel-space. Called during + * AUDIT_ADD. */ +static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +{ + int i; + + if (s->action != AUDIT_NEVER + && s->action != AUDIT_POSSIBLE + && s->action != AUDIT_ALWAYS) + return -1; + if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) + return -1; + + d->flags = s->flags; + d->action = s->action; + d->field_count = s->field_count; + for (i = 0; i < d->field_count; i++) { + d->fields[i] = s->fields[i]; + d->values[i] = s->values[i]; + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + return 0; +} + +int audit_receive_filter(int type, int pid, int uid, int seq, void *data) +{ + u32 flags; + struct audit_entry *entry; + int err = 0; + + switch (type) { + case AUDIT_LIST: + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. */ + list_for_each_entry(entry, &audit_tsklist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + list_for_each_entry(entry, &audit_entlist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + list_for_each_entry(entry, &audit_extlist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + break; + case AUDIT_ADD: + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) + return -ENOMEM; + if (audit_copy_rule(&entry->rule, data)) { + kfree(entry); + return -EINVAL; + } + flags = entry->rule.flags; + if (!err && (flags & AUDIT_PER_TASK)) + err = audit_add_rule(entry, &audit_tsklist); + if (!err && (flags & AUDIT_AT_ENTRY)) + err = audit_add_rule(entry, &audit_entlist); + if (!err && (flags & AUDIT_AT_EXIT)) + err = audit_add_rule(entry, &audit_extlist); + break; + case AUDIT_DEL: + flags =((struct audit_rule *)data)->flags; + if (!err && (flags & AUDIT_PER_TASK)) + err = audit_del_rule(data, &audit_tsklist); + if (!err && (flags & AUDIT_AT_ENTRY)) + err = audit_del_rule(data, &audit_entlist); + if (!err && (flags & AUDIT_AT_EXIT)) + err = audit_del_rule(data, &audit_extlist); + break; + default: + return -EINVAL; + } + + return err; +} +#endif + +/* Compare a task_struct with an audit_rule. Return 1 on match, 0 + * otherwise. */ +static int audit_filter_rules(struct task_struct *tsk, + struct audit_rule *rule, + struct audit_context *ctx, + enum audit_state *state) +{ + int i, j; + + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 value = rule->values[i]; + int result = 0; + + switch (field) { + case AUDIT_PID: + result = (tsk->pid == value); + break; + case AUDIT_UID: + result = (tsk->uid == value); + break; + case AUDIT_EUID: + result = (tsk->euid == value); + break; + case AUDIT_SUID: + result = (tsk->suid == value); + break; + case AUDIT_FSUID: + result = (tsk->fsuid == value); + break; + case AUDIT_GID: + result = (tsk->gid == value); + break; + case AUDIT_EGID: + result = (tsk->egid == value); + break; + case AUDIT_SGID: + result = (tsk->sgid == value); + break; + case AUDIT_FSGID: + result = (tsk->fsgid == value); + break; + case AUDIT_PERS: + result = (tsk->personality == value); + break; + + case AUDIT_EXIT: + if (ctx && ctx->return_valid) + result = (ctx->return_code == value); + break; + case AUDIT_SUCCESS: + if (ctx && ctx->return_valid) + result = (ctx->return_code >= 0); + break; + case AUDIT_DEVMAJOR: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MAJOR(ctx->names[j].dev)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_DEVMINOR: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MINOR(ctx->names[j].dev)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_INODE: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (ctx->names[j].ino == value) { + ++result; + break; + } + } + } + break; + case AUDIT_LOGINUID: + result = 0; + if (ctx) + result = (ctx->loginuid == value); + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + if (ctx) + result = (ctx->argv[field-AUDIT_ARG0]==value); + break; + } + + if (rule->fields[i] & AUDIT_NEGATE) + result = !result; + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +/* At process creation time, we can determine if system-call auditing is + * completely disabled for this task. Since we only have the task + * structure at this point, we can only check uid and gid. + */ +static enum audit_state audit_filter_task(struct task_struct *tsk) +{ + struct audit_entry *e; + enum audit_state state; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &audit_tsklist, list) { + if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + rcu_read_unlock(); + return state; + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* At syscall entry and exit time, this filter is called if the + * audit_state is not low enough that auditing cannot take place, but is + * also not high enough that we already know we have to write and audit + * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). + */ +static enum audit_state audit_filter_syscall(struct task_struct *tsk, + struct audit_context *ctx, + struct list_head *list) +{ + struct audit_entry *e; + enum audit_state state; + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + rcu_read_lock(); + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* This should be called with task_lock() held. */ +static inline struct audit_context *audit_get_context(struct task_struct *tsk, + int return_valid, + int return_code) +{ + struct audit_context *context = tsk->audit_context; + + if (likely(!context)) + return NULL; + context->return_valid = return_valid; + context->return_code = return_code; + + if (context->in_syscall && !context->auditable) { + enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_extlist); + if (state == AUDIT_RECORD_CONTEXT) + context->auditable = 1; + } + + context->pid = tsk->pid; + context->uid = tsk->uid; + context->gid = tsk->gid; + context->euid = tsk->euid; + context->suid = tsk->suid; + context->fsuid = tsk->fsuid; + context->egid = tsk->egid; + context->sgid = tsk->sgid; + context->fsgid = tsk->fsgid; + context->personality = tsk->personality; + tsk->audit_context = NULL; + return context; +} + +static inline void audit_free_names(struct audit_context *context) +{ + int i; + +#if AUDIT_DEBUG == 2 + if (context->auditable + ||context->put_count + context->ino_count != context->name_count) { + printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d" + " ino_count=%d [NOT freeing]\n", + __LINE__, + context->serial, context->major, context->in_syscall, + context->name_count, context->put_count, + context->ino_count); + for (i = 0; i < context->name_count; i++) + printk(KERN_ERR "names[%d] = %p = %s\n", i, + context->names[i].name, + context->names[i].name); + dump_stack(); + return; + } +#endif +#if AUDIT_DEBUG + context->put_count = 0; + context->ino_count = 0; +#endif + + for (i = 0; i < context->name_count; i++) + if (context->names[i].name) + __putname(context->names[i].name); + context->name_count = 0; +} + +static inline void audit_free_aux(struct audit_context *context) +{ + struct audit_aux_data *aux; + + while ((aux = context->aux)) { + context->aux = aux->next; + kfree(aux); + } +} + +static inline void audit_zero_context(struct audit_context *context, + enum audit_state state) +{ + uid_t loginuid = context->loginuid; + + memset(context, 0, sizeof(*context)); + context->state = state; + context->loginuid = loginuid; +} + +static inline struct audit_context *audit_alloc_context(enum audit_state state) +{ + struct audit_context *context; + + if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + return NULL; + audit_zero_context(context, state); + return context; +} + +/* Filter on the task information and allocate a per-task audit context + * if necessary. Doing so turns on system call auditing for the + * specified task. This is called from copy_process, so no lock is + * needed. */ +int audit_alloc(struct task_struct *tsk) +{ + struct audit_context *context; + enum audit_state state; + + if (likely(!audit_enabled)) + return 0; /* Return if not auditing. */ + + state = audit_filter_task(tsk); + if (likely(state == AUDIT_DISABLED)) + return 0; + + if (!(context = audit_alloc_context(state))) { + audit_log_lost("out of memory in audit_alloc"); + return -ENOMEM; + } + + /* Preserve login uid */ + context->loginuid = -1; + if (current->audit_context) + context->loginuid = current->audit_context->loginuid; + + tsk->audit_context = context; + set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + return 0; +} + +static inline void audit_free_context(struct audit_context *context) +{ + struct audit_context *previous; + int count = 0; + + do { + previous = context->previous; + if (previous || (count && count < 10)) { + ++count; + printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" + " freeing multiple contexts (%d)\n", + context->serial, context->major, + context->name_count, count); + } + audit_free_names(context); + audit_free_aux(context); + kfree(context); + context = previous; + } while (context); + if (count >= 10) + printk(KERN_ERR "audit: freed %d contexts\n", count); +} + +static void audit_log_exit(struct audit_context *context) +{ + int i; + struct audit_buffer *ab; + + ab = audit_log_start(context); + if (!ab) + return; /* audit_panic has been called */ + audit_log_format(ab, "syscall=%d", context->major); + if (context->personality != PER_LINUX) + audit_log_format(ab, " per=%lx", context->personality); + if (context->return_valid) + audit_log_format(ab, " exit=%d", context->return_code); + audit_log_format(ab, + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" + " pid=%d loginuid=%d uid=%d gid=%d" + " euid=%d suid=%d fsuid=%d" + " egid=%d sgid=%d fsgid=%d", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count, + context->pid, + context->loginuid, + context->uid, + context->gid, + context->euid, context->suid, context->fsuid, + context->egid, context->sgid, context->fsgid); + audit_log_end(ab); + while (context->aux) { + struct audit_aux_data *aux; + + ab = audit_log_start(context); + if (!ab) + continue; /* audit_panic has been called */ + + aux = context->aux; + context->aux = aux->next; + + audit_log_format(ab, "auxitem=%d", aux->type); + switch (aux->type) { + case AUDIT_AUX_IPCPERM: { + struct audit_aux_data_ipcctl *axi = (void *)aux; + audit_log_format(ab, + " qbytes=%lx uid=%d gid=%d mode=%x", + axi->qbytes, axi->uid, axi->gid, axi->mode); + } + } + audit_log_end(ab); + kfree(aux); + } + + for (i = 0; i < context->name_count; i++) { + ab = audit_log_start(context); + if (!ab) + continue; /* audit_panic has been called */ + audit_log_format(ab, "item=%d", i); + if (context->names[i].name) + audit_log_format(ab, " name=%s", + context->names[i].name); + if (context->names[i].ino != (unsigned long)-1) + audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" + " uid=%d gid=%d rdev=%02x:%02x", + context->names[i].ino, + MAJOR(context->names[i].dev), + MINOR(context->names[i].dev), + context->names[i].mode, + context->names[i].uid, + context->names[i].gid, + MAJOR(context->names[i].rdev), + MINOR(context->names[i].rdev)); + audit_log_end(ab); + } +} + +/* Free a per-task audit context. Called from copy_process and + * __put_task_struct. */ +void audit_free(struct task_struct *tsk) +{ + struct audit_context *context; + + task_lock(tsk); + context = audit_get_context(tsk, 0, 0); + task_unlock(tsk); + + if (likely(!context)) + return; + + /* Check for system calls that do not go through the exit + * function (e.g., exit_group), then free context block. */ + if (context->in_syscall && context->auditable) + audit_log_exit(context); + + audit_free_context(context); +} + +/* Compute a serial number for the audit record. Audit records are + * written to user-space as soon as they are generated, so a complete + * audit record may be written in several pieces. The timestamp of the + * record and this serial number are used by the user-space daemon to + * determine which pieces belong to the same audit record. The + * (timestamp,serial) tuple is unique for each syscall and is live from + * syscall entry to syscall exit. + * + * Atomic values are only guaranteed to be 24-bit, so we count down. + * + * NOTE: Another possibility is to store the formatted records off the + * audit context (for those records that have a context), and emit them + * all at syscall exit. However, this could delay the reporting of + * significant errors until syscall exit (or never, if the system + * halts). */ +static inline unsigned int audit_serial(void) +{ + static atomic_t serial = ATOMIC_INIT(0xffffff); + unsigned int a, b; + + do { + a = atomic_read(&serial); + if (atomic_dec_and_test(&serial)) + atomic_set(&serial, 0xffffff); + b = atomic_read(&serial); + } while (b != a - 1); + + return 0xffffff - b; +} + +/* Fill in audit context at syscall entry. This only happens if the + * audit context was created when the task was created and the state or + * filters demand the audit context be built. If the state from the + * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, + * then the record will be written at syscall exit time (otherwise, it + * will only be written if another part of the kernel requests that it + * be written). */ +void audit_syscall_entry(struct task_struct *tsk, int major, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) +{ + struct audit_context *context = tsk->audit_context; + enum audit_state state; + + BUG_ON(!context); + + /* This happens only on certain architectures that make system + * calls in kernel_thread via the entry.S interface, instead of + * with direct calls. (If you are porting to a new + * architecture, hitting this condition can indicate that you + * got the _exit/_leave calls backward in entry.S.) + * + * i386 no + * x86_64 no + * ppc64 yes (see arch/ppc64/kernel/misc.S) + * + * This also happens with vm86 emulation in a non-nested manner + * (entries without exits), so this case must be caught. + */ + if (context->in_syscall) { + struct audit_context *newctx; + +#if defined(__NR_vm86) && defined(__NR_vm86old) + /* vm86 mode should only be entered once */ + if (major == __NR_vm86 || major == __NR_vm86old) + return; +#endif +#if AUDIT_DEBUG + printk(KERN_ERR + "audit(:%d) pid=%d in syscall=%d;" + " entering syscall=%d\n", + context->serial, tsk->pid, context->major, major); +#endif + newctx = audit_alloc_context(context->state); + if (newctx) { + newctx->previous = context; + context = newctx; + tsk->audit_context = newctx; + } else { + /* If we can't alloc a new context, the best we + * can do is to leak memory (any pending putname + * will be lost). The only other alternative is + * to abandon auditing. */ + audit_zero_context(context, context->state); + } + } + BUG_ON(context->in_syscall || context->name_count); + + if (!audit_enabled) + return; + + context->major = major; + context->argv[0] = a1; + context->argv[1] = a2; + context->argv[2] = a3; + context->argv[3] = a4; + + state = context->state; + if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + state = audit_filter_syscall(tsk, context, &audit_entlist); + if (likely(state == AUDIT_DISABLED)) + return; + + context->serial = audit_serial(); + context->ctime = CURRENT_TIME; + context->in_syscall = 1; + context->auditable = !!(state == AUDIT_RECORD_CONTEXT); +} + +/* Tear down after system call. If the audit context has been marked as + * auditable (either because of the AUDIT_RECORD_CONTEXT state from + * filtering, or because some other part of the kernel write an audit + * message), then write out the syscall information. In call cases, + * free the names stored from getname(). */ +void audit_syscall_exit(struct task_struct *tsk, int return_code) +{ + struct audit_context *context; + + get_task_struct(tsk); + task_lock(tsk); + context = audit_get_context(tsk, 1, return_code); + task_unlock(tsk); + + /* Not having a context here is ok, since the parent may have + * called __put_task_struct. */ + if (likely(!context)) + return; + + if (context->in_syscall && context->auditable) + audit_log_exit(context); + + context->in_syscall = 0; + context->auditable = 0; + if (context->previous) { + struct audit_context *new_context = context->previous; + context->previous = NULL; + audit_free_context(context); + tsk->audit_context = new_context; + } else { + audit_free_names(context); + audit_free_aux(context); + audit_zero_context(context, context->state); + tsk->audit_context = context; + } + put_task_struct(tsk); +} + +/* Add a name to the list. Called from fs/namei.c:getname(). */ +void audit_getname(const char *name) +{ + struct audit_context *context = current->audit_context; + + if (!context || IS_ERR(name) || !name) + return; + + if (!context->in_syscall) { +#if AUDIT_DEBUG == 2 + printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + __FILE__, __LINE__, context->serial, name); + dump_stack(); +#endif + return; + } + BUG_ON(context->name_count >= AUDIT_NAMES); + context->names[context->name_count].name = name; + context->names[context->name_count].ino = (unsigned long)-1; + ++context->name_count; +} + +/* Intercept a putname request. Called from + * include/linux/fs.h:putname(). If we have stored the name from + * getname in the audit context, then we delay the putname until syscall + * exit. */ +void audit_putname(const char *name) +{ + struct audit_context *context = current->audit_context; + + BUG_ON(!context); + if (!context->in_syscall) { +#if AUDIT_DEBUG == 2 + printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", + __FILE__, __LINE__, context->serial, name); + if (context->name_count) { + int i; + for (i = 0; i < context->name_count; i++) + printk(KERN_ERR "name[%d] = %p = %s\n", i, + context->names[i].name, + context->names[i].name); + } +#endif + __putname(name); + } +#if AUDIT_DEBUG + else { + ++context->put_count; + if (context->put_count > context->name_count) { + printk(KERN_ERR "%s:%d(:%d): major=%d" + " in_syscall=%d putname(%p) name_count=%d" + " put_count=%d\n", + __FILE__, __LINE__, + context->serial, context->major, + context->in_syscall, name, context->name_count, + context->put_count); + dump_stack(); + } + } +#endif +} + +/* Store the inode and device from a lookup. Called from + * fs/namei.c:path_lookup(). */ +void audit_inode(const char *name, const struct inode *inode) +{ + int idx; + struct audit_context *context = current->audit_context; + + if (!context->in_syscall) + return; + if (context->name_count + && context->names[context->name_count-1].name + && context->names[context->name_count-1].name == name) + idx = context->name_count - 1; + else if (context->name_count > 1 + && context->names[context->name_count-2].name + && context->names[context->name_count-2].name == name) + idx = context->name_count - 2; + else { + /* FIXME: how much do we care about inodes that have no + * associated name? */ + if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED) + return; + idx = context->name_count++; + context->names[idx].name = NULL; +#if AUDIT_DEBUG + ++context->ino_count; +#endif + } + context->names[idx].ino = inode->i_ino; + context->names[idx].dev = inode->i_sb->s_dev; + context->names[idx].mode = inode->i_mode; + context->names[idx].uid = inode->i_uid; + context->names[idx].gid = inode->i_gid; + context->names[idx].rdev = inode->i_rdev; +} + +void audit_get_stamp(struct audit_context *ctx, + struct timespec *t, int *serial) +{ + if (ctx) { + t->tv_sec = ctx->ctime.tv_sec; + t->tv_nsec = ctx->ctime.tv_nsec; + *serial = ctx->serial; + ctx->auditable = 1; + } else { + *t = CURRENT_TIME; + *serial = 0; + } +} + +extern int audit_set_type(struct audit_buffer *ab, int type); + +int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) +{ + if (ctx) { + struct audit_buffer *ab; + + ab = audit_log_start(NULL); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%u " + "old loginuid=%u new loginuid=%u", + ctx->pid, ctx->uid, ctx->loginuid, loginuid); + audit_set_type(ab, AUDIT_LOGIN); + audit_log_end(ab); + } + ctx->loginuid = loginuid; + } + return 0; +} + +uid_t audit_get_loginuid(struct audit_context *ctx) +{ + return ctx ? ctx->loginuid : -1; +} + +int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +{ + struct audit_aux_data_ipcctl *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->qbytes = qbytes; + ax->uid = uid; + ax->gid = gid; + ax->mode = mode; + + ax->d.type = AUDIT_AUX_IPCPERM; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} diff --git a/kernel/capability.c b/kernel/capability.c new file mode 100644 index 00000000000..64db1ee820c --- /dev/null +++ b/kernel/capability.c @@ -0,0 +1,220 @@ +/* + * linux/kernel/capability.c + * + * Copyright (C) 1997 Andrew Main <zefram@fysh.org> + * + * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com> + * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <asm/uaccess.h> + +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ +kernel_cap_t cap_bset = CAP_INIT_EFF_SET; + +EXPORT_SYMBOL(securebits); +EXPORT_SYMBOL(cap_bset); + +/* + * This lock protects task->cap_* for all tasks including current. + * Locking rule: acquire this prior to tasklist_lock. + */ +static DEFINE_SPINLOCK(task_capability_lock); + +/* + * For sys_getproccap() and sys_setproccap(), any of the three + * capability set pointers may be NULL -- indicating that that set is + * uninteresting and/or not to be changed. + */ + +/* + * sys_capget - get the capabilities of a given process. + */ +asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) +{ + int ret = 0; + pid_t pid; + __u32 version; + task_t *target; + struct __user_cap_data_struct data; + + if (get_user(version, &header->version)) + return -EFAULT; + + if (version != _LINUX_CAPABILITY_VERSION) { + if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + if (get_user(pid, &header->pid)) + return -EFAULT; + + if (pid < 0) + return -EINVAL; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + if (pid && pid != current->pid) { + target = find_task_by_pid(pid); + if (!target) { + ret = -ESRCH; + goto out; + } + } else + target = current; + + ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted); + +out: + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!ret && copy_to_user(dataptr, &data, sizeof data)) + return -EFAULT; + + return ret; +} + +/* + * cap_set_pg - set capabilities for all processes in a given process + * group. We call this holding task_capability_lock and tasklist_lock. + */ +static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + task_t *g, *target; + int ret = -EPERM; + int found = 0; + + do_each_task_pid(pgrp, PIDTYPE_PGID, g) { + target = g; + while_each_thread(g, target) { + if (!security_capset_check(target, effective, + inheritable, + permitted)) { + security_capset_set(target, effective, + inheritable, + permitted); + ret = 0; + } + found = 1; + } + } while_each_task_pid(pgrp, PIDTYPE_PGID, g); + + if (!found) + ret = 0; + return ret; +} + +/* + * cap_set_all - set capabilities for all processes other than init + * and self. We call this holding task_capability_lock and tasklist_lock. + */ +static inline int cap_set_all(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + task_t *g, *target; + int ret = -EPERM; + int found = 0; + + do_each_thread(g, target) { + if (target == current || target->pid == 1) + continue; + found = 1; + if (security_capset_check(target, effective, inheritable, + permitted)) + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); + } while_each_thread(g, target); + + if (!found) + ret = 0; + return ret; +} + +/* + * sys_capset - set capabilities for a given process, all processes, or all + * processes in a given process group. + * + * The restrictions on setting capabilities are specified as: + * + * [pid is for the 'target' task. 'current' is the calling task.] + * + * I: any raised capabilities must be a subset of the (old current) permitted + * P: any raised capabilities must be a subset of the (old current) permitted + * E: must be set to a subset of (new target) permitted + */ +asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) +{ + kernel_cap_t inheritable, permitted, effective; + __u32 version; + task_t *target; + int ret; + pid_t pid; + + if (get_user(version, &header->version)) + return -EFAULT; + + if (version != _LINUX_CAPABILITY_VERSION) { + if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + if (get_user(pid, &header->pid)) + return -EFAULT; + + if (pid && pid != current->pid && !capable(CAP_SETPCAP)) + return -EPERM; + + if (copy_from_user(&effective, &data->effective, sizeof(effective)) || + copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) || + copy_from_user(&permitted, &data->permitted, sizeof(permitted))) + return -EFAULT; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + if (pid > 0 && pid != current->pid) { + target = find_task_by_pid(pid); + if (!target) { + ret = -ESRCH; + goto out; + } + } else + target = current; + + ret = 0; + + /* having verified that the proposed changes are legal, + we now put them into effect. */ + if (pid < 0) { + if (pid == -1) /* all procs other than current and init */ + ret = cap_set_all(&effective, &inheritable, &permitted); + + else /* all procs in process group */ + ret = cap_set_pg(-pid, &effective, &inheritable, + &permitted); + } else { + ret = security_capset_check(target, &effective, &inheritable, + &permitted); + if (!ret) + security_capset_set(target, &effective, &inheritable, + &permitted); + } + +out: + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} diff --git a/kernel/compat.c b/kernel/compat.c new file mode 100644 index 00000000000..dad10656bf1 --- /dev/null +++ b/kernel/compat.c @@ -0,0 +1,860 @@ +/* + * linux/kernel/compat.c + * + * Kernel compatibililty routines for e.g. 32 bit syscall support + * on 64 bit kernels. + * + * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/compat.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/signal.h> +#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ +#include <linux/futex.h> /* for FUTEX_WAIT */ +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/security.h> + +#include <asm/uaccess.h> +#include <asm/bug.h> + +int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +{ + return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || + __get_user(ts->tv_sec, &cts->tv_sec) || + __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; +} + +int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +{ + return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || + __put_user(ts->tv_sec, &cts->tv_sec) || + __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; +} + +static long compat_nanosleep_restart(struct restart_block *restart) +{ + unsigned long expire = restart->arg0, now = jiffies; + struct compat_timespec __user *rmtp; + + /* Did it expire while we handled signals? */ + if (!time_after(expire, now)) + return 0; + + current->state = TASK_INTERRUPTIBLE; + expire = schedule_timeout(expire - now); + if (expire == 0) + return 0; + + rmtp = (struct compat_timespec __user *)restart->arg1; + if (rmtp) { + struct compat_timespec ct; + struct timespec t; + + jiffies_to_timespec(expire, &t); + ct.tv_sec = t.tv_sec; + ct.tv_nsec = t.tv_nsec; + if (copy_to_user(rmtp, &ct, sizeof(ct))) + return -EFAULT; + } + /* The 'restart' block is already filled in */ + return -ERESTART_RESTARTBLOCK; +} + +asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp) +{ + struct timespec t; + struct restart_block *restart; + unsigned long expire; + + if (get_compat_timespec(&t, rqtp)) + return -EFAULT; + + if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) + return -EINVAL; + + expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); + current->state = TASK_INTERRUPTIBLE; + expire = schedule_timeout(expire); + if (expire == 0) + return 0; + + if (rmtp) { + jiffies_to_timespec(expire, &t); + if (put_compat_timespec(&t, rmtp)) + return -EFAULT; + } + restart = ¤t_thread_info()->restart_block; + restart->fn = compat_nanosleep_restart; + restart->arg0 = jiffies + expire; + restart->arg1 = (unsigned long) rmtp; + return -ERESTART_RESTARTBLOCK; +} + +static inline long get_compat_itimerval(struct itimerval *o, + struct compat_itimerval __user *i) +{ + return (!access_ok(VERIFY_READ, i, sizeof(*i)) || + (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | + __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | + __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | + __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); +} + +static inline long put_compat_itimerval(struct compat_itimerval __user *o, + struct itimerval *i) +{ + return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || + (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | + __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | + __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | + __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); +} + +asmlinkage long compat_sys_getitimer(int which, + struct compat_itimerval __user *it) +{ + struct itimerval kit; + int error; + + error = do_getitimer(which, &kit); + if (!error && put_compat_itimerval(it, &kit)) + error = -EFAULT; + return error; +} + +asmlinkage long compat_sys_setitimer(int which, + struct compat_itimerval __user *in, + struct compat_itimerval __user *out) +{ + struct itimerval kin, kout; + int error; + + if (in) { + if (get_compat_itimerval(&kin, in)) + return -EFAULT; + } else + memset(&kin, 0, sizeof(kin)); + + error = do_setitimer(which, &kin, out ? &kout : NULL); + if (error || !out) + return error; + if (put_compat_itimerval(out, &kout)) + return -EFAULT; + return 0; +} + +asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) +{ + /* + * In the SMP world we might just be unlucky and have one of + * the times increment as we use it. Since the value is an + * atomically safe type this is just fine. Conceptually its + * as if the syscall took an instant longer to occur. + */ + if (tbuf) { + struct compat_tms tmp; + struct task_struct *tsk = current; + struct task_struct *t; + cputime_t utime, stime, cutime, cstime; + + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + + tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); + tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); + tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); + tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); + if (copy_to_user(tbuf, &tmp, sizeof(tmp))) + return -EFAULT; + } + return compat_jiffies_to_clock_t(jiffies); +} + +/* + * Assumption: old_sigset_t and compat_old_sigset_t are both + * types that can be passed to put_user()/get_user(). + */ + +asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) +{ + old_sigset_t s; + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sigpending((old_sigset_t __user *) &s); + set_fs(old_fs); + if (ret == 0) + ret = put_user(s, set); + return ret; +} + +asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, + compat_old_sigset_t __user *oset) +{ + old_sigset_t s; + long ret; + mm_segment_t old_fs; + + if (set && get_user(s, set)) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_sigprocmask(how, + set ? (old_sigset_t __user *) &s : NULL, + oset ? (old_sigset_t __user *) &s : NULL); + set_fs(old_fs); + if (ret == 0) + if (oset) + ret = put_user(s, oset); + return ret; +} + +#ifdef CONFIG_FUTEX +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, + struct compat_timespec __user *utime, u32 __user *uaddr2, + int val3) +{ + struct timespec t; + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + int val2 = 0; + + if ((op == FUTEX_WAIT) && utime) { + if (get_compat_timespec(&t, utime)) + return -EFAULT; + timeout = timespec_to_jiffies(&t) + 1; + } + if (op >= FUTEX_REQUEUE) + val2 = (int) (unsigned long) utime; + + return do_futex((unsigned long)uaddr, op, val, timeout, + (unsigned long)uaddr2, val2, val3); +} +#endif + +asmlinkage long compat_sys_setrlimit(unsigned int resource, + struct compat_rlimit __user *rlim) +{ + struct rlimit r; + int ret; + mm_segment_t old_fs = get_fs (); + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || + __get_user(r.rlim_cur, &rlim->rlim_cur) || + __get_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + + if (r.rlim_cur == COMPAT_RLIM_INFINITY) + r.rlim_cur = RLIM_INFINITY; + if (r.rlim_max == COMPAT_RLIM_INFINITY) + r.rlim_max = RLIM_INFINITY; + set_fs(KERNEL_DS); + ret = sys_setrlimit(resource, (struct rlimit __user *) &r); + set_fs(old_fs); + return ret; +} + +#ifdef COMPAT_RLIM_OLD_INFINITY + +asmlinkage long compat_sys_old_getrlimit(unsigned int resource, + struct compat_rlimit __user *rlim) +{ + struct rlimit r; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_old_getrlimit(resource, &r); + set_fs(old_fs); + + if (!ret) { + if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) + r.rlim_cur = COMPAT_RLIM_INFINITY; + if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) + r.rlim_max = COMPAT_RLIM_INFINITY; + + if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || + __put_user(r.rlim_cur, &rlim->rlim_cur) || + __put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + } + return ret; +} + +#endif + +asmlinkage long compat_sys_getrlimit (unsigned int resource, + struct compat_rlimit __user *rlim) +{ + struct rlimit r; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_getrlimit(resource, (struct rlimit __user *) &r); + set_fs(old_fs); + if (!ret) { + if (r.rlim_cur > COMPAT_RLIM_INFINITY) + r.rlim_cur = COMPAT_RLIM_INFINITY; + if (r.rlim_max > COMPAT_RLIM_INFINITY) + r.rlim_max = COMPAT_RLIM_INFINITY; + + if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || + __put_user(r.rlim_cur, &rlim->rlim_cur) || + __put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + } + return ret; +} + +int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) +{ + if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || + __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || + __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || + __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || + __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || + __put_user(r->ru_maxrss, &ru->ru_maxrss) || + __put_user(r->ru_ixrss, &ru->ru_ixrss) || + __put_user(r->ru_idrss, &ru->ru_idrss) || + __put_user(r->ru_isrss, &ru->ru_isrss) || + __put_user(r->ru_minflt, &ru->ru_minflt) || + __put_user(r->ru_majflt, &ru->ru_majflt) || + __put_user(r->ru_nswap, &ru->ru_nswap) || + __put_user(r->ru_inblock, &ru->ru_inblock) || + __put_user(r->ru_oublock, &ru->ru_oublock) || + __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || + __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || + __put_user(r->ru_nsignals, &ru->ru_nsignals) || + __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || + __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) + return -EFAULT; + return 0; +} + +asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) +{ + struct rusage r; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_getrusage(who, (struct rusage __user *) &r); + set_fs(old_fs); + + if (ret) + return ret; + + if (put_compat_rusage(&r, ru)) + return -EFAULT; + + return 0; +} + +asmlinkage long +compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, + struct compat_rusage __user *ru) +{ + if (!ru) { + return sys_wait4(pid, stat_addr, options, NULL); + } else { + struct rusage r; + int ret; + unsigned int status; + mm_segment_t old_fs = get_fs(); + + set_fs (KERNEL_DS); + ret = sys_wait4(pid, + (stat_addr ? + (unsigned int __user *) &status : NULL), + options, (struct rusage __user *) &r); + set_fs (old_fs); + + if (ret > 0) { + if (put_compat_rusage(&r, ru)) + return -EFAULT; + if (stat_addr && put_user(status, stat_addr)) + return -EFAULT; + } + return ret; + } +} + +asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, + struct compat_siginfo __user *uinfo, int options, + struct compat_rusage __user *uru) +{ + siginfo_t info; + struct rusage ru; + long ret; + mm_segment_t old_fs = get_fs(); + + memset(&info, 0, sizeof(info)); + + set_fs(KERNEL_DS); + ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? (struct rusage __user *)&ru : NULL); + set_fs(old_fs); + + if ((ret < 0) || (info.si_signo == 0)) + return ret; + + if (uru) { + ret = put_compat_rusage(&ru, uru); + if (ret) + return ret; + } + + BUG_ON(info.si_code & __SI_MASK); + info.si_code |= __SI_CHLD; + return copy_siginfo_to_user32(uinfo, &info); +} + +static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, + unsigned len, cpumask_t *new_mask) +{ + unsigned long *k; + + if (len < sizeof(cpumask_t)) + memset(new_mask, 0, sizeof(cpumask_t)); + else if (len > sizeof(cpumask_t)) + len = sizeof(cpumask_t); + + k = cpus_addr(*new_mask); + return compat_get_bitmap(k, user_mask_ptr, len * 8); +} + +asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, + unsigned int len, + compat_ulong_t __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + + retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, + compat_ulong_t __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + unsigned long *k; + unsigned int min_length = sizeof(cpumask_t); + + if (NR_CPUS <= BITS_PER_COMPAT_LONG) + min_length = sizeof(compat_ulong_t); + + if (len < min_length) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + k = cpus_addr(mask); + ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); + if (ret) + return ret; + + return min_length; +} + +static int get_compat_itimerspec(struct itimerspec *dst, + struct compat_itimerspec __user *src) +{ + if (get_compat_timespec(&dst->it_interval, &src->it_interval) || + get_compat_timespec(&dst->it_value, &src->it_value)) + return -EFAULT; + return 0; +} + +static int put_compat_itimerspec(struct compat_itimerspec __user *dst, + struct itimerspec *src) +{ + if (put_compat_timespec(&src->it_interval, &dst->it_interval) || + put_compat_timespec(&src->it_value, &dst->it_value)) + return -EFAULT; + return 0; +} + +long compat_sys_timer_settime(timer_t timer_id, int flags, + struct compat_itimerspec __user *new, + struct compat_itimerspec __user *old) +{ + long err; + mm_segment_t oldfs; + struct itimerspec newts, oldts; + + if (!new) + return -EINVAL; + if (get_compat_itimerspec(&newts, new)) + return -EFAULT; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_timer_settime(timer_id, flags, + (struct itimerspec __user *) &newts, + (struct itimerspec __user *) &oldts); + set_fs(oldfs); + if (!err && old && put_compat_itimerspec(old, &oldts)) + return -EFAULT; + return err; +} + +long compat_sys_timer_gettime(timer_t timer_id, + struct compat_itimerspec __user *setting) +{ + long err; + mm_segment_t oldfs; + struct itimerspec ts; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_timer_gettime(timer_id, + (struct itimerspec __user *) &ts); + set_fs(oldfs); + if (!err && put_compat_itimerspec(setting, &ts)) + return -EFAULT; + return err; +} + +long compat_sys_clock_settime(clockid_t which_clock, + struct compat_timespec __user *tp) +{ + long err; + mm_segment_t oldfs; + struct timespec ts; + + if (get_compat_timespec(&ts, tp)) + return -EFAULT; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_clock_settime(which_clock, + (struct timespec __user *) &ts); + set_fs(oldfs); + return err; +} + +long compat_sys_clock_gettime(clockid_t which_clock, + struct compat_timespec __user *tp) +{ + long err; + mm_segment_t oldfs; + struct timespec ts; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_clock_gettime(which_clock, + (struct timespec __user *) &ts); + set_fs(oldfs); + if (!err && put_compat_timespec(&ts, tp)) + return -EFAULT; + return err; +} + +long compat_sys_clock_getres(clockid_t which_clock, + struct compat_timespec __user *tp) +{ + long err; + mm_segment_t oldfs; + struct timespec ts; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_clock_getres(which_clock, + (struct timespec __user *) &ts); + set_fs(oldfs); + if (!err && tp && put_compat_timespec(&ts, tp)) + return -EFAULT; + return err; +} + +long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, + struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp) +{ + long err; + mm_segment_t oldfs; + struct timespec in, out; + + if (get_compat_timespec(&in, rqtp)) + return -EFAULT; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_clock_nanosleep(which_clock, flags, + (struct timespec __user *) &in, + (struct timespec __user *) &out); + set_fs(oldfs); + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && + put_compat_timespec(&out, rmtp)) + return -EFAULT; + return err; +} + +/* + * We currently only need the following fields from the sigevent + * structure: sigev_value, sigev_signo, sig_notify and (sometimes + * sigev_notify_thread_id). The others are handled in user mode. + * We also assume that copying sigev_value.sival_int is sufficient + * to keep all the bits of sigev_value.sival_ptr intact. + */ +int get_compat_sigevent(struct sigevent *event, + const struct compat_sigevent __user *u_event) +{ + memset(&event, 0, sizeof(*event)); + return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) || + __get_user(event->sigev_value.sival_int, + &u_event->sigev_value.sival_int) || + __get_user(event->sigev_signo, &u_event->sigev_signo) || + __get_user(event->sigev_notify, &u_event->sigev_notify) || + __get_user(event->sigev_notify_thread_id, + &u_event->sigev_notify_thread_id)) + ? -EFAULT : 0; +} + +/* timer_create is architecture specific because it needs sigevent conversion */ + +long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, + unsigned long bitmap_size) +{ + int i, j; + unsigned long m; + compat_ulong_t um; + unsigned long nr_compat_longs; + + /* align bitmap up to nearest compat_long_t boundary */ + bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + + if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) + return -EFAULT; + + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); + + for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { + m = 0; + + for (j = 0; j < sizeof(m)/sizeof(um); j++) { + /* + * We dont want to read past the end of the userspace + * bitmap. We must however ensure the end of the + * kernel bitmap is zeroed. + */ + if (nr_compat_longs-- > 0) { + if (__get_user(um, umask)) + return -EFAULT; + } else { + um = 0; + } + + umask++; + m |= (long)um << (j * BITS_PER_COMPAT_LONG); + } + *mask++ = m; + } + + return 0; +} + +long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, + unsigned long bitmap_size) +{ + int i, j; + unsigned long m; + compat_ulong_t um; + unsigned long nr_compat_longs; + + /* align bitmap up to nearest compat_long_t boundary */ + bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + + if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) + return -EFAULT; + + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); + + for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { + m = *mask++; + + for (j = 0; j < sizeof(m)/sizeof(um); j++) { + um = m; + + /* + * We dont want to write past the end of the userspace + * bitmap. + */ + if (nr_compat_longs-- > 0) { + if (__put_user(um, umask)) + return -EFAULT; + } + + umask++; + m >>= 4*sizeof(um); + m >>= 4*sizeof(um); + } + } + + return 0; +} + +void +sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +{ + switch (_NSIG_WORDS) { +#if defined (__COMPAT_ENDIAN_SWAP__) + case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); + case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); + case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); + case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); +#else + case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); + case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); + case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); + case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); +#endif + } +} + +asmlinkage long +compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, + struct compat_siginfo __user *uinfo, + struct compat_timespec __user *uts, compat_size_t sigsetsize) +{ + compat_sigset_t s32; + sigset_t s; + int sig; + struct timespec t; + siginfo_t info; + long ret, timeout = 0; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&s, &s32); + sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); + signotset(&s); + + if (uts) { + if (get_compat_timespec (&t, uts)) + return -EFAULT; + if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 + || t.tv_sec < 0) + return -EINVAL; + } + + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &s, &info); + if (!sig) { + timeout = MAX_SCHEDULE_TIMEOUT; + if (uts) + timeout = timespec_to_jiffies(&t) + +(t.tv_sec || t.tv_nsec); + if (timeout) { + current->real_blocked = current->blocked; + sigandsets(¤t->blocked, ¤t->blocked, &s); + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + timeout = schedule_timeout(timeout); + + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &s, &info); + current->blocked = current->real_blocked; + siginitset(¤t->real_blocked, 0); + recalc_sigpending(); + } + } + spin_unlock_irq(¤t->sighand->siglock); + + if (sig) { + ret = sig; + if (uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + }else { + ret = timeout?-EINTR:-EAGAIN; + } + return ret; + +} + +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. */ + +asmlinkage long compat_sys_time(compat_time_t __user * tloc) +{ + compat_time_t i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; + + if (tloc) { + if (put_user(i,tloc)) + i = -EFAULT; + } + return i; +} + +asmlinkage long compat_sys_stime(compat_time_t __user *tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ diff --git a/kernel/configs.c b/kernel/configs.c new file mode 100644 index 00000000000..986f7af31e0 --- /dev/null +++ b/kernel/configs.c @@ -0,0 +1,118 @@ +/* + * kernel/configs.c + * Echo the kernel .config file used to build the kernel + * + * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> + * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> + * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> + * Copyright (C) 2002 Hewlett-Packard Company + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <asm/uaccess.h> + +/**************************************************/ +/* the actual current config file */ + +/* + * Define kernel_config_data and kernel_config_data_size, which contains the + * wrapped and compressed configuration file. The file is first compressed + * with gzip and then bounded by two eight byte magic numbers to allow + * extraction from a binary kernel image: + * + * IKCFG_ST + * <image> + * IKCFG_ED + */ +#define MAGIC_START "IKCFG_ST" +#define MAGIC_END "IKCFG_ED" +#include "config_data.h" + + +#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) +#define kernel_config_data_size \ + (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) + +#ifdef CONFIG_IKCONFIG_PROC + +/**************************************************/ +/* globals and useful constants */ + +static ssize_t +ikconfig_read_current(struct file *file, char __user *buf, + size_t len, loff_t * offset) +{ + loff_t pos = *offset; + ssize_t count; + + if (pos >= kernel_config_data_size) + return 0; + + count = min(len, (size_t)(kernel_config_data_size - pos)); + if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) + return -EFAULT; + + *offset += count; + return count; +} + +static struct file_operations ikconfig_file_ops = { + .owner = THIS_MODULE, + .read = ikconfig_read_current, +}; + +/***************************************************/ +/* ikconfig_init: start up everything we need to */ + +static int __init ikconfig_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current config file */ + entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, + &proc_root); + if (!entry) + return -ENOMEM; + + entry->proc_fops = &ikconfig_file_ops; + entry->size = kernel_config_data_size; + + return 0; +} + +/***************************************************/ +/* ikconfig_cleanup: clean up our mess */ + +static void __exit ikconfig_cleanup(void) +{ + remove_proc_entry("config.gz", &proc_root); +} + +module_init(ikconfig_init); +module_exit(ikconfig_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Randy Dunlap"); +MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); + +#endif /* CONFIG_IKCONFIG_PROC */ diff --git a/kernel/cpu.c b/kernel/cpu.c new file mode 100644 index 00000000000..628f4ccda12 --- /dev/null +++ b/kernel/cpu.c @@ -0,0 +1,193 @@ +/* CPU control. + * (C) 2001, 2002, 2003, 2004 Rusty Russell + * + * This code is licenced under the GPL. + */ +#include <linux/proc_fs.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/sched.h> +#include <linux/unistd.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/stop_machine.h> +#include <asm/semaphore.h> + +/* This protects CPUs going up and down... */ +DECLARE_MUTEX(cpucontrol); + +static struct notifier_block *cpu_chain; + +/* Need to know about CPUs going up/down? */ +int register_cpu_notifier(struct notifier_block *nb) +{ + int ret; + + if ((ret = down_interruptible(&cpucontrol)) != 0) + return ret; + ret = notifier_chain_register(&cpu_chain, nb); + up(&cpucontrol); + return ret; +} +EXPORT_SYMBOL(register_cpu_notifier); + +void unregister_cpu_notifier(struct notifier_block *nb) +{ + down(&cpucontrol); + notifier_chain_unregister(&cpu_chain, nb); + up(&cpucontrol); +} +EXPORT_SYMBOL(unregister_cpu_notifier); + +#ifdef CONFIG_HOTPLUG_CPU +static inline void check_for_tasks(int cpu) +{ + struct task_struct *p; + + write_lock_irq(&tasklist_lock); + for_each_process(p) { + if (task_cpu(p) == cpu && + (!cputime_eq(p->utime, cputime_zero) || + !cputime_eq(p->stime, cputime_zero))) + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ + (state = %ld, flags = %lx) \n", + p->comm, p->pid, cpu, p->state, p->flags); + } + write_unlock_irq(&tasklist_lock); +} + +/* Take this CPU down. */ +static int take_cpu_down(void *unused) +{ + int err; + + /* Take offline: makes arch_cpu_down somewhat easier. */ + cpu_clear(smp_processor_id(), cpu_online_map); + + /* Ensure this CPU doesn't handle any more interrupts. */ + err = __cpu_disable(); + if (err < 0) + cpu_set(smp_processor_id(), cpu_online_map); + else + /* Force idle task to run as soon as we yield: it should + immediately notice cpu is offline and die quickly. */ + sched_idle_next(); + + return err; +} + +int cpu_down(unsigned int cpu) +{ + int err; + struct task_struct *p; + cpumask_t old_allowed, tmp; + + if ((err = lock_cpu_hotplug_interruptible()) != 0) + return err; + + if (num_online_cpus() == 1) { + err = -EBUSY; + goto out; + } + + if (!cpu_online(cpu)) { + err = -EINVAL; + goto out; + } + + err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + (void *)(long)cpu); + if (err == NOTIFY_BAD) { + printk("%s: attempt to take down CPU %u failed\n", + __FUNCTION__, cpu); + err = -EINVAL; + goto out; + } + + /* Ensure that we are not runnable on dying cpu */ + old_allowed = current->cpus_allowed; + tmp = CPU_MASK_ALL; + cpu_clear(cpu, tmp); + set_cpus_allowed(current, tmp); + + p = __stop_machine_run(take_cpu_down, NULL, cpu); + if (IS_ERR(p)) { + /* CPU didn't die: tell everyone. Can't complain. */ + if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + (void *)(long)cpu) == NOTIFY_BAD) + BUG(); + + err = PTR_ERR(p); + goto out_allowed; + } + + if (cpu_online(cpu)) + goto out_thread; + + /* Wait for it to sleep (leaving idle task). */ + while (!idle_cpu(cpu)) + yield(); + + /* This actually kills the CPU. */ + __cpu_die(cpu); + + /* Move it here so it can run. */ + kthread_bind(p, get_cpu()); + put_cpu(); + + /* CPU is completely dead: tell everyone. Too late to complain. */ + if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) + == NOTIFY_BAD) + BUG(); + + check_for_tasks(cpu); + +out_thread: + err = kthread_stop(p); +out_allowed: + set_cpus_allowed(current, old_allowed); +out: + unlock_cpu_hotplug(); + return err; +} +#endif /*CONFIG_HOTPLUG_CPU*/ + +int __devinit cpu_up(unsigned int cpu) +{ + int ret; + void *hcpu = (void *)(long)cpu; + + if ((ret = down_interruptible(&cpucontrol)) != 0) + return ret; + + if (cpu_online(cpu) || !cpu_present(cpu)) { + ret = -EINVAL; + goto out; + } + ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + if (ret == NOTIFY_BAD) { + printk("%s: attempt to bring up CPU %u failed\n", + __FUNCTION__, cpu); + ret = -EINVAL; + goto out_notify; + } + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu); + if (ret != 0) + goto out_notify; + if (!cpu_online(cpu)) + BUG(); + + /* Now call notifier in preparation. */ + notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + +out_notify: + if (ret != 0) + notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); +out: + up(&cpucontrol); + return ret; +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c new file mode 100644 index 00000000000..69792bbe228 --- /dev/null +++ b/kernel/cpuset.c @@ -0,0 +1,1564 @@ +/* + * kernel/cpuset.c + * + * Processor and Memory placement constraints for sets of tasks. + * + * Copyright (C) 2003 BULL SA. + * Copyright (C) 2004 Silicon Graphics, Inc. + * + * Portions derived from Patrick Mochel's sysfs code. + * sysfs is Copyright (c) 2001-3 Patrick Mochel + * Portions Copyright (c) 2004 Silicon Graphics, Inc. + * + * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> + * 2003-10-22 Updates by Stephen Hemminger. + * 2004 May-July Rework by Paul Jackson <pj@sgi.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include <linux/config.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/spinlock.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/time.h> +#include <linux/backing-dev.h> +#include <linux/sort.h> + +#include <asm/uaccess.h> +#include <asm/atomic.h> +#include <asm/semaphore.h> + +#define CPUSET_SUPER_MAGIC 0x27e0eb + +struct cpuset { + unsigned long flags; /* "unsigned long" so bitops work */ + cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ + nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + + atomic_t count; /* count tasks using this cpuset */ + + /* + * We link our 'sibling' struct into our parents 'children'. + * Our children link their 'sibling' into our 'children'. + */ + struct list_head sibling; /* my parents children */ + struct list_head children; /* my children */ + + struct cpuset *parent; /* my parent */ + struct dentry *dentry; /* cpuset fs entry */ + + /* + * Copy of global cpuset_mems_generation as of the most + * recent time this cpuset changed its mems_allowed. + */ + int mems_generation; +}; + +/* bits in struct cpuset flags field */ +typedef enum { + CS_CPU_EXCLUSIVE, + CS_MEM_EXCLUSIVE, + CS_REMOVED, + CS_NOTIFY_ON_RELEASE +} cpuset_flagbits_t; + +/* convenient tests for these bits */ +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_removed(const struct cpuset *cs) +{ + return !!test_bit(CS_REMOVED, &cs->flags); +} + +static inline int notify_on_release(const struct cpuset *cs) +{ + return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); +} + +/* + * Increment this atomic integer everytime any cpuset changes its + * mems_allowed value. Users of cpusets can track this generation + * number, and avoid having to lock and reload mems_allowed unless + * the cpuset they're using changes generation. + * + * A single, global generation is needed because attach_task() could + * reattach a task to a different cpuset, which must not have its + * generation numbers aliased with those of that tasks previous cpuset. + * + * Generations are needed for mems_allowed because one task cannot + * modify anothers memory placement. So we must enable every task, + * on every visit to __alloc_pages(), to efficiently check whether + * its current->cpuset->mems_allowed has changed, requiring an update + * of its current->mems_allowed. + */ +static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); + +static struct cpuset top_cpuset = { + .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .cpus_allowed = CPU_MASK_ALL, + .mems_allowed = NODE_MASK_ALL, + .count = ATOMIC_INIT(0), + .sibling = LIST_HEAD_INIT(top_cpuset.sibling), + .children = LIST_HEAD_INIT(top_cpuset.children), + .parent = NULL, + .dentry = NULL, + .mems_generation = 0, +}; + +static struct vfsmount *cpuset_mount; +static struct super_block *cpuset_sb = NULL; + +/* + * cpuset_sem should be held by anyone who is depending on the children + * or sibling lists of any cpuset, or performing non-atomic operations + * on the flags or *_allowed values of a cpuset, such as raising the + * CS_REMOVED flag bit iff it is not already raised, or reading and + * conditionally modifying the *_allowed values. One kernel global + * cpuset semaphore should be sufficient - these things don't change + * that much. + * + * The code that modifies cpusets holds cpuset_sem across the entire + * operation, from cpuset_common_file_write() down, single threading + * all cpuset modifications (except for counter manipulations from + * fork and exit) across the system. This presumes that cpuset + * modifications are rare - better kept simple and safe, even if slow. + * + * The code that reads cpusets, such as in cpuset_common_file_read() + * and below, only holds cpuset_sem across small pieces of code, such + * as when reading out possibly multi-word cpumasks and nodemasks, as + * the risks are less, and the desire for performance a little greater. + * The proc_cpuset_show() routine needs to hold cpuset_sem to insure + * that no cs->dentry is NULL, as it walks up the cpuset tree to root. + * + * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't + * (usually) grab cpuset_sem. These are the two most performance + * critical pieces of code here. The exception occurs on exit(), + * if the last task using a cpuset exits, and the cpuset was marked + * notify_on_release. In that case, the cpuset_sem is taken, the + * path to the released cpuset calculated, and a usermode call made + * to /sbin/cpuset_release_agent with the name of the cpuset (path + * relative to the root of cpuset file system) as the argument. + * + * A cpuset can only be deleted if both its 'count' of using tasks is + * zero, and its list of 'children' cpusets is empty. Since all tasks + * in the system use _some_ cpuset, and since there is always at least + * one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. So no need + * for any special hack to ensure that top_cpuset cannot be deleted. + */ + +static DECLARE_MUTEX(cpuset_sem); + +/* + * A couple of forward declarations required, due to cyclic reference loop: + * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file + * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir. + */ + +static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry); + +static struct backing_dev_info cpuset_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, +}; + +static struct inode *cpuset_new_inode(mode_t mode) +{ + struct inode *inode = new_inode(cpuset_sb); + + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; + } + return inode; +} + +static void cpuset_diput(struct dentry *dentry, struct inode *inode) +{ + /* is dentry a directory ? if so, kfree() associated cpuset */ + if (S_ISDIR(inode->i_mode)) { + struct cpuset *cs = dentry->d_fsdata; + BUG_ON(!(is_removed(cs))); + kfree(cs); + } + iput(inode); +} + +static struct dentry_operations cpuset_dops = { + .d_iput = cpuset_diput, +}; + +static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) +{ + struct qstr qstr; + struct dentry *d; + + qstr.name = name; + qstr.len = strlen(name); + qstr.hash = full_name_hash(name, qstr.len); + d = lookup_hash(&qstr, parent); + if (!IS_ERR(d)) + d->d_op = &cpuset_dops; + return d; +} + +static void remove_dir(struct dentry *d) +{ + struct dentry *parent = dget(d->d_parent); + + d_delete(d); + simple_rmdir(parent->d_inode, d); + dput(parent); +} + +/* + * NOTE : the dentry must have been dget()'ed + */ +static void cpuset_d_remove_dir(struct dentry *dentry) +{ + struct list_head *node; + + spin_lock(&dcache_lock); + node = dentry->d_subdirs.next; + while (node != &dentry->d_subdirs) { + struct dentry *d = list_entry(node, struct dentry, d_child); + list_del_init(node); + if (d->d_inode) { + d = dget_locked(d); + spin_unlock(&dcache_lock); + d_delete(d); + simple_unlink(dentry->d_inode, d); + dput(d); + spin_lock(&dcache_lock); + } + node = dentry->d_subdirs.next; + } + list_del_init(&dentry->d_child); + spin_unlock(&dcache_lock); + remove_dir(dentry); +} + +static struct super_operations cpuset_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, +}; + +static int cpuset_fill_super(struct super_block *sb, void *unused_data, + int unused_silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = CPUSET_SUPER_MAGIC; + sb->s_op = &cpuset_ops; + cpuset_sb = sb; + + inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR); + if (inode) { + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + /* directories start off with i_nlink == 2 (for "." entry) */ + inode->i_nlink++; + } else { + return -ENOMEM; + } + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + return get_sb_single(fs_type, flags, data, cpuset_fill_super); +} + +static struct file_system_type cpuset_fs_type = { + .name = "cpuset", + .get_sb = cpuset_get_sb, + .kill_sb = kill_litter_super, +}; + +/* struct cftype: + * + * The files in the cpuset filesystem mostly have a very simple read/write + * handling, some common function will take care of it. Nevertheless some cases + * (read tasks) are special and therefore I define this structure for every + * kind of file. + * + * + * When reading/writing to a file: + * - the cpuset to use in file->f_dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_dentry->d_fsdata + */ + +struct cftype { + char *name; + int private; + int (*open) (struct inode *inode, struct file *file); + ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos); + int (*write) (struct file *file, const char __user *buf, size_t nbytes, + loff_t *ppos); + int (*release) (struct inode *inode, struct file *file); +}; + +static inline struct cpuset *__d_cs(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cftype *__d_cft(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +/* + * Call with cpuset_sem held. Writes path of cpuset into buf. + * Returns 0 on success, -errno on error. + */ + +static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) +{ + char *start; + + start = buf + buflen; + + *--start = '\0'; + for (;;) { + int len = cs->dentry->d_name.len; + if ((start -= len) < buf) + return -ENAMETOOLONG; + memcpy(start, cs->dentry->d_name.name, len); + cs = cs->parent; + if (!cs) + break; + if (!cs->parent) + continue; + if (--start < buf) + return -ENAMETOOLONG; + *start = '/'; + } + memmove(buf, start, buf + buflen - start); + return 0; +} + +/* + * Notify userspace when a cpuset is released, by running + * /sbin/cpuset_release_agent with the name of the cpuset (path + * relative to the root of cpuset file system) as the argument. + * + * Most likely, this user command will try to rmdir this cpuset. + * + * This races with the possibility that some other task will be + * attached to this cpuset before it is removed, or that some other + * user task will 'mkdir' a child cpuset of this cpuset. That's ok. + * The presumed 'rmdir' will fail quietly if this cpuset is no longer + * unused, and this cpuset will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * Note final arg to call_usermodehelper() is 0 - that means + * don't wait. Since we are holding the global cpuset_sem here, + * and we are asking another thread (started from keventd) to rmdir a + * cpuset, we can't wait - or we'd deadlock with the removing thread + * on cpuset_sem. + */ + +static int cpuset_release_agent(char *cpuset_str) +{ + char *argv[3], *envp[3]; + int i; + + i = 0; + argv[i++] = "/sbin/cpuset_release_agent"; + argv[i++] = cpuset_str; + argv[i] = NULL; + + i = 0; + /* minimal command environment */ + envp[i++] = "HOME=/"; + envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[i] = NULL; + + return call_usermodehelper(argv[0], argv, envp, 0); +} + +/* + * Either cs->count of using tasks transitioned to zero, or the + * cs->children list of child cpusets just became empty. If this + * cs is notify_on_release() and now both the user count is zero and + * the list of children is empty, send notice to user land. + */ + +static void check_for_release(struct cpuset *cs) +{ + if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && + list_empty(&cs->children)) { + char *buf; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return; + if (cpuset_path(cs, buf, PAGE_SIZE) < 0) + goto out; + cpuset_release_agent(buf); +out: + kfree(buf); + } +} + +/* + * Return in *pmask the portion of a cpusets's cpus_allowed that + * are online. If none are online, walk up the cpuset hierarchy + * until we find one that does have some online cpus. If we get + * all the way to the top and still haven't found any online cpus, + * return cpu_online_map. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_map. + * + * One way or another, we guarantee to return some non-empty subset + * of cpu_online_map. + * + * Call with cpuset_sem held. + */ + +static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +{ + while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) + cs = cs->parent; + if (cs) + cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); + else + *pmask = cpu_online_map; + BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); +} + +/* + * Return in *pmask the portion of a cpusets's mems_allowed that + * are online. If none are online, walk up the cpuset hierarchy + * until we find one that does have some online mems. If we get + * all the way to the top and still haven't found any online mems, + * return node_online_map. + * + * One way or another, we guarantee to return some non-empty subset + * of node_online_map. + * + * Call with cpuset_sem held. + */ + +static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +{ + while (cs && !nodes_intersects(cs->mems_allowed, node_online_map)) + cs = cs->parent; + if (cs) + nodes_and(*pmask, cs->mems_allowed, node_online_map); + else + *pmask = node_online_map; + BUG_ON(!nodes_intersects(*pmask, node_online_map)); +} + +/* + * Refresh current tasks mems_allowed and mems_generation from + * current tasks cpuset. Call with cpuset_sem held. + * + * Be sure to call refresh_mems() on any cpuset operation which + * (1) holds cpuset_sem, and (2) might possibly alloc memory. + * Call after obtaining cpuset_sem lock, before any possible + * allocation. Otherwise one risks trying to allocate memory + * while the task cpuset_mems_generation is not the same as + * the mems_generation in its cpuset, which would deadlock on + * cpuset_sem in cpuset_update_current_mems_allowed(). + * + * Since we hold cpuset_sem, once refresh_mems() is called, the + * test (current->cpuset_mems_generation != cs->mems_generation) + * in cpuset_update_current_mems_allowed() will remain false, + * until we drop cpuset_sem. Anyone else who would change our + * cpusets mems_generation needs to lock cpuset_sem first. + */ + +static void refresh_mems(void) +{ + struct cpuset *cs = current->cpuset; + + if (current->cpuset_mems_generation != cs->mems_generation) { + guarantee_online_mems(cs, ¤t->mems_allowed); + current->cpuset_mems_generation = cs->mems_generation; + } +} + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. + */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpus_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/* + * validate_change() - Used to validate that any proposed cpuset change + * follows the structural rules for cpusets. + * + * If we replaced the flag and mask values of the current cpuset + * (cur) with those values in the trial cpuset (trial), would + * our various subset and exclusive rules still be valid? Presumes + * cpuset_sem held. + * + * 'cur' is the address of an actual, in-use cpuset. Operations + * such as list traversal that depend on the actual address of the + * cpuset in the list must use cur below, not trial. + * + * 'trial' is the address of bulk structure copy of cur, with + * perhaps one or more of the fields cpus_allowed, mems_allowed, + * or flags changed to new, trial values. + * + * Return 0 if valid, -errno if not. + */ + +static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +{ + struct cpuset *c, *par; + + /* Each of our child cpusets must be a subset of us */ + list_for_each_entry(c, &cur->children, sibling) { + if (!is_cpuset_subset(c, trial)) + return -EBUSY; + } + + /* Remaining checks don't apply to root cpuset */ + if ((par = cur->parent) == NULL) + return 0; + + /* We must be a subset of our parent cpuset */ + if (!is_cpuset_subset(trial, par)) + return -EACCES; + + /* If either I or some sibling (!= me) is exclusive, we can't overlap */ + list_for_each_entry(c, &par->children, sibling) { + if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && + c != cur && + cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) + return -EINVAL; + if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && + c != cur && + nodes_intersects(trial->mems_allowed, c->mems_allowed)) + return -EINVAL; + } + + return 0; +} + +static int update_cpumask(struct cpuset *cs, char *buf) +{ + struct cpuset trialcs; + int retval; + + trialcs = *cs; + retval = cpulist_parse(buf, trialcs.cpus_allowed); + if (retval < 0) + return retval; + cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); + if (cpus_empty(trialcs.cpus_allowed)) + return -ENOSPC; + retval = validate_change(cs, &trialcs); + if (retval == 0) + cs->cpus_allowed = trialcs.cpus_allowed; + return retval; +} + +static int update_nodemask(struct cpuset *cs, char *buf) +{ + struct cpuset trialcs; + int retval; + + trialcs = *cs; + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + return retval; + nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); + if (nodes_empty(trialcs.mems_allowed)) + return -ENOSPC; + retval = validate_change(cs, &trialcs); + if (retval == 0) { + cs->mems_allowed = trialcs.mems_allowed; + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + } + return retval; +} + +/* + * update_flag - read a 0 or a 1 in a file and update associated flag + * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + * CS_NOTIFY_ON_RELEASE) + * cs: the cpuset to update + * buf: the buffer where we read the 0 or 1 + */ + +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) +{ + int turning_on; + struct cpuset trialcs; + int err; + + turning_on = (simple_strtoul(buf, NULL, 10) != 0); + + trialcs = *cs; + if (turning_on) + set_bit(bit, &trialcs.flags); + else + clear_bit(bit, &trialcs.flags); + + err = validate_change(cs, &trialcs); + if (err == 0) { + if (turning_on) + set_bit(bit, &cs->flags); + else + clear_bit(bit, &cs->flags); + } + return err; +} + +static int attach_task(struct cpuset *cs, char *buf) +{ + pid_t pid; + struct task_struct *tsk; + struct cpuset *oldcs; + cpumask_t cpus; + + if (sscanf(buf, "%d", &pid) != 1) + return -EIO; + if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + return -ENOSPC; + + if (pid) { + read_lock(&tasklist_lock); + + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + if ((current->euid) && (current->euid != tsk->uid) + && (current->euid != tsk->suid)) { + put_task_struct(tsk); + return -EACCES; + } + } else { + tsk = current; + get_task_struct(tsk); + } + + task_lock(tsk); + oldcs = tsk->cpuset; + if (!oldcs) { + task_unlock(tsk); + put_task_struct(tsk); + return -ESRCH; + } + atomic_inc(&cs->count); + tsk->cpuset = cs; + task_unlock(tsk); + + guarantee_online_cpus(cs, &cpus); + set_cpus_allowed(tsk, cpus); + + put_task_struct(tsk); + if (atomic_dec_and_test(&oldcs->count)) + check_for_release(oldcs); + return 0; +} + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_ROOT, + FILE_DIR, + FILE_CPULIST, + FILE_MEMLIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_NOTIFY_ON_RELEASE, + FILE_TASKLIST, +} cpuset_filetype_t; + +static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cftype *cft = __d_cft(file->f_dentry); + cpuset_filetype_t type = cft->private; + char *buffer; + int retval = 0; + + /* Crude upper limit on largest legitimate cpulist user might write. */ + if (nbytes > 100 + 6 * NR_CPUS) + return -E2BIG; + + /* +1 for nul-terminator */ + if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) + return -ENOMEM; + + if (copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out1; + } + buffer[nbytes] = 0; /* nul-terminate */ + + down(&cpuset_sem); + + if (is_removed(cs)) { + retval = -ENODEV; + goto out2; + } + + switch (type) { + case FILE_CPULIST: + retval = update_cpumask(cs, buffer); + break; + case FILE_MEMLIST: + retval = update_nodemask(cs, buffer); + break; + case FILE_CPU_EXCLUSIVE: + retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); + break; + case FILE_MEM_EXCLUSIVE: + retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); + break; + case FILE_NOTIFY_ON_RELEASE: + retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); + break; + case FILE_TASKLIST: + retval = attach_task(cs, buffer); + break; + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + up(&cpuset_sem); +out1: + kfree(buffer); + return retval; +} + +static ssize_t cpuset_file_write(struct file *file, const char __user *buf, + size_t nbytes, loff_t *ppos) +{ + ssize_t retval = 0; + struct cftype *cft = __d_cft(file->f_dentry); + if (!cft) + return -ENODEV; + + /* special function ? */ + if (cft->write) + retval = cft->write(file, buf, nbytes, ppos); + else + retval = cpuset_common_file_write(file, buf, nbytes, ppos); + + return retval; +} + +/* + * These ascii lists should be read in a single call, by using a user + * buffer large enough to hold the entire map. If read in smaller + * chunks, there is no guarantee of atomicity. Since the display format + * used, list of ranges of sequential numbers, is variable length, + * and since these maps can change value dynamically, one could read + * gibberish by doing partial reads while a list was changing. + * A single large read to a buffer that crosses a page boundary is + * ok, because the result being copied to user land is not recomputed + * across a page fault. + */ + +static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +{ + cpumask_t mask; + + down(&cpuset_sem); + mask = cs->cpus_allowed; + up(&cpuset_sem); + + return cpulist_scnprintf(page, PAGE_SIZE, mask); +} + +static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +{ + nodemask_t mask; + + down(&cpuset_sem); + mask = cs->mems_allowed; + up(&cpuset_sem); + + return nodelist_scnprintf(page, PAGE_SIZE, mask); +} + +static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct cftype *cft = __d_cft(file->f_dentry); + struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + cpuset_filetype_t type = cft->private; + char *page; + ssize_t retval = 0; + char *s; + char *start; + size_t n; + + if (!(page = (char *)__get_free_page(GFP_KERNEL))) + return -ENOMEM; + + s = page; + + switch (type) { + case FILE_CPULIST: + s += cpuset_sprintf_cpulist(s, cs); + break; + case FILE_MEMLIST: + s += cpuset_sprintf_memlist(s, cs); + break; + case FILE_CPU_EXCLUSIVE: + *s++ = is_cpu_exclusive(cs) ? '1' : '0'; + break; + case FILE_MEM_EXCLUSIVE: + *s++ = is_mem_exclusive(cs) ? '1' : '0'; + break; + case FILE_NOTIFY_ON_RELEASE: + *s++ = notify_on_release(cs) ? '1' : '0'; + break; + default: + retval = -EINVAL; + goto out; + } + *s++ = '\n'; + *s = '\0'; + + start = page + *ppos; + n = s - start; + retval = n - copy_to_user(buf, start, min(n, nbytes)); + *ppos += retval; +out: + free_page((unsigned long)page); + return retval; +} + +static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + ssize_t retval = 0; + struct cftype *cft = __d_cft(file->f_dentry); + if (!cft) + return -ENODEV; + + /* special function ? */ + if (cft->read) + retval = cft->read(file, buf, nbytes, ppos); + else + retval = cpuset_common_file_read(file, buf, nbytes, ppos); + + return retval; +} + +static int cpuset_file_open(struct inode *inode, struct file *file) +{ + int err; + struct cftype *cft; + + err = generic_file_open(inode, file); + if (err) + return err; + + cft = __d_cft(file->f_dentry); + if (!cft) + return -ENODEV; + if (cft->open) + err = cft->open(inode, file); + else + err = 0; + + return err; +} + +static int cpuset_file_release(struct inode *inode, struct file *file) +{ + struct cftype *cft = __d_cft(file->f_dentry); + if (cft->release) + return cft->release(inode, file); + return 0; +} + +static struct file_operations cpuset_file_operations = { + .read = cpuset_file_read, + .write = cpuset_file_write, + .llseek = generic_file_llseek, + .open = cpuset_file_open, + .release = cpuset_file_release, +}; + +static struct inode_operations cpuset_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = cpuset_mkdir, + .rmdir = cpuset_rmdir, +}; + +static int cpuset_create_file(struct dentry *dentry, int mode) +{ + struct inode *inode; + + if (!dentry) + return -ENOENT; + if (dentry->d_inode) + return -EEXIST; + + inode = cpuset_new_inode(mode); + if (!inode) + return -ENOMEM; + + if (S_ISDIR(mode)) { + inode->i_op = &cpuset_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* start off with i_nlink == 2 (for "." entry) */ + inode->i_nlink++; + } else if (S_ISREG(mode)) { + inode->i_size = 0; + inode->i_fop = &cpuset_file_operations; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + return 0; +} + +/* + * cpuset_create_dir - create a directory for an object. + * cs: the cpuset we create the directory for. + * It must have a valid ->parent field + * And we are going to fill its ->dentry field. + * name: The name to give to the cpuset directory. Will be copied. + * mode: mode to set on new directory. + */ + +static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) +{ + struct dentry *dentry = NULL; + struct dentry *parent; + int error = 0; + + parent = cs->parent->dentry; + dentry = cpuset_get_dentry(parent, name); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + error = cpuset_create_file(dentry, S_IFDIR | mode); + if (!error) { + dentry->d_fsdata = cs; + parent->d_inode->i_nlink++; + cs->dentry = dentry; + } + dput(dentry); + + return error; +} + +static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) +{ + struct dentry *dentry; + int error; + + down(&dir->d_inode->i_sem); + dentry = cpuset_get_dentry(dir, cft->name); + if (!IS_ERR(dentry)) { + error = cpuset_create_file(dentry, 0644 | S_IFREG); + if (!error) + dentry->d_fsdata = (void *)cft; + dput(dentry); + } else + error = PTR_ERR(dentry); + up(&dir->d_inode->i_sem); + return error; +} + +/* + * Stuff for reading the 'tasks' file. + * + * Reading this file can return large amounts of data if a cpuset has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + * Upon tasks file open(), a struct ctr_struct is allocated, that + * will have a pointer to an array (also allocated here). The struct + * ctr_struct * is stored in file->private_data. Its resources will + * be freed by release() when the file is closed. The array is used + * to sprintf the PIDs and then used by read(). + */ + +/* cpusets_tasks_read array */ + +struct ctr_struct { + char *buf; + int bufsz; +}; + +/* + * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. + * Return actual number of pids loaded. + */ +static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) +{ + int n = 0; + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + + do_each_thread(g, p) { + if (p->cpuset == cs) { + pidarray[n++] = p->pid; + if (unlikely(n == npids)) + goto array_full; + } + } while_each_thread(g, p); + +array_full: + read_unlock(&tasklist_lock); + return n; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * Convert array 'a' of 'npids' pid_t's to a string of newline separated + * decimal pids in 'buf'. Don't write more than 'sz' chars, but return + * count 'cnt' of how many chars would be written if buf were large enough. + */ +static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) +{ + int cnt = 0; + int i; + + for (i = 0; i < npids; i++) + cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); + return cnt; +} + +static int cpuset_tasks_open(struct inode *unused, struct file *file) +{ + struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct ctr_struct *ctr; + pid_t *pidarray; + int npids; + char c; + + if (!(file->f_mode & FMODE_READ)) + return 0; + + ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); + if (!ctr) + goto err0; + + /* + * If cpuset gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cpuset users didn't + * show up until sometime later on. + */ + npids = atomic_read(&cs->count); + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + goto err1; + + npids = pid_array_load(pidarray, npids, cs); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); + + /* Call pid_array_to_buf() twice, first just to get bufsz */ + ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; + ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); + if (!ctr->buf) + goto err2; + ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); + + kfree(pidarray); + file->private_data = ctr; + return 0; + +err2: + kfree(pidarray); +err1: + kfree(ctr); +err0: + return -ENOMEM; +} + +static ssize_t cpuset_tasks_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct ctr_struct *ctr = file->private_data; + + if (*ppos + nbytes > ctr->bufsz) + nbytes = ctr->bufsz - *ppos; + if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) + return -EFAULT; + *ppos += nbytes; + return nbytes; +} + +static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) +{ + struct ctr_struct *ctr; + + if (file->f_mode & FMODE_READ) { + ctr = file->private_data; + kfree(ctr->buf); + kfree(ctr); + } + return 0; +} + +/* + * for the common functions, 'private' gives the type of file + */ + +static struct cftype cft_tasks = { + .name = "tasks", + .open = cpuset_tasks_open, + .read = cpuset_tasks_read, + .release = cpuset_tasks_release, + .private = FILE_TASKLIST, +}; + +static struct cftype cft_cpus = { + .name = "cpus", + .private = FILE_CPULIST, +}; + +static struct cftype cft_mems = { + .name = "mems", + .private = FILE_MEMLIST, +}; + +static struct cftype cft_cpu_exclusive = { + .name = "cpu_exclusive", + .private = FILE_CPU_EXCLUSIVE, +}; + +static struct cftype cft_mem_exclusive = { + .name = "mem_exclusive", + .private = FILE_MEM_EXCLUSIVE, +}; + +static struct cftype cft_notify_on_release = { + .name = "notify_on_release", + .private = FILE_NOTIFY_ON_RELEASE, +}; + +static int cpuset_populate_dir(struct dentry *cs_dentry) +{ + int err; + + if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) + return err; + return 0; +} + +/* + * cpuset_create - create a cpuset + * parent: cpuset that will be parent of the new cpuset. + * name: name of the new cpuset. Will be strcpy'ed. + * mode: mode to set on new inode + * + * Must be called with the semaphore on the parent inode held + */ + +static long cpuset_create(struct cpuset *parent, const char *name, int mode) +{ + struct cpuset *cs; + int err; + + cs = kmalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return -ENOMEM; + + down(&cpuset_sem); + refresh_mems(); + cs->flags = 0; + if (notify_on_release(parent)) + set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); + cs->cpus_allowed = CPU_MASK_NONE; + cs->mems_allowed = NODE_MASK_NONE; + atomic_set(&cs->count, 0); + INIT_LIST_HEAD(&cs->sibling); + INIT_LIST_HEAD(&cs->children); + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + + cs->parent = parent; + + list_add(&cs->sibling, &cs->parent->children); + + err = cpuset_create_dir(cs, name, mode); + if (err < 0) + goto err; + + /* + * Release cpuset_sem before cpuset_populate_dir() because it + * will down() this new directory's i_sem and if we race with + * another mkdir, we might deadlock. + */ + up(&cpuset_sem); + + err = cpuset_populate_dir(cs->dentry); + /* If err < 0, we have a half-filled directory - oh well ;) */ + return 0; +err: + list_del(&cs->sibling); + up(&cpuset_sem); + kfree(cs); + return err; +} + +static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct cpuset *c_parent = dentry->d_parent->d_fsdata; + + /* the vfs holds inode->i_sem already */ + return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); +} + +static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + struct cpuset *cs = dentry->d_fsdata; + struct dentry *d; + struct cpuset *parent; + + /* the vfs holds both inode->i_sem already */ + + down(&cpuset_sem); + refresh_mems(); + if (atomic_read(&cs->count) > 0) { + up(&cpuset_sem); + return -EBUSY; + } + if (!list_empty(&cs->children)) { + up(&cpuset_sem); + return -EBUSY; + } + spin_lock(&cs->dentry->d_lock); + parent = cs->parent; + set_bit(CS_REMOVED, &cs->flags); + list_del(&cs->sibling); /* delete my sibling from parent->children */ + if (list_empty(&parent->children)) + check_for_release(parent); + d = dget(cs->dentry); + cs->dentry = NULL; + spin_unlock(&d->d_lock); + cpuset_d_remove_dir(d); + dput(d); + up(&cpuset_sem); + return 0; +} + +/** + * cpuset_init - initialize cpusets at system boot + * + * Description: Initialize top_cpuset and the cpuset internal file system, + **/ + +int __init cpuset_init(void) +{ + struct dentry *root; + int err; + + top_cpuset.cpus_allowed = CPU_MASK_ALL; + top_cpuset.mems_allowed = NODE_MASK_ALL; + + atomic_inc(&cpuset_mems_generation); + top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); + + init_task.cpuset = &top_cpuset; + + err = register_filesystem(&cpuset_fs_type); + if (err < 0) + goto out; + cpuset_mount = kern_mount(&cpuset_fs_type); + if (IS_ERR(cpuset_mount)) { + printk(KERN_ERR "cpuset: could not mount!\n"); + err = PTR_ERR(cpuset_mount); + cpuset_mount = NULL; + goto out; + } + root = cpuset_mount->mnt_sb->s_root; + root->d_fsdata = &top_cpuset; + root->d_inode->i_nlink++; + top_cpuset.dentry = root; + root->d_inode->i_op = &cpuset_dir_inode_operations; + err = cpuset_populate_dir(root); +out: + return err; +} + +/** + * cpuset_init_smp - initialize cpus_allowed + * + * Description: Finish top cpuset after cpu, node maps are initialized + **/ + +void __init cpuset_init_smp(void) +{ + top_cpuset.cpus_allowed = cpu_online_map; + top_cpuset.mems_allowed = node_online_map; +} + +/** + * cpuset_fork - attach newly forked task to its parents cpuset. + * @p: pointer to task_struct of forking parent process. + * + * Description: By default, on fork, a task inherits its + * parents cpuset. The pointer to the shared cpuset is + * automatically copied in fork.c by dup_task_struct(). + * This cpuset_fork() routine need only increment the usage + * counter in that cpuset. + **/ + +void cpuset_fork(struct task_struct *tsk) +{ + atomic_inc(&tsk->cpuset->count); +} + +/** + * cpuset_exit - detach cpuset from exiting task + * @tsk: pointer to task_struct of exiting process + * + * Description: Detach cpuset from @tsk and release it. + * + **/ + +void cpuset_exit(struct task_struct *tsk) +{ + struct cpuset *cs; + + task_lock(tsk); + cs = tsk->cpuset; + tsk->cpuset = NULL; + task_unlock(tsk); + + if (atomic_dec_and_test(&cs->count)) { + down(&cpuset_sem); + check_for_release(cs); + up(&cpuset_sem); + } +} + +/** + * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * + * Description: Returns the cpumask_t cpus_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of cpu_online_map, even if this means going outside the + * tasks cpuset. + **/ + +const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) +{ + cpumask_t mask; + + down(&cpuset_sem); + task_lock((struct task_struct *)tsk); + guarantee_online_cpus(tsk->cpuset, &mask); + task_unlock((struct task_struct *)tsk); + up(&cpuset_sem); + + return mask; +} + +void cpuset_init_current_mems_allowed(void) +{ + current->mems_allowed = NODE_MASK_ALL; +} + +/* + * If the current tasks cpusets mems_allowed changed behind our backs, + * update current->mems_allowed and mems_generation to the new value. + * Do not call this routine if in_interrupt(). + */ + +void cpuset_update_current_mems_allowed(void) +{ + struct cpuset *cs = current->cpuset; + + if (!cs) + return; /* task is exiting */ + if (current->cpuset_mems_generation != cs->mems_generation) { + down(&cpuset_sem); + refresh_mems(); + up(&cpuset_sem); + } +} + +void cpuset_restrict_to_mems_allowed(unsigned long *nodes) +{ + bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), + MAX_NUMNODES); +} + +/* + * Are any of the nodes on zonelist zl allowed in current->mems_allowed? + */ +int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +{ + int i; + + for (i = 0; zl->zones[i]; i++) { + int nid = zl->zones[i]->zone_pgdat->node_id; + + if (node_isset(nid, current->mems_allowed)) + return 1; + } + return 0; +} + +/* + * Is 'current' valid, and is zone z allowed in current->mems_allowed? + */ +int cpuset_zone_allowed(struct zone *z) +{ + return in_interrupt() || + node_isset(z->zone_pgdat->node_id, current->mems_allowed); +} + +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset. + */ + +static int proc_cpuset_show(struct seq_file *m, void *v) +{ + struct cpuset *cs; + struct task_struct *tsk; + char *buf; + int retval = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + tsk = m->private; + down(&cpuset_sem); + task_lock(tsk); + cs = tsk->cpuset; + task_unlock(tsk); + if (!cs) { + retval = -EINVAL; + goto out; + } + + retval = cpuset_path(cs, buf, PAGE_SIZE); + if (retval < 0) + goto out; + seq_puts(m, buf); + seq_putc(m, '\n'); +out: + up(&cpuset_sem); + kfree(buf); + return retval; +} + +static int cpuset_open(struct inode *inode, struct file *file) +{ + struct task_struct *tsk = PROC_I(inode)->task; + return single_open(file, proc_cpuset_show, tsk); +} + +struct file_operations proc_cpuset_operations = { + .open = cpuset_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ +char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) +{ + buffer += sprintf(buffer, "Cpus_allowed:\t"); + buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed); + buffer += sprintf(buffer, "\n"); + buffer += sprintf(buffer, "Mems_allowed:\t"); + buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed); + buffer += sprintf(buffer, "\n"); + return buffer; +} diff --git a/kernel/dma.c b/kernel/dma.c new file mode 100644 index 00000000000..aef0a45b789 --- /dev/null +++ b/kernel/dma.c @@ -0,0 +1,158 @@ +/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ + * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. + * + * Written by Hennus Bergman, 1992. + * + * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma. + * In the previous version the reported device could end up being wrong, + * if a device requested a DMA channel that was already in use. + * [It also happened to remove the sizeof(char *) == sizeof(int) + * assumption introduced because of those /proc/dma patches. -- Hennus] + */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <asm/dma.h> +#include <asm/system.h> + + + +/* A note on resource allocation: + * + * All drivers needing DMA channels, should allocate and release them + * through the public routines `request_dma()' and `free_dma()'. + * + * In order to avoid problems, all processes should allocate resources in + * the same sequence and release them in the reverse order. + * + * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA. + * When releasing them, first release the DMA, then release the IRQ. + * If you don't, you may cause allocation requests to fail unnecessarily. + * This doesn't really matter now, but it will once we get real semaphores + * in the kernel. + */ + + +DEFINE_SPINLOCK(dma_spin_lock); + +/* + * If our port doesn't define this it has no PC like DMA + */ + +#ifdef MAX_DMA_CHANNELS + + +/* Channel n is busy iff dma_chan_busy[n].lock != 0. + * DMA0 used to be reserved for DRAM refresh, but apparently not any more... + * DMA4 is reserved for cascading. + */ + +struct dma_chan { + int lock; + const char *device_id; +}; + +static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { + [4] = { 1, "cascade" }, +}; + + +int request_dma(unsigned int dmanr, const char * device_id) +{ + if (dmanr >= MAX_DMA_CHANNELS) + return -EINVAL; + + if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) + return -EBUSY; + + dma_chan_busy[dmanr].device_id = device_id; + + /* old flag was 0, now contains 1 to indicate busy */ + return 0; +} /* request_dma */ + + +void free_dma(unsigned int dmanr) +{ + if (dmanr >= MAX_DMA_CHANNELS) { + printk(KERN_WARNING "Trying to free DMA%d\n", dmanr); + return; + } + + if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { + printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); + return; + } + +} /* free_dma */ + +#else + +int request_dma(unsigned int dmanr, const char *device_id) +{ + return -EINVAL; +} + +void free_dma(unsigned int dmanr) +{ +} + +#endif + +#ifdef CONFIG_PROC_FS + +#ifdef MAX_DMA_CHANNELS +static int proc_dma_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { + if (dma_chan_busy[i].lock) { + seq_printf(m, "%2d: %s\n", i, + dma_chan_busy[i].device_id); + } + } + return 0; +} +#else +static int proc_dma_show(struct seq_file *m, void *v) +{ + seq_puts(m, "No DMA\n"); + return 0; +} +#endif /* MAX_DMA_CHANNELS */ + +static int proc_dma_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_dma_show, NULL); +} + +static struct file_operations proc_dma_operations = { + .open = proc_dma_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_dma_init(void) +{ + struct proc_dir_entry *e; + + e = create_proc_entry("dma", 0, NULL); + if (e) + e->proc_fops = &proc_dma_operations; + + return 0; +} + +__initcall(proc_dma_init); +#endif + +EXPORT_SYMBOL(request_dma); +EXPORT_SYMBOL(free_dma); +EXPORT_SYMBOL(dma_spin_lock); diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c new file mode 100644 index 00000000000..867d6dbeb57 --- /dev/null +++ b/kernel/exec_domain.c @@ -0,0 +1,209 @@ +/* + * Handling of different ABIs (personalities). + * + * We group personalities into execution domains which have their + * own handlers for kernel entry points, signal mapping, etc... + * + * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) + */ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/personality.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/sysctl.h> +#include <linux/types.h> + + +static void default_handler(int, struct pt_regs *); + +static struct exec_domain *exec_domains = &default_exec_domain; +static DEFINE_RWLOCK(exec_domains_lock); + + +static u_long ident_map[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 +}; + +struct exec_domain default_exec_domain = { + .name = "Linux", /* name */ + .handler = default_handler, /* lcall7 causes a seg fault. */ + .pers_low = 0, /* PER_LINUX personality. */ + .pers_high = 0, /* PER_LINUX personality. */ + .signal_map = ident_map, /* Identity map signals. */ + .signal_invmap = ident_map, /* - both ways. */ +}; + + +static void +default_handler(int segment, struct pt_regs *regp) +{ + set_personality(0); + + if (current_thread_info()->exec_domain->handler != default_handler) + current_thread_info()->exec_domain->handler(segment, regp); + else + send_sig(SIGSEGV, current, 1); +} + +static struct exec_domain * +lookup_exec_domain(u_long personality) +{ + struct exec_domain * ep; + u_long pers = personality(personality); + + read_lock(&exec_domains_lock); + for (ep = exec_domains; ep; ep = ep->next) { + if (pers >= ep->pers_low && pers <= ep->pers_high) + if (try_module_get(ep->module)) + goto out; + } + +#ifdef CONFIG_KMOD + read_unlock(&exec_domains_lock); + request_module("personality-%ld", pers); + read_lock(&exec_domains_lock); + + for (ep = exec_domains; ep; ep = ep->next) { + if (pers >= ep->pers_low && pers <= ep->pers_high) + if (try_module_get(ep->module)) + goto out; + } +#endif + + ep = &default_exec_domain; +out: + read_unlock(&exec_domains_lock); + return (ep); +} + +int +register_exec_domain(struct exec_domain *ep) +{ + struct exec_domain *tmp; + int err = -EBUSY; + + if (ep == NULL) + return -EINVAL; + + if (ep->next != NULL) + return -EBUSY; + + write_lock(&exec_domains_lock); + for (tmp = exec_domains; tmp; tmp = tmp->next) { + if (tmp == ep) + goto out; + } + + ep->next = exec_domains; + exec_domains = ep; + err = 0; + +out: + write_unlock(&exec_domains_lock); + return (err); +} + +int +unregister_exec_domain(struct exec_domain *ep) +{ + struct exec_domain **epp; + + epp = &exec_domains; + write_lock(&exec_domains_lock); + for (epp = &exec_domains; *epp; epp = &(*epp)->next) { + if (ep == *epp) + goto unregister; + } + write_unlock(&exec_domains_lock); + return -EINVAL; + +unregister: + *epp = ep->next; + ep->next = NULL; + write_unlock(&exec_domains_lock); + return 0; +} + +int +__set_personality(u_long personality) +{ + struct exec_domain *ep, *oep; + + ep = lookup_exec_domain(personality); + if (ep == current_thread_info()->exec_domain) { + current->personality = personality; + return 0; + } + + if (atomic_read(¤t->fs->count) != 1) { + struct fs_struct *fsp, *ofsp; + + fsp = copy_fs_struct(current->fs); + if (fsp == NULL) { + module_put(ep->module); + return -ENOMEM; + } + + task_lock(current); + ofsp = current->fs; + current->fs = fsp; + task_unlock(current); + + put_fs_struct(ofsp); + } + + /* + * At that point we are guaranteed to be the sole owner of + * current->fs. + */ + + current->personality = personality; + oep = current_thread_info()->exec_domain; + current_thread_info()->exec_domain = ep; + set_fs_altroot(); + + module_put(oep->module); + return 0; +} + +int +get_exec_domain_list(char *page) +{ + struct exec_domain *ep; + int len = 0; + + read_lock(&exec_domains_lock); + for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) + len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", + ep->pers_low, ep->pers_high, ep->name, + module_name(ep->module)); + read_unlock(&exec_domains_lock); + return (len); +} + +asmlinkage long +sys_personality(u_long personality) +{ + u_long old = current->personality; + + if (personality != 0xffffffff) { + set_personality(personality); + if (current->personality != personality) + return -EINVAL; + } + + return (long)old; +} + + +EXPORT_SYMBOL(register_exec_domain); +EXPORT_SYMBOL(unregister_exec_domain); +EXPORT_SYMBOL(__set_personality); diff --git a/kernel/exit.c b/kernel/exit.c new file mode 100644 index 00000000000..6dd4ebe1dd9 --- /dev/null +++ b/kernel/exit.c @@ -0,0 +1,1527 @@ +/* + * linux/kernel/exit.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/smp_lock.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/personality.h> +#include <linux/tty.h> +#include <linux/namespace.h> +#include <linux/key.h> +#include <linux/security.h> +#include <linux/cpu.h> +#include <linux/acct.h> +#include <linux/file.h> +#include <linux/binfmts.h> +#include <linux/ptrace.h> +#include <linux/profile.h> +#include <linux/mount.h> +#include <linux/proc_fs.h> +#include <linux/mempolicy.h> +#include <linux/cpuset.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/pgtable.h> +#include <asm/mmu_context.h> + +extern void sem_exit (void); +extern struct task_struct *child_reaper; + +int getrusage(struct task_struct *, int, struct rusage __user *); + +static void __unhash_process(struct task_struct *p) +{ + nr_threads--; + detach_pid(p, PIDTYPE_PID); + detach_pid(p, PIDTYPE_TGID); + if (thread_group_leader(p)) { + detach_pid(p, PIDTYPE_PGID); + detach_pid(p, PIDTYPE_SID); + if (p->pid) + __get_cpu_var(process_counts)--; + } + + REMOVE_LINKS(p); +} + +void release_task(struct task_struct * p) +{ + int zap_leader; + task_t *leader; + struct dentry *proc_dentry; + +repeat: + atomic_dec(&p->user->processes); + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) + __ptrace_unlink(p); + BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + __exit_signal(p); + __exit_sighand(p); + __unhash_process(p); + + /* + * If we are the last non-leader member of the thread + * group, and the leader is zombie, then notify the + * group leader's parent process. (if it wants notification.) + */ + zap_leader = 0; + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + BUG_ON(leader->exit_signal == -1); + do_notify_parent(leader, leader->exit_signal); + /* + * If we were the last child thread and the leader has + * exited already, and the leader's parent ignores SIGCHLD, + * then we are the one who should release the leader. + * + * do_notify_parent() will have marked it self-reaping in + * that case. + */ + zap_leader = (leader->exit_signal == -1); + } + + sched_exit(p); + write_unlock_irq(&tasklist_lock); + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); + release_thread(p); + put_task_struct(p); + + p = leader; + if (unlikely(zap_leader)) + goto repeat; +} + +/* we are using it only for SMP init */ + +void unhash_process(struct task_struct *p) +{ + struct dentry *proc_dentry; + + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); + write_lock_irq(&tasklist_lock); + __unhash_process(p); + write_unlock_irq(&tasklist_lock); + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); +} + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + */ +int session_of_pgrp(int pgrp) +{ + struct task_struct *p; + int sid = -1; + + read_lock(&tasklist_lock); + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + if (p->signal->session > 0) { + sid = p->signal->session; + goto out; + } + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + p = find_task_by_pid(pgrp); + if (p) + sid = p->signal->session; +out: + read_unlock(&tasklist_lock); + + return sid; +} + +/* + * Determine if a process group is "orphaned", according to the POSIX + * definition in 2.2.2.52. Orphaned process groups are not to be affected + * by terminal-generated stop signals. Newly orphaned process groups are + * to receive a SIGHUP and a SIGCONT. + * + * "I ask you, have you ever known what it is to be an orphan?" + */ +static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) +{ + struct task_struct *p; + int ret = 1; + + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + if (p == ignored_task + || p->exit_state + || p->real_parent->pid == 1) + continue; + if (process_group(p->real_parent) != pgrp + && p->real_parent->signal->session == p->signal->session) { + ret = 0; + break; + } + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + return ret; /* (sighing) "Often!" */ +} + +int is_orphaned_pgrp(int pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(pgrp, NULL); + read_unlock(&tasklist_lock); + + return retval; +} + +static inline int has_stopped_jobs(int pgrp) +{ + int retval = 0; + struct task_struct *p; + + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + if (p->state != TASK_STOPPED) + continue; + + /* If p is stopped by a debugger on a signal that won't + stop it, then don't count p as stopped. This isn't + perfect but it's a good approximation. */ + if (unlikely (p->ptrace) + && p->exit_code != SIGSTOP + && p->exit_code != SIGTSTP + && p->exit_code != SIGTTOU + && p->exit_code != SIGTTIN) + continue; + + retval = 1; + break; + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + return retval; +} + +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. + */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + ptrace_unlink(current); + /* Reparent to init */ + REMOVE_LINKS(current); + current->parent = child_reaper; + current->real_parent = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + security_task_reparent_to_init(current); + memcpy(current->signal->rlim, init_task.signal->rlim, + sizeof(current->signal->rlim)); + atomic_inc(&(INIT_USER->__count)); + write_unlock_irq(&tasklist_lock); + switch_uid(INIT_USER); +} + +void __set_special_pids(pid_t session, pid_t pgrp) +{ + struct task_struct *curr = current; + + if (curr->signal->session != session) { + detach_pid(curr, PIDTYPE_SID); + curr->signal->session = session; + attach_pid(curr, PIDTYPE_SID, session); + } + if (process_group(curr) != pgrp) { + detach_pid(curr, PIDTYPE_PGID); + curr->signal->pgrp = pgrp; + attach_pid(curr, PIDTYPE_PGID, pgrp); + } +} + +void set_special_pids(pid_t session, pid_t pgrp) +{ + write_lock_irq(&tasklist_lock); + __set_special_pids(session, pgrp); + write_unlock_irq(&tasklist_lock); +} + +/* + * Let kernel threads use this to say that they + * allow a certain signal (since daemonize() will + * have disabled all of them by default). + */ +int allow_signal(int sig) +{ + if (sig < 1 || sig > _NSIG) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + sigdelset(¤t->blocked, sig); + if (!current->mm) { + /* Kernel threads handle their own signals. + Let the signal code know it'll be handled, so + that they don't get converted to SIGKILL or + just silently dropped */ + current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; + } + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(allow_signal); + +int disallow_signal(int sig) +{ + if (sig < 1 || sig > _NSIG) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + sigaddset(¤t->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(disallow_signal); + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(const char *name, ...) +{ + va_list args; + struct fs_struct *fs; + sigset_t blocked; + + va_start(args, name); + vsnprintf(current->comm, sizeof(current->comm), name, args); + va_end(args); + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + set_special_pids(1, 1); + down(&tty_sem); + current->signal->tty = NULL; + up(&tty_sem); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); + + reparent_to_init(); +} + +EXPORT_SYMBOL(daemonize); + +static inline void close_files(struct files_struct * files) +{ + int i, j; + + j = 0; + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= files->max_fdset || i >= files->max_fds) + break; + set = files->open_fds->fds_bits[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&files->fd[i], NULL); + if (file) + filp_close(file, files); + } + i++; + set >>= 1; + } + } +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void fastcall put_files_struct(struct files_struct *files) +{ + if (atomic_dec_and_test(&files->count)) { + close_files(files); + /* + * Free the fd and fdset arrays if we expanded them. + */ + if (files->fd != &files->fd_array[0]) + free_fd_array(files->fd, files->max_fds); + if (files->max_fdset > __FD_SETSIZE) { + free_fdset(files->open_fds, files->max_fdset); + free_fdset(files->close_on_exec, files->max_fdset); + } + kmem_cache_free(files_cachep, files); + } +} + +EXPORT_SYMBOL(put_files_struct); + +static inline void __exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + +void exit_files(struct task_struct *tsk) +{ + __exit_files(tsk); +} + +static inline void __put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { + dput(fs->altroot); + mntput(fs->altrootmnt); + } + kmem_cache_free(fs_cachep, fs); + } +} + +void put_fs_struct(struct fs_struct *fs) +{ + __put_fs_struct(fs); +} + +static inline void __exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + __put_fs_struct(fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + __exit_fs(tsk); +} + +EXPORT_SYMBOL_GPL(exit_fs); + +/* + * Turn us into a lazy TLB process if we + * aren't already.. + */ +void exit_mm(struct task_struct * tsk) +{ + struct mm_struct *mm = tsk->mm; + + mm_release(tsk, mm); + if (!mm) + return; + /* + * Serialize with any possible pending coredump. + * We must hold mmap_sem around checking core_waiters + * and clearing tsk->mm. The core-inducing thread + * will increment core_waiters for each thread in the + * group with ->mm != NULL. + */ + down_read(&mm->mmap_sem); + if (mm->core_waiters) { + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + if (!--mm->core_waiters) + complete(mm->core_startup_done); + up_write(&mm->mmap_sem); + + wait_for_completion(&mm->core_done); + down_read(&mm->mmap_sem); + } + atomic_inc(&mm->mm_count); + if (mm != tsk->active_mm) BUG(); + /* more a memory barrier than a real lock */ + task_lock(tsk); + tsk->mm = NULL; + up_read(&mm->mmap_sem); + enter_lazy_tlb(mm, current); + task_unlock(tsk); + mmput(mm); +} + +static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) +{ + /* + * Make sure we're not reparenting to ourselves and that + * the parent is not a zombie. + */ + BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); + p->real_parent = reaper; + if (p->parent == p->real_parent) + BUG(); +} + +static inline void reparent_thread(task_t *p, task_t *father, int traced) +{ + /* We don't want people slaying init. */ + if (p->exit_signal != -1) + p->exit_signal = SIGCHLD; + + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, (void *) 0, p); + + /* Move the child from its dying parent to the new one. */ + if (unlikely(traced)) { + /* Preserve ptrace links if someone else is tracing this child. */ + list_del_init(&p->ptrace_list); + if (p->parent != p->real_parent) + list_add(&p->ptrace_list, &p->real_parent->ptrace_children); + } else { + /* If this child is being traced, then we're the one tracing it + * anyway, so let go of it. + */ + p->ptrace = 0; + list_del_init(&p->sibling); + p->parent = p->real_parent; + list_add_tail(&p->sibling, &p->parent->children); + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. + */ + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && + thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + else if (p->state == TASK_TRACED) { + /* + * If it was at a trace stop, turn it into + * a normal stop since it's no longer being + * traced. + */ + ptrace_untrace(p); + } + } + + /* + * process group orphan check + * Case ii: Our child is in a different pgrp + * than we are, and it was the only connection + * outside, so the child pgrp is now orphaned. + */ + if ((process_group(p) != process_group(father)) && + (p->signal->session == father->signal->session)) { + int pgrp = process_group(p); + + if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + __kill_pg_info(SIGHUP, (void *)1, pgrp); + __kill_pg_info(SIGCONT, (void *)1, pgrp); + } + } +} + +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the global child reaper process (ie "init") + */ +static inline void forget_original_parent(struct task_struct * father, + struct list_head *to_release) +{ + struct task_struct *p, *reaper = father; + struct list_head *_p, *_n; + + do { + reaper = next_thread(reaper); + if (reaper == father) { + reaper = child_reaper; + break; + } + } while (reaper->exit_state); + + /* + * There are only two places where our children can be: + * + * - in our child list + * - in our ptraced child list + * + * Search them and reparent children. + */ + list_for_each_safe(_p, _n, &father->children) { + int ptrace; + p = list_entry(_p,struct task_struct,sibling); + + ptrace = p->ptrace; + + /* if father isn't the real parent, then ptrace must be enabled */ + BUG_ON(father != p->real_parent && !ptrace); + + if (father == p->real_parent) { + /* reparent with a reaper, real father it's us */ + choose_new_parent(p, reaper, child_reaper); + reparent_thread(p, father, 0); + } else { + /* reparent ptraced task to its real parent */ + __ptrace_unlink (p); + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && + thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + } + + /* + * if the ptraced child is a zombie with exit_signal == -1 + * we must collect it before we exit, or it will remain + * zombie forever since we prevented it from self-reap itself + * while it was being traced by us, to be able to see it in wait4. + */ + if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) + list_add(&p->ptrace_list, to_release); + } + list_for_each_safe(_p, _n, &father->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); + choose_new_parent(p, reaper, child_reaper); + reparent_thread(p, father, 1); + } +} + +/* + * Send signals to all our closest relatives so that they know + * to properly mourn us.. + */ +static void exit_notify(struct task_struct *tsk) +{ + int state; + struct task_struct *t; + struct list_head ptrace_dead, *_p, *_n; + + if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) + && !thread_group_empty(tsk)) { + /* + * This occurs when there was a race between our exit + * syscall and a group signal choosing us as the one to + * wake up. It could be that we are the only thread + * alerted to check for pending signals, but another thread + * should be woken now to take the signal since we will not. + * Now we'll wake all the threads in the group just to make + * sure someone gets all the pending signals. + */ + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + for (t = next_thread(tsk); t != tsk; t = next_thread(t)) + if (!signal_pending(t) && !(t->flags & PF_EXITING)) { + recalc_sigpending_tsk(t); + if (signal_pending(t)) + signal_wake_up(t, 0); + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + } + + write_lock_irq(&tasklist_lock); + + /* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ + + INIT_LIST_HEAD(&ptrace_dead); + forget_original_parent(tsk, &ptrace_dead); + BUG_ON(!list_empty(&tsk->children)); + BUG_ON(!list_empty(&tsk->ptrace_children)); + + /* + * Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * + * Case i: Our father is in a different pgrp than we are + * and we were the only connection outside, so our pgrp + * is about to become orphaned. + */ + + t = tsk->real_parent; + + if ((process_group(t) != process_group(tsk)) && + (t->signal->session == tsk->signal->session) && + will_become_orphaned_pgrp(process_group(tsk), tsk) && + has_stopped_jobs(process_group(tsk))) { + __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); + __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + } + + /* Let father know we died + * + * Thread signals are configurable, but you aren't going to use + * that to send signals to arbitary processes. + * That stops right now. + * + * If the parent exec id doesn't match the exec id we saved + * when we started then we know the parent has changed security + * domain. + * + * If our self_exec id doesn't match our parent_exec_id then + * we have changed execution domain as these two values started + * the same after a fork. + * + */ + + if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && + ( tsk->parent_exec_id != t->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) + && !capable(CAP_KILL)) + tsk->exit_signal = SIGCHLD; + + + /* If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. exit_signal + * only has special meaning to our real parent. + */ + if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { + int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; + do_notify_parent(tsk, signal); + } else if (tsk->ptrace) { + do_notify_parent(tsk, SIGCHLD); + } + + state = EXIT_ZOMBIE; + if (tsk->exit_signal == -1 && + (likely(tsk->ptrace == 0) || + unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) + state = EXIT_DEAD; + tsk->exit_state = state; + + write_unlock_irq(&tasklist_lock); + + list_for_each_safe(_p, _n, &ptrace_dead) { + list_del_init(_p); + t = list_entry(_p,struct task_struct,ptrace_list); + release_task(t); + } + + /* If the process is dead, release it - nobody will wait for it */ + if (state == EXIT_DEAD) + release_task(tsk); + + /* PF_DEAD causes final put_task_struct after we schedule. */ + preempt_disable(); + tsk->flags |= PF_DEAD; +} + +fastcall NORET_TYPE void do_exit(long code) +{ + struct task_struct *tsk = current; + int group_dead; + + profile_task_exit(tsk); + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + if (unlikely(tsk->pid == 1)) + panic("Attempted to kill init!"); + if (tsk->io_context) + exit_io_context(); + + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { + current->ptrace_message = code; + ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); + } + + tsk->flags |= PF_EXITING; + + /* + * Make sure we don't try to process any timer firings + * while we are already exiting. + */ + tsk->it_virt_expires = cputime_zero; + tsk->it_prof_expires = cputime_zero; + tsk->it_sched_expires = 0; + + if (unlikely(in_atomic())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", + current->comm, current->pid, + preempt_count()); + + acct_update_integrals(tsk); + update_mem_hiwater(tsk); + group_dead = atomic_dec_and_test(&tsk->signal->live); + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); + acct_process(code); + } + exit_mm(tsk); + + exit_sem(tsk); + __exit_files(tsk); + __exit_fs(tsk); + exit_namespace(tsk); + exit_thread(); + cpuset_exit(tsk); + exit_keys(tsk); + + if (group_dead && tsk->signal->leader) + disassociate_ctty(1); + + module_put(tsk->thread_info->exec_domain->module); + if (tsk->binfmt) + module_put(tsk->binfmt->module); + + tsk->exit_code = code; + exit_notify(tsk); +#ifdef CONFIG_NUMA + mpol_free(tsk->mempolicy); + tsk->mempolicy = NULL; +#endif + + BUG_ON(!(current->flags & PF_DEAD)); + schedule(); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) ; +} + +NORET_TYPE void complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + do_exit(code); +} + +EXPORT_SYMBOL(complete_and_exit); + +asmlinkage long sys_exit(int error_code) +{ + do_exit((error_code&0xff)<<8); +} + +task_t fastcall *next_thread(const task_t *p) +{ + return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); +} + +EXPORT_SYMBOL(next_thread); + +/* + * Take down every thread in the group. This is called by fatal signals + * as well as by sys_exit_group (below). + */ +NORET_TYPE void +do_group_exit(int exit_code) +{ + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ + + if (current->signal->flags & SIGNAL_GROUP_EXIT) + exit_code = current->signal->group_exit_code; + else if (!thread_group_empty(current)) { + struct signal_struct *const sig = current->signal; + struct sighand_struct *const sighand = current->sighand; + read_lock(&tasklist_lock); + spin_lock_irq(&sighand->siglock); + if (sig->flags & SIGNAL_GROUP_EXIT) + /* Another thread got here before we took the lock. */ + exit_code = sig->group_exit_code; + else { + sig->flags = SIGNAL_GROUP_EXIT; + sig->group_exit_code = exit_code; + zap_other_threads(current); + } + spin_unlock_irq(&sighand->siglock); + read_unlock(&tasklist_lock); + } + + do_exit(exit_code); + /* NOTREACHED */ +} + +/* + * this kills every thread in the thread group. Note that any externally + * wait4()-ing process will get the correct exit code - even if this + * thread is not the thread group leader. + */ +asmlinkage void sys_exit_group(int error_code) +{ + do_group_exit((error_code & 0xff) << 8); +} + +static int eligible_child(pid_t pid, int options, task_t *p) +{ + if (pid > 0) { + if (p->pid != pid) + return 0; + } else if (!pid) { + if (process_group(p) != process_group(current)) + return 0; + } else if (pid != -1) { + if (process_group(p) != -pid) + return 0; + } + + /* + * Do not consider detached threads that are + * not ptraced: + */ + if (p->exit_signal == -1 && !p->ptrace) + return 0; + + /* Wait for all children (clone and not) if __WALL is set; + * otherwise, wait for clone children *only* if __WCLONE is + * set; otherwise, wait for non-clone children *only*. (Note: + * A "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD.) */ + if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) + && !(options & __WALL)) + return 0; + /* + * Do not consider thread group leaders that are + * in a non-empty thread group: + */ + if (current->tgid != p->tgid && delay_group_leader(p)) + return 2; + + if (security_task_wait(p)) + return 0; + + return 1; +} + +static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, + int why, int status, + struct siginfo __user *infop, + struct rusage __user *rusagep) +{ + int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; + put_task_struct(p); + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + if (!retval) + retval = pid; + return retval; +} + +/* + * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_zombie(task_t *p, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + unsigned long state; + int retval; + int status; + + if (unlikely(noreap)) { + pid_t pid = p->pid; + uid_t uid = p->uid; + int exit_code = p->exit_code; + int why, status; + + if (unlikely(p->exit_state != EXIT_ZOMBIE)) + return 0; + if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) + return 0; + get_task_struct(p); + read_unlock(&tasklist_lock); + if ((exit_code & 0x7f) == 0) { + why = CLD_EXITED; + status = exit_code >> 8; + } else { + why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + status = exit_code & 0x7f; + } + return wait_noreap_copyout(p, pid, uid, why, + status, infop, ru); + } + + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->exit_state, EXIT_DEAD); + if (state != EXIT_ZOMBIE) { + BUG_ON(state != EXIT_DEAD); + return 0; + } + if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) { + /* + * This can only happen in a race with a ptraced thread + * dying on another processor. + */ + return 0; + } + + if (likely(p->real_parent == p->parent) && likely(p->signal)) { + /* + * The resource counters for the group leader are in its + * own task_struct. Those for dead threads in the group + * are in its signal_struct, as are those for the child + * processes it has previously reaped. All these + * accumulate in the parent's signal_struct c* fields. + * + * We don't bother to take a lock here to protect these + * p->signal fields, because they are only touched by + * __exit_signal, which runs with tasklist_lock + * write-locked anyway, and so is excluded here. We do + * need to protect the access to p->parent->signal fields, + * as other threads in the parent group can be right + * here reaping other children at the same time. + */ + spin_lock_irq(&p->parent->sighand->siglock); + p->parent->signal->cutime = + cputime_add(p->parent->signal->cutime, + cputime_add(p->utime, + cputime_add(p->signal->utime, + p->signal->cutime))); + p->parent->signal->cstime = + cputime_add(p->parent->signal->cstime, + cputime_add(p->stime, + cputime_add(p->signal->stime, + p->signal->cstime))); + p->parent->signal->cmin_flt += + p->min_flt + p->signal->min_flt + p->signal->cmin_flt; + p->parent->signal->cmaj_flt += + p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; + p->parent->signal->cnvcsw += + p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; + p->parent->signal->cnivcsw += + p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; + spin_unlock_irq(&p->parent->sighand->siglock); + } + + /* + * Now we are sure this task is interesting, and no other + * thread can reap it because we set its state to EXIT_DEAD. + */ + read_unlock(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; + if (!retval && stat_addr) + retval = put_user(status, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) { + int why; + + if ((status & 0x7f) == 0) { + why = CLD_EXITED; + status >>= 8; + } else { + why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + status &= 0x7f; + } + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(status, &infop->si_status); + } + if (!retval && infop) + retval = put_user(p->pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(p->uid, &infop->si_uid); + if (retval) { + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; + return retval; + } + retval = p->pid; + if (p->real_parent != p->parent) { + write_lock_irq(&tasklist_lock); + /* Double-check with lock held. */ + if (p->real_parent != p->parent) { + __ptrace_unlink(p); + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; + /* + * If this is not a detached task, notify the parent. + * If it's still not detached after that, don't release + * it now. + */ + if (p->exit_signal != -1) { + do_notify_parent(p, p->exit_signal); + if (p->exit_signal != -1) + p = NULL; + } + } + write_unlock_irq(&tasklist_lock); + } + if (p != NULL) + release_task(p); + BUG_ON(!retval); + return retval; +} + +/* + * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval, exit_code; + + if (!p->exit_code) + return 0; + if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && + p->signal && p->signal->group_stop_count > 0) + /* + * A group stop is in progress and this is the group leader. + * We won't report until all threads have stopped. + */ + return 0; + + /* + * Now we are pretty sure this task is interesting. + * Make sure it doesn't get reaped out from under us while we + * give up the lock and then examine it below. We don't want to + * keep holding onto the tasklist_lock while we call getrusage and + * possibly take page faults for user memory. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (unlikely(noreap)) { + pid_t pid = p->pid; + uid_t uid = p->uid; + int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; + + exit_code = p->exit_code; + if (unlikely(!exit_code) || + unlikely(p->state > TASK_STOPPED)) + goto bail_ref; + return wait_noreap_copyout(p, pid, uid, + why, (exit_code << 8) | 0x7f, + infop, ru); + } + + write_lock_irq(&tasklist_lock); + + /* + * This uses xchg to be atomic with the thread resuming and setting + * it. It must also be done with the write lock held to prevent a + * race with the EXIT_ZOMBIE case. + */ + exit_code = xchg(&p->exit_code, 0); + if (unlikely(p->exit_state)) { + /* + * The task resumed and then died. Let the next iteration + * catch it in EXIT_ZOMBIE. Note that exit_code might + * already be zero here if it resumed and did _exit(0). + * The task itself is dead and won't touch exit_code again; + * other processors in this function are locked out. + */ + p->exit_code = exit_code; + exit_code = 0; + } + if (unlikely(exit_code == 0)) { + /* + * Another thread in this function got to it first, or it + * resumed, or it resumed and then died. + */ + write_unlock_irq(&tasklist_lock); +bail_ref: + put_task_struct(p); + /* + * We are returning to the wait loop without having successfully + * removed the process and having released the lock. We cannot + * continue, since the "p" task pointer is potentially stale. + * + * Return -EAGAIN, and do_wait() will restart the loop from the + * beginning. Do _not_ re-acquire the lock. + */ + return -EAGAIN; + } + + /* move to end of parent's list to avoid starvation */ + remove_parent(p); + add_parent(p, p->parent); + + write_unlock_irq(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) + retval = put_user((exit_code << 8) | 0x7f, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) + retval = put_user((short)((p->ptrace & PT_PTRACED) + ? CLD_TRAPPED : CLD_STOPPED), + &infop->si_code); + if (!retval && infop) + retval = put_user(exit_code, &infop->si_status); + if (!retval && infop) + retval = put_user(p->pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(p->uid, &infop->si_uid); + if (!retval) + retval = p->pid; + put_task_struct(p); + + BUG_ON(!retval); + return retval; +} + +/* + * Handle do_wait work for one task in a live, non-stopped state. + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_continued(task_t *p, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval; + pid_t pid; + uid_t uid; + + if (unlikely(!p->signal)) + return 0; + + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) + return 0; + + spin_lock_irq(&p->sighand->siglock); + /* Re-check with the lock held. */ + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { + spin_unlock_irq(&p->sighand->siglock); + return 0; + } + if (!noreap) + p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + spin_unlock_irq(&p->sighand->siglock); + + pid = p->pid; + uid = p->uid; + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!infop) { + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + put_task_struct(p); + if (!retval && stat_addr) + retval = put_user(0xffff, stat_addr); + if (!retval) + retval = p->pid; + } else { + retval = wait_noreap_copyout(p, pid, uid, + CLD_CONTINUED, SIGCONT, + infop, ru); + BUG_ON(retval == 0); + } + + return retval; +} + + +static inline int my_ptrace_child(struct task_struct *p) +{ + if (!(p->ptrace & PT_PTRACED)) + return 0; + if (!(p->ptrace & PT_ATTACHED)) + return 1; + /* + * This child was PTRACE_ATTACH'd. We should be seeing it only if + * we are the attacher. If we are the real parent, this is a race + * inside ptrace_attach. It is waiting for the tasklist_lock, + * which we have to switch the parent links, but has already set + * the flags in p->ptrace. + */ + return (p->parent != p->real_parent); +} + +static long do_wait(pid_t pid, int options, struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + DECLARE_WAITQUEUE(wait, current); + struct task_struct *tsk; + int flag, retval; + + add_wait_queue(¤t->signal->wait_chldexit,&wait); +repeat: + /* + * We will set this flag if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ + flag = 0; + current->state = TASK_INTERRUPTIBLE; + read_lock(&tasklist_lock); + tsk = current; + do { + struct task_struct *p; + struct list_head *_p; + int ret; + + list_for_each(_p,&tsk->children) { + p = list_entry(_p,struct task_struct,sibling); + + ret = eligible_child(pid, options, p); + if (!ret) + continue; + + switch (p->state) { + case TASK_TRACED: + if (!my_ptrace_child(p)) + continue; + /*FALLTHROUGH*/ + case TASK_STOPPED: + /* + * It's stopped now, so it might later + * continue, exit, or stop again. + */ + flag = 1; + if (!(options & WUNTRACED) && + !my_ptrace_child(p)) + continue; + retval = wait_task_stopped(p, ret == 2, + (options & WNOWAIT), + infop, + stat_addr, ru); + if (retval == -EAGAIN) + goto repeat; + if (retval != 0) /* He released the lock. */ + goto end; + break; + default: + // case EXIT_DEAD: + if (p->exit_state == EXIT_DEAD) + continue; + // case EXIT_ZOMBIE: + if (p->exit_state == EXIT_ZOMBIE) { + /* + * Eligible but we cannot release + * it yet: + */ + if (ret == 2) + goto check_continued; + if (!likely(options & WEXITED)) + continue; + retval = wait_task_zombie( + p, (options & WNOWAIT), + infop, stat_addr, ru); + /* He released the lock. */ + if (retval != 0) + goto end; + break; + } +check_continued: + /* + * It's running now, so it might later + * exit, stop, or stop and then continue. + */ + flag = 1; + if (!unlikely(options & WCONTINUED)) + continue; + retval = wait_task_continued( + p, (options & WNOWAIT), + infop, stat_addr, ru); + if (retval != 0) /* He released the lock. */ + goto end; + break; + } + } + if (!flag) { + list_for_each(_p, &tsk->ptrace_children) { + p = list_entry(_p, struct task_struct, + ptrace_list); + if (!eligible_child(pid, options, p)) + continue; + flag = 1; + break; + } + } + if (options & __WNOTHREAD) + break; + tsk = next_thread(tsk); + if (tsk->signal != current->signal) + BUG(); + } while (tsk != current); + + read_unlock(&tasklist_lock); + if (flag) { + retval = 0; + if (options & WNOHANG) + goto end; + retval = -ERESTARTSYS; + if (signal_pending(current)) + goto end; + schedule(); + goto repeat; + } + retval = -ECHILD; +end: + current->state = TASK_RUNNING; + remove_wait_queue(¤t->signal->wait_chldexit,&wait); + if (infop) { + if (retval > 0) + retval = 0; + else { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!retval) + retval = put_user(0, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user(0, &infop->si_code); + if (!retval) + retval = put_user(0, &infop->si_pid); + if (!retval) + retval = put_user(0, &infop->si_uid); + if (!retval) + retval = put_user(0, &infop->si_status); + } + } + return retval; +} + +asmlinkage long sys_waitid(int which, pid_t pid, + struct siginfo __user *infop, int options, + struct rusage __user *ru) +{ + long ret; + + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + return -EINVAL; + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) + return -EINVAL; + + switch (which) { + case P_ALL: + pid = -1; + break; + case P_PID: + if (pid <= 0) + return -EINVAL; + break; + case P_PGID: + if (pid <= 0) + return -EINVAL; + pid = -pid; + break; + default: + return -EINVAL; + } + + ret = do_wait(pid, options, infop, NULL, ru); + + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; +} + +asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, + int options, struct rusage __user *ru) +{ + long ret; + + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) + return -EINVAL; + ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; +} + +#ifdef __ARCH_WANT_SYS_WAITPID + +/* + * sys_waitpid() remains for compatibility. waitpid() should be + * implemented by calling sys_wait4() from libc.a. + */ +asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) +{ + return sys_wait4(pid, stat_addr, options, NULL); +} + +#endif diff --git a/kernel/extable.c b/kernel/extable.c new file mode 100644 index 00000000000..7501b531cee --- /dev/null +++ b/kernel/extable.c @@ -0,0 +1,67 @@ +/* Rewritten by Rusty Russell, on the backs of many others... + Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/module.h> +#include <linux/init.h> +#include <asm/uaccess.h> +#include <asm/sections.h> + +extern struct exception_table_entry __start___ex_table[]; +extern struct exception_table_entry __stop___ex_table[]; + +/* Sort the kernel's built-in exception table */ +void __init sort_main_extable(void) +{ + sort_extable(__start___ex_table, __stop___ex_table); +} + +/* Given an address, look for it in the exception tables. */ +const struct exception_table_entry *search_exception_tables(unsigned long addr) +{ + const struct exception_table_entry *e; + + e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + if (!e) + e = search_module_extables(addr); + return e; +} + +static int core_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && + addr <= (unsigned long)_etext) + return 1; + + if (addr >= (unsigned long)_sinittext && + addr <= (unsigned long)_einittext) + return 1; + return 0; +} + +int __kernel_text_address(unsigned long addr) +{ + if (core_kernel_text(addr)) + return 1; + return __module_text_address(addr) != NULL; +} + +int kernel_text_address(unsigned long addr) +{ + if (core_kernel_text(addr)) + return 1; + return module_text_address(addr) != NULL; +} diff --git a/kernel/fork.c b/kernel/fork.c new file mode 100644 index 00000000000..f42a17f8869 --- /dev/null +++ b/kernel/fork.c @@ -0,0 +1,1274 @@ +/* + * linux/kernel/fork.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'fork.c' contains the help-routines for the 'fork' system call + * (see also entry.S and others). + * Fork is rather simple, once you get the hang of it, but the memory + * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/unistd.h> +#include <linux/smp_lock.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/completion.h> +#include <linux/namespace.h> +#include <linux/personality.h> +#include <linux/mempolicy.h> +#include <linux/sem.h> +#include <linux/file.h> +#include <linux/key.h> +#include <linux/binfmts.h> +#include <linux/mman.h> +#include <linux/fs.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/security.h> +#include <linux/swap.h> +#include <linux/syscalls.h> +#include <linux/jiffies.h> +#include <linux/futex.h> +#include <linux/ptrace.h> +#include <linux/mount.h> +#include <linux/audit.h> +#include <linux/profile.h> +#include <linux/rmap.h> +#include <linux/acct.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +/* + * Protected counters by write_lock_irq(&tasklist_lock) + */ +unsigned long total_forks; /* Handle normal Linux uptimes. */ +int nr_threads; /* The idle threads do not count.. */ + +int max_threads; /* tunable limit on nr_threads */ + +DEFINE_PER_CPU(unsigned long, process_counts) = 0; + + __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ + +EXPORT_SYMBOL(tasklist_lock); + +int nr_processes(void) +{ + int cpu; + int total = 0; + + for_each_online_cpu(cpu) + total += per_cpu(process_counts, cpu); + + return total; +} + +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) +# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +static kmem_cache_t *task_struct_cachep; +#endif + +/* SLAB cache for signal_struct structures (tsk->signal) */ +kmem_cache_t *signal_cachep; + +/* SLAB cache for sighand_struct structures (tsk->sighand) */ +kmem_cache_t *sighand_cachep; + +/* SLAB cache for files_struct structures (tsk->files) */ +kmem_cache_t *files_cachep; + +/* SLAB cache for fs_struct structures (tsk->fs) */ +kmem_cache_t *fs_cachep; + +/* SLAB cache for vm_area_struct structures */ +kmem_cache_t *vm_area_cachep; + +/* SLAB cache for mm_struct structures (tsk->mm) */ +static kmem_cache_t *mm_cachep; + +void free_task(struct task_struct *tsk) +{ + free_thread_info(tsk->thread_info); + free_task_struct(tsk); +} +EXPORT_SYMBOL(free_task); + +void __put_task_struct(struct task_struct *tsk) +{ + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + if (unlikely(tsk->audit_context)) + audit_free(tsk); + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + + if (!profile_handoff_task(tsk)) + free_task(tsk); +} + +void __init fork_init(unsigned long mempages) +{ +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef ARCH_MIN_TASKALIGN +#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#endif + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", sizeof(struct task_struct), + ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); +#endif + + /* + * The default maximum number of threads is set to a safe + * value: the thread structures can take up at most half + * of memory. + */ + max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); + + /* + * we need to allow at least 20 threads to boot a system + */ + if(max_threads < 20) + max_threads = 20; + + init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_SIGPENDING] = + init_task.signal->rlim[RLIMIT_NPROC]; +} + +static struct task_struct *dup_task_struct(struct task_struct *orig) +{ + struct task_struct *tsk; + struct thread_info *ti; + + prepare_to_copy(orig); + + tsk = alloc_task_struct(); + if (!tsk) + return NULL; + + ti = alloc_thread_info(tsk); + if (!ti) { + free_task_struct(tsk); + return NULL; + } + + *ti = *orig->thread_info; + *tsk = *orig; + tsk->thread_info = ti; + ti->task = tsk; + + /* One for us, one for whoever does the "release_task()" (usually parent) */ + atomic_set(&tsk->usage,2); + return tsk; +} + +#ifdef CONFIG_MMU +static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +{ + struct vm_area_struct * mpnt, *tmp, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + struct mempolicy *pol; + + down_write(&oldmm->mmap_sem); + flush_cache_mm(current->mm); + mm->locked_vm = 0; + mm->mmap = NULL; + mm->mmap_cache = NULL; + mm->free_area_cache = oldmm->mmap_base; + mm->map_count = 0; + set_mm_counter(mm, rss, 0); + set_mm_counter(mm, anon_rss, 0); + cpus_clear(mm->cpu_vm_mask); + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + + for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if (security_vm_enough_memory(len)) + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + pol = mpol_copy(vma_policy(mpnt)); + retval = PTR_ERR(pol); + if (IS_ERR(pol)) + goto fail_nomem_policy; + vma_set_policy(tmp, pol); + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; + tmp->vm_next = NULL; + anon_vma_link(tmp); + file = tmp->vm_file; + if (file) { + struct inode *inode = file->f_dentry->d_inode; + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + /* insert tmp into the share list, just after mpnt */ + spin_lock(&file->f_mapping->i_mmap_lock); + tmp->vm_truncate_count = mpnt->vm_truncate_count; + flush_dcache_mmap_lock(file->f_mapping); + vma_prio_tree_add(tmp, mpnt); + flush_dcache_mmap_unlock(file->f_mapping); + spin_unlock(&file->f_mapping->i_mmap_lock); + } + + /* + * Link in the new vma and copy the page table entries: + * link in first so that swapoff can see swap entries, + * and try_to_unmap_one's find_vma find the new vma. + */ + spin_lock(&mm->page_table_lock); + *pprev = tmp; + pprev = &tmp->vm_next; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + retval = copy_page_range(mm, current->mm, tmp); + spin_unlock(&mm->page_table_lock); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + retval = 0; + +out: + flush_tlb_mm(current->mm); + up_write(&oldmm->mmap_sem); + return retval; +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct * mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct * mm) +{ + pgd_free(mm->pgd); +} +#else +#define dup_mmap(mm, oldmm) (0) +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + + __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +#include <linux/init_task.h> + +static struct mm_struct * mm_init(struct mm_struct * mm) +{ + atomic_set(&mm->mm_users, 1); + atomic_set(&mm->mm_count, 1); + init_rwsem(&mm->mmap_sem); + INIT_LIST_HEAD(&mm->mmlist); + mm->core_waiters = 0; + mm->nr_ptes = 0; + spin_lock_init(&mm->page_table_lock); + rwlock_init(&mm->ioctx_list_lock); + mm->ioctx_list = NULL; + mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); + mm->free_area_cache = TASK_UNMAPPED_BASE; + + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + return mm; + } + free_mm(mm); + return NULL; +} + +/* + * Allocate and initialize an mm_struct. + */ +struct mm_struct * mm_alloc(void) +{ + struct mm_struct * mm; + + mm = allocate_mm(); + if (mm) { + memset(mm, 0, sizeof(*mm)); + mm = mm_init(mm); + } + return mm; +} + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +void fastcall __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + free_mm(mm); +} + +/* + * Decrement the use count and release all resources for an mm. + */ +void mmput(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + exit_aio(mm); + exit_mmap(mm); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + put_swap_token(mm); + mmdrop(mm); + } +} +EXPORT_SYMBOL_GPL(mmput); + +/** + * get_task_mm - acquire a reference to the task's mm + * + * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * this kernel workthread has transiently adopted a user mm with use_mm, + * to do its AIO) is not set and if so returns a reference to it, after + * bumping up the use count. User must release the mm via mmput() + * after use. Typically used by /proc and ptrace. + */ +struct mm_struct *get_task_mm(struct task_struct *task) +{ + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (task->flags & PF_BORROWED_MM) + mm = NULL; + else + atomic_inc(&mm->mm_users); + } + task_unlock(task); + return mm; +} +EXPORT_SYMBOL_GPL(get_task_mm); + +/* Please note the differences between mmput and mm_release. + * mmput is called whenever we stop holding onto a mm_struct, + * error success whatever. + * + * mm_release is called after a mm_struct has been removed + * from the current process. + * + * This difference is important for error handling, when we + * only half set up a mm_struct for a new process and need to restore + * the old one. Because we mmput the new mm_struct before + * restoring the old one. . . + * Eric Biederman 10 January 1998 + */ +void mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + struct completion *vfork_done = tsk->vfork_done; + + /* Get rid of any cached register state */ + deactivate_mm(tsk, mm); + + /* notify parent sleeping on vfork() */ + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + u32 __user * tidptr = tsk->clear_child_tid; + tsk->clear_child_tid = NULL; + + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tidptr); + sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); + } +} + +static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +{ + struct mm_struct * mm, *oldmm; + int retval; + + tsk->min_flt = tsk->maj_flt = 0; + tsk->nvcsw = tsk->nivcsw = 0; + + tsk->mm = NULL; + tsk->active_mm = NULL; + + /* + * Are we cloning a kernel thread? + * + * We need to steal a active VM for that.. + */ + oldmm = current->mm; + if (!oldmm) + return 0; + + if (clone_flags & CLONE_VM) { + atomic_inc(&oldmm->mm_users); + mm = oldmm; + /* + * There are cases where the PTL is held to ensure no + * new threads start up in user mode using an mm, which + * allows optimizing out ipis; the tlb_gather_mmu code + * is an example. + */ + spin_unlock_wait(&oldmm->page_table_lock); + goto good_mm; + } + + retval = -ENOMEM; + mm = allocate_mm(); + if (!mm) + goto fail_nomem; + + /* Copy the current MM stuff.. */ + memcpy(mm, oldmm, sizeof(*mm)); + if (!mm_init(mm)) + goto fail_nomem; + + if (init_new_context(tsk,mm)) + goto fail_nocontext; + + retval = dup_mmap(mm, oldmm); + if (retval) + goto free_pt; + + mm->hiwater_rss = get_mm_counter(mm,rss); + mm->hiwater_vm = mm->total_vm; + +good_mm: + tsk->mm = mm; + tsk->active_mm = mm; + return 0; + +free_pt: + mmput(mm); +fail_nomem: + return retval; + +fail_nocontext: + /* + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ + mm_free_pgd(mm); + free_mm(mm); + return retval; +} + +static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { + fs->altrootmnt = NULL; + fs->altroot = NULL; + } + read_unlock(&old->lock); + } + return fs; +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + return __copy_fs_struct(old); +} + +EXPORT_SYMBOL_GPL(copy_fs_struct); + +static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) +{ + if (clone_flags & CLONE_FS) { + atomic_inc(¤t->fs->count); + return 0; + } + tsk->fs = __copy_fs_struct(current->fs); + if (!tsk->fs) + return -ENOMEM; + return 0; +} + +static int count_open_files(struct files_struct *files, int size) +{ + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (files->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; +} + +static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +{ + struct files_struct *oldf, *newf; + struct file **old_fds, **new_fds; + int open_files, size, i, error = 0, expand; + + /* + * A background process may not have any files ... + */ + oldf = current->files; + if (!oldf) + goto out; + + if (clone_flags & CLONE_FILES) { + atomic_inc(&oldf->count); + goto out; + } + + /* + * Note: we may be using current for both targets (See exec.c) + * This works because we cache current->files (old) as oldf. Don't + * break this. + */ + tsk->files = NULL; + error = -ENOMEM; + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + newf->max_fds = NR_OPEN_DEFAULT; + newf->max_fdset = __FD_SETSIZE; + newf->close_on_exec = &newf->close_on_exec_init; + newf->open_fds = &newf->open_fds_init; + newf->fd = &newf->fd_array[0]; + + spin_lock(&oldf->file_lock); + + open_files = count_open_files(oldf, oldf->max_fdset); + expand = 0; + + /* + * Check whether we need to allocate a larger fd array or fd set. + * Note: we're not a clone task, so the open count won't change. + */ + if (open_files > newf->max_fdset) { + newf->max_fdset = 0; + expand = 1; + } + if (open_files > newf->max_fds) { + newf->max_fds = 0; + expand = 1; + } + + /* if the old fdset gets grown now, we'll only copy up to "size" fds */ + if (expand) { + spin_unlock(&oldf->file_lock); + spin_lock(&newf->file_lock); + error = expand_files(newf, open_files-1); + spin_unlock(&newf->file_lock); + if (error < 0) + goto out_release; + spin_lock(&oldf->file_lock); + } + + old_fds = oldf->fd; + new_fds = newf->fd; + + memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); + memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + FD_CLR(open_files - i, newf->open_fds); + } + *new_fds++ = f; + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (newf->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (newf->max_fdset > open_files) { + int left = (newf->max_fdset-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&newf->open_fds->fds_bits[start], 0, left); + memset(&newf->close_on_exec->fds_bits[start], 0, left); + } + + tsk->files = newf; + error = 0; +out: + return error; + +out_release: + free_fdset (newf->close_on_exec, newf->max_fdset); + free_fdset (newf->open_fds, newf->max_fdset); + free_fd_array(newf->fd, newf->max_fds); + kmem_cache_free(files_cachep, newf); + goto out; +} + +/* + * Helper to unshare the files of the current task. + * We don't want to expose copy_files internals to + * the exec layer of the kernel. + */ + +int unshare_files(void) +{ + struct files_struct *files = current->files; + int rc; + + if(!files) + BUG(); + + /* This can race but the race causes us to copy when we don't + need to and drop the copy */ + if(atomic_read(&files->count) == 1) + { + atomic_inc(&files->count); + return 0; + } + rc = copy_files(0, current); + if(rc) + current->files = files; + return rc; +} + +EXPORT_SYMBOL(unshare_files); + +static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) +{ + struct sighand_struct *sig; + + if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + atomic_inc(¤t->sighand->count); + return 0; + } + sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + tsk->sighand = sig; + if (!sig) + return -ENOMEM; + spin_lock_init(&sig->siglock); + atomic_set(&sig->count, 1); + memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + return 0; +} + +static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) +{ + struct signal_struct *sig; + int ret; + + if (clone_flags & CLONE_THREAD) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + return 0; + } + sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + tsk->signal = sig; + if (!sig) + return -ENOMEM; + + ret = copy_thread_group_keys(tsk); + if (ret < 0) { + kmem_cache_free(signal_cachep, sig); + return ret; + } + + atomic_set(&sig->count, 1); + atomic_set(&sig->live, 1); + init_waitqueue_head(&sig->wait_chldexit); + sig->flags = 0; + sig->group_exit_code = 0; + sig->group_exit_task = NULL; + sig->group_stop_count = 0; + sig->curr_target = NULL; + init_sigpending(&sig->shared_pending); + INIT_LIST_HEAD(&sig->posix_timers); + + sig->it_real_value = sig->it_real_incr = 0; + sig->real_timer.function = it_real_fn; + sig->real_timer.data = (unsigned long) tsk; + init_timer(&sig->real_timer); + + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + sig->tty = current->signal->tty; + sig->pgrp = process_group(current); + sig->session = current->signal->session; + sig->leader = 0; /* session leadership doesn't inherit */ + sig->tty_old_pgrp = 0; + + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; + sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->sched_time = 0; + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); + + task_lock(current->group_leader); + memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); + task_unlock(current->group_leader); + + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + /* + * New sole thread in the process gets an expiry time + * of the whole CPU time limit. + */ + tsk->it_prof_expires = + secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + } + + return 0; +} + +static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) +{ + unsigned long new_flags = p->flags; + + new_flags &= ~PF_SUPERPRIV; + new_flags |= PF_FORKNOEXEC; + if (!(clone_flags & CLONE_PTRACE)) + p->ptrace = 0; + p->flags = new_flags; +} + +asmlinkage long sys_set_tid_address(int __user *tidptr) +{ + current->clear_child_tid = tidptr; + + return current->pid; +} + +/* + * This creates a new process as a copy of the old one, + * but does not actually start it yet. + * + * It copies the registers, and all the appropriate + * parts of the process environment (as per the clone + * flags). The actual kick-off is left to the caller. + */ +static task_t *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + int pid) +{ + int retval; + struct task_struct *p = NULL; + + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) + return ERR_PTR(-EINVAL); + + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. + */ + if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) + return ERR_PTR(-EINVAL); + + /* + * Shared signal handlers imply shared VM. By way of the above, + * thread groups also imply shared VM. Blocking this case allows + * for various simplifications in other code. + */ + if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) + return ERR_PTR(-EINVAL); + + retval = security_task_create(clone_flags); + if (retval) + goto fork_out; + + retval = -ENOMEM; + p = dup_task_struct(current); + if (!p) + goto fork_out; + + retval = -EAGAIN; + if (atomic_read(&p->user->processes) >= + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && + p->user != &root_user) + goto bad_fork_free; + } + + atomic_inc(&p->user->__count); + atomic_inc(&p->user->processes); + get_group_info(p->group_info); + + /* + * If multiple threads are within copy_process(), then this check + * triggers too late. This doesn't hurt, the check is only there + * to stop root fork bombs. + */ + if (nr_threads >= max_threads) + goto bad_fork_cleanup_count; + + if (!try_module_get(p->thread_info->exec_domain->module)) + goto bad_fork_cleanup_count; + + if (p->binfmt && !try_module_get(p->binfmt->module)) + goto bad_fork_cleanup_put_domain; + + p->did_exec = 0; + copy_flags(clone_flags, p); + p->pid = pid; + retval = -EFAULT; + if (clone_flags & CLONE_PARENT_SETTID) + if (put_user(p->pid, parent_tidptr)) + goto bad_fork_cleanup; + + p->proc_dentry = NULL; + + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); + p->vfork_done = NULL; + spin_lock_init(&p->alloc_lock); + spin_lock_init(&p->proc_lock); + + clear_tsk_thread_flag(p, TIF_SIGPENDING); + init_sigpending(&p->pending); + + p->utime = cputime_zero; + p->stime = cputime_zero; + p->sched_time = 0; + p->rchar = 0; /* I/O counter: bytes read */ + p->wchar = 0; /* I/O counter: bytes written */ + p->syscr = 0; /* I/O counter: read syscalls */ + p->syscw = 0; /* I/O counter: write syscalls */ + acct_clear_integrals(p); + + p->it_virt_expires = cputime_zero; + p->it_prof_expires = cputime_zero; + p->it_sched_expires = 0; + INIT_LIST_HEAD(&p->cpu_timers[0]); + INIT_LIST_HEAD(&p->cpu_timers[1]); + INIT_LIST_HEAD(&p->cpu_timers[2]); + + p->lock_depth = -1; /* -1 = no lock */ + do_posix_clock_monotonic_gettime(&p->start_time); + p->security = NULL; + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; +#ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup; + } +#endif + + p->tgid = p->pid; + if (clone_flags & CLONE_THREAD) + p->tgid = current->tgid; + + if ((retval = security_task_alloc(p))) + goto bad_fork_cleanup_policy; + if ((retval = audit_alloc(p))) + goto bad_fork_cleanup_security; + /* copy all the process information */ + if ((retval = copy_semundo(clone_flags, p))) + goto bad_fork_cleanup_audit; + if ((retval = copy_files(clone_flags, p))) + goto bad_fork_cleanup_semundo; + if ((retval = copy_fs(clone_flags, p))) + goto bad_fork_cleanup_files; + if ((retval = copy_sighand(clone_flags, p))) + goto bad_fork_cleanup_fs; + if ((retval = copy_signal(clone_flags, p))) + goto bad_fork_cleanup_sighand; + if ((retval = copy_mm(clone_flags, p))) + goto bad_fork_cleanup_signal; + if ((retval = copy_keys(clone_flags, p))) + goto bad_fork_cleanup_mm; + if ((retval = copy_namespace(clone_flags, p))) + goto bad_fork_cleanup_keys; + retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + if (retval) + goto bad_fork_cleanup_namespace; + + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + + /* + * Syscall tracing should be turned off in the child regardless + * of CLONE_PTRACE. + */ + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + + /* Our parent execution domain becomes current domain + These must match for thread signalling to apply */ + + p->parent_exec_id = p->self_exec_id; + + /* ok, now we should be set up.. */ + p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + p->pdeath_signal = 0; + p->exit_state = 0; + + /* Perform scheduler related setup */ + sched_fork(p); + + /* + * Ok, make it visible to the rest of the system. + * We dont wake it up yet. + */ + p->group_leader = p; + INIT_LIST_HEAD(&p->ptrace_children); + INIT_LIST_HEAD(&p->ptrace_list); + + /* Need tasklist lock for parent etc handling! */ + write_lock_irq(&tasklist_lock); + + /* + * The task hasn't been attached yet, so cpus_allowed mask cannot + * have changed. The cpus_allowed mask of the parent may have + * changed after it was copied first time, and it may then move to + * another CPU - so we re-copy it here and set the child's CPU to + * the parent's CPU. This avoids alot of nasty races. + */ + p->cpus_allowed = current->cpus_allowed; + set_task_cpu(p, smp_processor_id()); + + /* + * Check for pending SIGKILL! The new thread should not be allowed + * to slip out of an OOM kill. (or normal SIGKILL.) + */ + if (sigismember(¤t->pending.signal, SIGKILL)) { + write_unlock_irq(&tasklist_lock); + retval = -EINTR; + goto bad_fork_cleanup_namespace; + } + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + p->real_parent = current->real_parent; + else + p->real_parent = current; + p->parent = p->real_parent; + + if (clone_flags & CLONE_THREAD) { + spin_lock(¤t->sighand->siglock); + /* + * Important: if an exit-all has been started then + * do not create this new thread - the whole thread + * group is supposed to exit anyway. + */ + if (current->signal->flags & SIGNAL_GROUP_EXIT) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -EAGAIN; + goto bad_fork_cleanup_namespace; + } + p->group_leader = current->group_leader; + + if (current->signal->group_stop_count > 0) { + /* + * There is an all-stop in progress for the group. + * We ourselves will stop as soon as we check signals. + * Make the new thread part of that group stop too. + */ + current->signal->group_stop_count++; + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + + if (!cputime_eq(current->signal->it_virt_expires, + cputime_zero) || + !cputime_eq(current->signal->it_prof_expires, + cputime_zero) || + current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || + !list_empty(¤t->signal->cpu_timers[0]) || + !list_empty(¤t->signal->cpu_timers[1]) || + !list_empty(¤t->signal->cpu_timers[2])) { + /* + * Have child wake up on its first tick to check + * for process CPU timers. + */ + p->it_prof_expires = jiffies_to_cputime(1); + } + + spin_unlock(¤t->sighand->siglock); + } + + SET_LINKS(p); + if (unlikely(p->ptrace & PT_PTRACED)) + __ptrace_link(p, current->parent); + + cpuset_fork(p); + + attach_pid(p, PIDTYPE_PID, p->pid); + attach_pid(p, PIDTYPE_TGID, p->tgid); + if (thread_group_leader(p)) { + attach_pid(p, PIDTYPE_PGID, process_group(p)); + attach_pid(p, PIDTYPE_SID, p->signal->session); + if (p->pid) + __get_cpu_var(process_counts)++; + } + + nr_threads++; + total_forks++; + write_unlock_irq(&tasklist_lock); + retval = 0; + +fork_out: + if (retval) + return ERR_PTR(retval); + return p; + +bad_fork_cleanup_namespace: + exit_namespace(p); +bad_fork_cleanup_keys: + exit_keys(p); +bad_fork_cleanup_mm: + if (p->mm) + mmput(p->mm); +bad_fork_cleanup_signal: + exit_signal(p); +bad_fork_cleanup_sighand: + exit_sighand(p); +bad_fork_cleanup_fs: + exit_fs(p); /* blocking */ +bad_fork_cleanup_files: + exit_files(p); /* blocking */ +bad_fork_cleanup_semundo: + exit_sem(p); +bad_fork_cleanup_audit: + audit_free(p); +bad_fork_cleanup_security: + security_task_free(p); +bad_fork_cleanup_policy: +#ifdef CONFIG_NUMA + mpol_free(p->mempolicy); +#endif +bad_fork_cleanup: + if (p->binfmt) + module_put(p->binfmt->module); +bad_fork_cleanup_put_domain: + module_put(p->thread_info->exec_domain->module); +bad_fork_cleanup_count: + put_group_info(p->group_info); + atomic_dec(&p->user->processes); + free_uid(p->user); +bad_fork_free: + free_task(p); + goto fork_out; +} + +struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + return regs; +} + +task_t * __devinit fork_idle(int cpu) +{ + task_t *task; + struct pt_regs regs; + + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); + if (!task) + return ERR_PTR(-ENOMEM); + init_idle(task, cpu); + unhash_process(task); + return task; +} + +static inline int fork_traceflag (unsigned clone_flags) +{ + if (clone_flags & CLONE_UNTRACED) + return 0; + else if (clone_flags & CLONE_VFORK) { + if (current->ptrace & PT_TRACE_VFORK) + return PTRACE_EVENT_VFORK; + } else if ((clone_flags & CSIGNAL) != SIGCHLD) { + if (current->ptrace & PT_TRACE_CLONE) + return PTRACE_EVENT_CLONE; + } else if (current->ptrace & PT_TRACE_FORK) + return PTRACE_EVENT_FORK; + + return 0; +} + +/* + * Ok, this is the main fork-routine. + * + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. + */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + struct task_struct *p; + int trace = 0; + long pid = alloc_pidmap(); + + if (pid < 0) + return -EAGAIN; + if (unlikely(current->ptrace)) { + trace = fork_traceflag (clone_flags); + if (trace) + clone_flags |= CLONE_PTRACE; + } + + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); + /* + * Do this prior waking up the new thread - the thread pointer + * might get invalid after that point, if the thread exits quickly. + */ + if (!IS_ERR(p)) { + struct completion vfork; + + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + } + + if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + /* + * We'll start up with an immediate SIGSTOP. + */ + sigaddset(&p->pending.signal, SIGSTOP); + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + + if (!(clone_flags & CLONE_STOPPED)) + wake_up_new_task(p, clone_flags); + else + p->state = TASK_STOPPED; + + if (unlikely (trace)) { + current->ptrace_message = pid; + ptrace_notify ((trace << 8) | SIGTRAP); + } + + if (clone_flags & CLONE_VFORK) { + wait_for_completion(&vfork); + if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) + ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + } + } else { + free_pidmap(pid); + pid = PTR_ERR(p); + } + return pid; +} + +void __init proc_caches_init(void) +{ + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL, NULL); + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); +} diff --git a/kernel/futex.c b/kernel/futex.c new file mode 100644 index 00000000000..7b54a672d0a --- /dev/null +++ b/kernel/futex.c @@ -0,0 +1,798 @@ +/* + * Fast Userspace Mutexes (which I call "Futexes!"). + * (C) Rusty Russell, IBM 2002 + * + * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar + * (C) Copyright 2003 Red Hat Inc, All Rights Reserved + * + * Removed page pinning, fix privately mapped COW pages and other cleanups + * (C) Copyright 2003, 2004 Jamie Lokier + * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. + * + * "The futexes are also cursed." + * "But they come in a choice of three flavours!" + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/jhash.h> +#include <linux/init.h> +#include <linux/futex.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/syscalls.h> + +#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) + +/* + * Futexes are matched on equal values of this key. + * The key type depends on whether it's a shared or private mapping. + * Don't rearrange members without looking at hash_futex(). + * + * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. + * We set bit 0 to indicate if it's an inode-based key. + */ +union futex_key { + struct { + unsigned long pgoff; + struct inode *inode; + int offset; + } shared; + struct { + unsigned long uaddr; + struct mm_struct *mm; + int offset; + } private; + struct { + unsigned long word; + void *ptr; + int offset; + } both; +}; + +/* + * We use this hashed waitqueue instead of a normal wait_queue_t, so + * we can wake only the relevant ones (hashed queues may be shared). + * + * A futex_q has a woken state, just like tasks have TASK_RUNNING. + * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. + * The order of wakup is always to make the first condition true, then + * wake up q->waiters, then make the second condition true. + */ +struct futex_q { + struct list_head list; + wait_queue_head_t waiters; + + /* Which hash list lock to use. */ + spinlock_t *lock_ptr; + + /* Key which the futex is hashed on. */ + union futex_key key; + + /* For fd, sigio sent using these. */ + int fd; + struct file *filp; +}; + +/* + * Split the global futex_lock into every hash list lock. + */ +struct futex_hash_bucket { + spinlock_t lock; + struct list_head chain; +}; + +static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; + +/* Futex-fs vfsmount entry: */ +static struct vfsmount *futex_mnt; + +/* + * We hash on the keys returned from get_futex_key (see below). + */ +static struct futex_hash_bucket *hash_futex(union futex_key *key) +{ + u32 hash = jhash2((u32*)&key->both.word, + (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + key->both.offset); + return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; +} + +/* + * Return 1 if two futex_keys are equal, 0 otherwise. + */ +static inline int match_futex(union futex_key *key1, union futex_key *key2) +{ + return (key1->both.word == key2->both.word + && key1->both.ptr == key2->both.ptr + && key1->both.offset == key2->both.offset); +} + +/* + * Get parameters which are the keys for a futex. + * + * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, + * offset_within_page). For private mappings, it's (uaddr, current->mm). + * We can usually work out the index without swapping in the page. + * + * Returns: 0, or negative error code. + * The key words are stored in *key on success. + * + * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. + */ +static int get_futex_key(unsigned long uaddr, union futex_key *key) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct page *page; + int err; + + /* + * The futex address must be "naturally" aligned. + */ + key->both.offset = uaddr % PAGE_SIZE; + if (unlikely((key->both.offset % sizeof(u32)) != 0)) + return -EINVAL; + uaddr -= key->both.offset; + + /* + * The futex is hashed differently depending on whether + * it's in a shared or private mapping. So check vma first. + */ + vma = find_extend_vma(mm, uaddr); + if (unlikely(!vma)) + return -EFAULT; + + /* + * Permissions. + */ + if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) + return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; + + /* + * Private mappings are handled in a simple way. + * + * NOTE: When userspace waits on a MAP_SHARED mapping, even if + * it's a read-only handle, it's expected that futexes attach to + * the object not the particular process. Therefore we use + * VM_MAYSHARE here, not VM_SHARED which is restricted to shared + * mappings of _writable_ handles. + */ + if (likely(!(vma->vm_flags & VM_MAYSHARE))) { + key->private.mm = mm; + key->private.uaddr = uaddr; + return 0; + } + + /* + * Linear file mappings are also simple. + */ + key->shared.inode = vma->vm_file->f_dentry->d_inode; + key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ + if (likely(!(vma->vm_flags & VM_NONLINEAR))) { + key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff); + return 0; + } + + /* + * We could walk the page table to read the non-linear + * pte, and get the page index without fetching the page + * from swap. But that's a lot of code to duplicate here + * for a rare case, so we simply fetch the page. + */ + + /* + * Do a quick atomic lookup first - this is the fastpath. + */ + spin_lock(¤t->mm->page_table_lock); + page = follow_page(mm, uaddr, 0); + if (likely(page != NULL)) { + key->shared.pgoff = + page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + spin_unlock(¤t->mm->page_table_lock); + return 0; + } + spin_unlock(¤t->mm->page_table_lock); + + /* + * Do it the general way. + */ + err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); + if (err >= 0) { + key->shared.pgoff = + page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + put_page(page); + return 0; + } + return err; +} + +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + * NOTE: mmap_sem MUST be held between get_futex_key() and calling this + * function, if it is called at all. mmap_sem keeps key->shared.inode valid. + */ +static inline void get_key_refs(union futex_key *key) +{ + if (key->both.ptr != 0) { + if (key->both.offset & 1) + atomic_inc(&key->shared.inode->i_count); + else + atomic_inc(&key->private.mm->mm_count); + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. + */ +static void drop_key_refs(union futex_key *key) +{ + if (key->both.ptr != 0) { + if (key->both.offset & 1) + iput(key->shared.inode); + else + mmdrop(key->private.mm); + } +} + +static inline int get_futex_value_locked(int *dest, int __user *from) +{ + int ret; + + inc_preempt_count(); + ret = __copy_from_user_inatomic(dest, from, sizeof(int)); + dec_preempt_count(); + + return ret ? -EFAULT : 0; +} + +/* + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. + */ +static void wake_futex(struct futex_q *q) +{ + list_del_init(&q->list); + if (q->filp) + send_sigio(&q->filp->f_owner, q->fd, POLL_IN); + /* + * The lock in wake_up_all() is a crucial memory barrier after the + * list_del_init() and also before assigning to q->lock_ptr. + */ + wake_up_all(&q->waiters); + /* + * The waiting task can free the futex_q as soon as this is written, + * without taking any locks. This must come last. + */ + q->lock_ptr = NULL; +} + +/* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int futex_wake(unsigned long uaddr, int nr_wake) +{ + union futex_key key; + struct futex_hash_bucket *bh; + struct list_head *head; + struct futex_q *this, *next; + int ret; + + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; + + bh = hash_futex(&key); + spin_lock(&bh->lock); + head = &bh->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key)) { + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + spin_unlock(&bh->lock); +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +/* + * Requeue all waiters hashed on one physical page to another + * physical page. + */ +static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, + int nr_wake, int nr_requeue, int *valp) +{ + union futex_key key1, key2; + struct futex_hash_bucket *bh1, *bh2; + struct list_head *head1; + struct futex_q *this, *next; + int ret, drop_count = 0; + + retry: + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) + goto out; + + bh1 = hash_futex(&key1); + bh2 = hash_futex(&key2); + + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) + spin_lock(&bh1->lock); + + if (likely(valp != NULL)) { + int curval; + + ret = get_futex_value_locked(&curval, (int __user *)uaddr1); + + if (unlikely(ret)) { + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + + /* If we would have faulted, release mmap_sem, fault + * it in and start all over again. + */ + up_read(¤t->mm->mmap_sem); + + ret = get_user(curval, (int __user *)uaddr1); + + if (!ret) + goto retry; + + return ret; + } + if (curval != *valp) { + ret = -EAGAIN; + goto out_unlock; + } + } + + head1 = &bh1->chain; + list_for_each_entry_safe(this, next, head1, list) { + if (!match_futex (&this->key, &key1)) + continue; + if (++ret <= nr_wake) { + wake_futex(this); + } else { + list_move_tail(&this->list, &bh2->chain); + this->lock_ptr = &bh2->lock; + this->key = key2; + get_key_refs(&key2); + drop_count++; + + if (ret - nr_wake >= nr_requeue) + break; + /* Make sure to stop if key1 == key2 */ + if (head1 == &bh2->chain && head1 != &next->list) + head1 = &this->list; + } + } + +out_unlock: + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + + /* drop_key_refs() must be called outside the spinlocks. */ + while (--drop_count >= 0) + drop_key_refs(&key1); + +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +/* The key must be already stored in q->key. */ +static inline struct futex_hash_bucket * +queue_lock(struct futex_q *q, int fd, struct file *filp) +{ + struct futex_hash_bucket *bh; + + q->fd = fd; + q->filp = filp; + + init_waitqueue_head(&q->waiters); + + get_key_refs(&q->key); + bh = hash_futex(&q->key); + q->lock_ptr = &bh->lock; + + spin_lock(&bh->lock); + return bh; +} + +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) +{ + list_add_tail(&q->list, &bh->chain); + spin_unlock(&bh->lock); +} + +static inline void +queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) +{ + spin_unlock(&bh->lock); + drop_key_refs(&q->key); +} + +/* + * queue_me and unqueue_me must be called as a pair, each + * exactly once. They are called with the hashed spinlock held. + */ + +/* The key must be already stored in q->key. */ +static void queue_me(struct futex_q *q, int fd, struct file *filp) +{ + struct futex_hash_bucket *bh; + bh = queue_lock(q, fd, filp); + __queue_me(q, bh); +} + +/* Return 1 if we were still queued (ie. 0 means we were woken) */ +static int unqueue_me(struct futex_q *q) +{ + int ret = 0; + spinlock_t *lock_ptr; + + /* In the common case we don't take the spinlock, which is nice. */ + retry: + lock_ptr = q->lock_ptr; + if (lock_ptr != 0) { + spin_lock(lock_ptr); + /* + * q->lock_ptr can change between reading it and + * spin_lock(), causing us to take the wrong lock. This + * corrects the race condition. + * + * Reasoning goes like this: if we have the wrong lock, + * q->lock_ptr must have changed (maybe several times) + * between reading it and the spin_lock(). It can + * change again after the spin_lock() but only if it was + * already changed before the spin_lock(). It cannot, + * however, change back to the original value. Therefore + * we can detect whether we acquired the correct lock. + */ + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + spin_unlock(lock_ptr); + ret = 1; + } + + drop_key_refs(&q->key); + return ret; +} + +static int futex_wait(unsigned long uaddr, int val, unsigned long time) +{ + DECLARE_WAITQUEUE(wait, current); + int ret, curval; + struct futex_q q; + struct futex_hash_bucket *bh; + + retry: + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr, &q.key); + if (unlikely(ret != 0)) + goto out_release_sem; + + bh = queue_lock(&q, -1, NULL); + + /* + * Access the page AFTER the futex is queued. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we queued after testing *uaddr, that would open + * a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * A consequence is that futex_wait() can return zero and absorb + * a wakeup when *uaddr != val on entry to the syscall. This is + * rare, but normal. + * + * We hold the mmap semaphore, so the mapping cannot have changed + * since we looked it up in get_futex_key. + */ + + ret = get_futex_value_locked(&curval, (int __user *)uaddr); + + if (unlikely(ret)) { + queue_unlock(&q, bh); + + /* If we would have faulted, release mmap_sem, fault it in and + * start all over again. + */ + up_read(¤t->mm->mmap_sem); + + ret = get_user(curval, (int __user *)uaddr); + + if (!ret) + goto retry; + return ret; + } + if (curval != val) { + ret = -EWOULDBLOCK; + queue_unlock(&q, bh); + goto out_release_sem; + } + + /* Only actually queue if *uaddr contained val. */ + __queue_me(&q, bh); + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(¤t->mm->mmap_sem); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + + /* add_wait_queue is the barrier after __set_current_state. */ + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&q.waiters, &wait); + /* + * !list_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (likely(!list_empty(&q.list))) + time = schedule_timeout(time); + __set_current_state(TASK_RUNNING); + + /* + * NOTE: we don't remove ourselves from the waitqueue because + * we are the only user of it. + */ + + /* If we were woken (and unqueued), we succeeded, whatever. */ + if (!unqueue_me(&q)) + return 0; + if (time == 0) + return -ETIMEDOUT; + /* We expect signal_pending(current), but another thread may + * have handled it for us already. */ + return -EINTR; + + out_release_sem: + up_read(¤t->mm->mmap_sem); + return ret; +} + +static int futex_close(struct inode *inode, struct file *filp) +{ + struct futex_q *q = filp->private_data; + + unqueue_me(q); + kfree(q); + return 0; +} + +/* This is one-shot: once it's gone off you need a new fd */ +static unsigned int futex_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct futex_q *q = filp->private_data; + int ret = 0; + + poll_wait(filp, &q->waiters, wait); + + /* + * list_empty() is safe here without any lock. + * q->lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (list_empty(&q->list)) + ret = POLLIN | POLLRDNORM; + + return ret; +} + +static struct file_operations futex_fops = { + .release = futex_close, + .poll = futex_poll, +}; + +/* + * Signal allows caller to avoid the race which would occur if they + * set the sigio stuff up afterwards. + */ +static int futex_fd(unsigned long uaddr, int signal) +{ + struct futex_q *q; + struct file *filp; + int ret, err; + + ret = -EINVAL; + if (signal < 0 || signal > _NSIG) + goto out; + + ret = get_unused_fd(); + if (ret < 0) + goto out; + filp = get_empty_filp(); + if (!filp) { + put_unused_fd(ret); + ret = -ENFILE; + goto out; + } + filp->f_op = &futex_fops; + filp->f_vfsmnt = mntget(futex_mnt); + filp->f_dentry = dget(futex_mnt->mnt_root); + filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + + if (signal) { + int err; + err = f_setown(filp, current->pid, 1); + if (err < 0) { + put_unused_fd(ret); + put_filp(filp); + ret = err; + goto out; + } + filp->f_owner.signum = signal; + } + + q = kmalloc(sizeof(*q), GFP_KERNEL); + if (!q) { + put_unused_fd(ret); + put_filp(filp); + ret = -ENOMEM; + goto out; + } + + down_read(¤t->mm->mmap_sem); + err = get_futex_key(uaddr, &q->key); + + if (unlikely(err != 0)) { + up_read(¤t->mm->mmap_sem); + put_unused_fd(ret); + put_filp(filp); + kfree(q); + return err; + } + + /* + * queue_me() must be called before releasing mmap_sem, because + * key->shared.inode needs to be referenced while holding it. + */ + filp->private_data = q; + + queue_me(q, ret, filp); + up_read(¤t->mm->mmap_sem); + + /* Now we map fd to filp, so userspace can access it */ + fd_install(ret, filp); +out: + return ret; +} + +long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, + unsigned long uaddr2, int val2, int val3) +{ + int ret; + + switch (op) { + case FUTEX_WAIT: + ret = futex_wait(uaddr, val, timeout); + break; + case FUTEX_WAKE: + ret = futex_wake(uaddr, val); + break; + case FUTEX_FD: + /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ + ret = futex_fd(uaddr, val); + break; + case FUTEX_REQUEUE: + ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); + break; + case FUTEX_CMP_REQUEUE: + ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); + break; + default: + ret = -ENOSYS; + } + return ret; +} + + +asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, + struct timespec __user *utime, u32 __user *uaddr2, + int val3) +{ + struct timespec t; + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + int val2 = 0; + + if ((op == FUTEX_WAIT) && utime) { + if (copy_from_user(&t, utime, sizeof(t)) != 0) + return -EFAULT; + timeout = timespec_to_jiffies(&t) + 1; + } + /* + * requeue parameter in 'utime' if op == FUTEX_REQUEUE. + */ + if (op >= FUTEX_REQUEUE) + val2 = (int) (unsigned long) utime; + + return do_futex((unsigned long)uaddr, op, val, timeout, + (unsigned long)uaddr2, val2, val3); +} + +static struct super_block * +futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); +} + +static struct file_system_type futex_fs_type = { + .name = "futexfs", + .get_sb = futexfs_get_sb, + .kill_sb = kill_anon_super, +}; + +static int __init init(void) +{ + unsigned int i; + + register_filesystem(&futex_fs_type); + futex_mnt = kern_mount(&futex_fs_type); + + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + INIT_LIST_HEAD(&futex_queues[i].chain); + spin_lock_init(&futex_queues[i].lock); + } + return 0; +} +__initcall(init); diff --git a/kernel/intermodule.c b/kernel/intermodule.c new file mode 100644 index 00000000000..388977f3e9b --- /dev/null +++ b/kernel/intermodule.c @@ -0,0 +1,182 @@ +/* Deprecated, do not use. Moved from module.c to here. --RR */ + +/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/slab.h> + +/* inter_module functions are always available, even when the kernel is + * compiled without modules. Consumers of inter_module_xxx routines + * will always work, even when both are built into the kernel, this + * approach removes lots of #ifdefs in mainline code. + */ + +static struct list_head ime_list = LIST_HEAD_INIT(ime_list); +static DEFINE_SPINLOCK(ime_lock); +static int kmalloc_failed; + +struct inter_module_entry { + struct list_head list; + const char *im_name; + struct module *owner; + const void *userdata; +}; + +/** + * inter_module_register - register a new set of inter module data. + * @im_name: an arbitrary string to identify the data, must be unique + * @owner: module that is registering the data, always use THIS_MODULE + * @userdata: pointer to arbitrary userdata to be registered + * + * Description: Check that the im_name has not already been registered, + * complain if it has. For new data, add it to the inter_module_entry + * list. + */ +void inter_module_register(const char *im_name, struct module *owner, const void *userdata) +{ + struct list_head *tmp; + struct inter_module_entry *ime, *ime_new; + + if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { + /* Overloaded kernel, not fatal */ + printk(KERN_ERR + "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", + im_name); + kmalloc_failed = 1; + return; + } + memset(ime_new, 0, sizeof(*ime_new)); + ime_new->im_name = im_name; + ime_new->owner = owner; + ime_new->userdata = userdata; + + spin_lock(&ime_lock); + list_for_each(tmp, &ime_list) { + ime = list_entry(tmp, struct inter_module_entry, list); + if (strcmp(ime->im_name, im_name) == 0) { + spin_unlock(&ime_lock); + kfree(ime_new); + /* Program logic error, fatal */ + printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); + BUG(); + } + } + list_add(&(ime_new->list), &ime_list); + spin_unlock(&ime_lock); +} + +/** + * inter_module_unregister - unregister a set of inter module data. + * @im_name: an arbitrary string to identify the data, must be unique + * + * Description: Check that the im_name has been registered, complain if + * it has not. For existing data, remove it from the + * inter_module_entry list. + */ +void inter_module_unregister(const char *im_name) +{ + struct list_head *tmp; + struct inter_module_entry *ime; + + spin_lock(&ime_lock); + list_for_each(tmp, &ime_list) { + ime = list_entry(tmp, struct inter_module_entry, list); + if (strcmp(ime->im_name, im_name) == 0) { + list_del(&(ime->list)); + spin_unlock(&ime_lock); + kfree(ime); + return; + } + } + spin_unlock(&ime_lock); + if (kmalloc_failed) { + printk(KERN_ERR + "inter_module_unregister: no entry for '%s', " + "probably caused by previous kmalloc failure\n", + im_name); + return; + } + else { + /* Program logic error, fatal */ + printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); + BUG(); + } +} + +/** + * inter_module_get - return arbitrary userdata from another module. + * @im_name: an arbitrary string to identify the data, must be unique + * + * Description: If the im_name has not been registered, return NULL. + * Try to increment the use count on the owning module, if that fails + * then return NULL. Otherwise return the userdata. + */ +static const void *inter_module_get(const char *im_name) +{ + struct list_head *tmp; + struct inter_module_entry *ime; + const void *result = NULL; + + spin_lock(&ime_lock); + list_for_each(tmp, &ime_list) { + ime = list_entry(tmp, struct inter_module_entry, list); + if (strcmp(ime->im_name, im_name) == 0) { + if (try_module_get(ime->owner)) + result = ime->userdata; + break; + } + } + spin_unlock(&ime_lock); + return(result); +} + +/** + * inter_module_get_request - im get with automatic request_module. + * @im_name: an arbitrary string to identify the data, must be unique + * @modname: module that is expected to register im_name + * + * Description: If inter_module_get fails, do request_module then retry. + */ +const void *inter_module_get_request(const char *im_name, const char *modname) +{ + const void *result = inter_module_get(im_name); + if (!result) { + request_module("%s", modname); + result = inter_module_get(im_name); + } + return(result); +} + +/** + * inter_module_put - release use of data from another module. + * @im_name: an arbitrary string to identify the data, must be unique + * + * Description: If the im_name has not been registered, complain, + * otherwise decrement the use count on the owning module. + */ +void inter_module_put(const char *im_name) +{ + struct list_head *tmp; + struct inter_module_entry *ime; + + spin_lock(&ime_lock); + list_for_each(tmp, &ime_list) { + ime = list_entry(tmp, struct inter_module_entry, list); + if (strcmp(ime->im_name, im_name) == 0) { + if (ime->owner) + module_put(ime->owner); + spin_unlock(&ime_lock); + return; + } + } + spin_unlock(&ime_lock); + printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); + BUG(); +} + +EXPORT_SYMBOL(inter_module_register); +EXPORT_SYMBOL(inter_module_unregister); +EXPORT_SYMBOL(inter_module_get_request); +EXPORT_SYMBOL(inter_module_put); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile new file mode 100644 index 00000000000..49378738ff5 --- /dev/null +++ b/kernel/irq/Makefile @@ -0,0 +1,5 @@ + +obj-y := handle.o manage.o spurious.o +obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o +obj-$(CONFIG_PROC_FS) += proc.o + diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c new file mode 100644 index 00000000000..98d62d8efea --- /dev/null +++ b/kernel/irq/autoprobe.c @@ -0,0 +1,189 @@ +/* + * linux/kernel/irq/autoprobe.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the interrupt probing code and driver APIs. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/interrupt.h> + +/* + * Autodetection depends on the fact that any interrupt that + * comes in on to an unassigned handler will get stuck with + * "IRQ_WAITING" cleared and the interrupt disabled. + */ +static DECLARE_MUTEX(probe_sem); + +/** + * probe_irq_on - begin an interrupt autodetect + * + * Commence probing for an interrupt. The interrupts are scanned + * and a mask of potential interrupt lines is returned. + * + */ +unsigned long probe_irq_on(void) +{ + unsigned long val, delay; + irq_desc_t *desc; + unsigned int i; + + down(&probe_sem); + /* + * something may have generated an irq long ago and we want to + * flush such a longstanding irq before considering it as spurious. + */ + for (i = NR_IRQS-1; i > 0; i--) { + desc = irq_desc + i; + + spin_lock_irq(&desc->lock); + if (!irq_desc[i].action) + irq_desc[i].handler->startup(i); + spin_unlock_irq(&desc->lock); + } + + /* Wait for longstanding interrupts to trigger. */ + for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) + /* about 20ms delay */ barrier(); + + /* + * enable any unassigned irqs + * (we must startup again here because if a longstanding irq + * happened in the previous stage, it may have masked itself) + */ + for (i = NR_IRQS-1; i > 0; i--) { + desc = irq_desc + i; + + spin_lock_irq(&desc->lock); + if (!desc->action) { + desc->status |= IRQ_AUTODETECT | IRQ_WAITING; + if (desc->handler->startup(i)) + desc->status |= IRQ_PENDING; + } + spin_unlock_irq(&desc->lock); + } + + /* + * Wait for spurious interrupts to trigger + */ + for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) + /* about 100ms delay */ barrier(); + + /* + * Now filter out any obviously spurious interrupts + */ + val = 0; + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + /* It triggered already - consider it spurious. */ + if (!(status & IRQ_WAITING)) { + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } else + if (i < 32) + val |= 1 << i; + } + spin_unlock_irq(&desc->lock); + } + + return val; +} + +EXPORT_SYMBOL(probe_irq_on); + +/** + * probe_irq_mask - scan a bitmap of interrupt lines + * @val: mask of interrupts to consider + * + * Scan the interrupt lines and return a bitmap of active + * autodetect interrupts. The interrupt probe logic state + * is then returned to its previous value. + * + * Note: we need to scan all the irq's even though we will + * only return autodetect irq numbers - just so that we reset + * them all to a known state. + */ +unsigned int probe_irq_mask(unsigned long val) +{ + unsigned int mask; + int i; + + mask = 0; + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + if (i < 16 && !(status & IRQ_WAITING)) + mask |= 1 << i; + + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } + spin_unlock_irq(&desc->lock); + } + up(&probe_sem); + + return mask & val; +} +EXPORT_SYMBOL(probe_irq_mask); + +/** + * probe_irq_off - end an interrupt autodetect + * @val: mask of potential interrupts (unused) + * + * Scans the unused interrupt lines and returns the line which + * appears to have triggered the interrupt. If no interrupt was + * found then zero is returned. If more than one interrupt is + * found then minus the first candidate is returned to indicate + * their is doubt. + * + * The interrupt probe logic state is returned to its previous + * value. + * + * BUGS: When used in a module (which arguably shouldn't happen) + * nothing prevents two IRQ probe callers from overlapping. The + * results of this are non-optimal. + */ +int probe_irq_off(unsigned long val) +{ + int i, irq_found = 0, nr_irqs = 0; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + if (!(status & IRQ_WAITING)) { + if (!nr_irqs) + irq_found = i; + nr_irqs++; + } + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } + spin_unlock_irq(&desc->lock); + } + up(&probe_sem); + + if (nr_irqs > 1) + irq_found = -irq_found; + return irq_found; +} + +EXPORT_SYMBOL(probe_irq_off); + diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c new file mode 100644 index 00000000000..2fb0e46e11f --- /dev/null +++ b/kernel/irq/handle.c @@ -0,0 +1,193 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the core interrupt handling code. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include "internals.h" + +/* + * Linux has a controller-independent interrupt architecture. + * Every controller has a 'controller-template', that is used + * by the main code to do the right thing. Each driver-visible + * interrupt source is transparently wired to the apropriate + * controller. Thus drivers need not be aware of the + * interrupt-controller. + * + * The code is designed to be easily extended with new/different + * interrupt controllers, without having to do assembly magic or + * having to touch the generic code. + * + * Controller mappings for all interrupt sources: + */ +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { + [0 ... NR_IRQS-1] = { + .handler = &no_irq_type, + .lock = SPIN_LOCK_UNLOCKED + } +}; + +/* + * Generic 'no controller' code + */ +static void end_none(unsigned int irq) { } +static void enable_none(unsigned int irq) { } +static void disable_none(unsigned int irq) { } +static void shutdown_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } + +static void ack_none(unsigned int irq) +{ + /* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themself. + */ + ack_bad_irq(irq); +} + +struct hw_interrupt_type no_irq_type = { + .typename = "none", + .startup = startup_none, + .shutdown = shutdown_none, + .enable = enable_none, + .disable = disable_none, + .ack = ack_none, + .end = end_none, + .set_affinity = NULL +}; + +/* + * Special, empty irq handler: + */ +irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) +{ + return IRQ_NONE; +} + +/* + * Have got an event to handle: + */ +fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action) +{ + int ret, retval = 0, status = 0; + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + + do { + ret = action->handler(irq, action->dev_id, regs); + if (ret == IRQ_HANDLED) + status |= action->flags; + retval |= ret; + action = action->next; + } while (action); + + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); + local_irq_disable(); + + return retval; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +{ + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + kstat_this_cpu.irqs[irq]++; + if (desc->status & IRQ_PER_CPU) { + irqreturn_t action_ret; + + /* + * No locking required for CPU-local interrupts: + */ + desc->handler->ack(irq); + action_ret = handle_IRQ_event(irq, regs, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + desc->handler->end(irq); + return 1; + } + + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* + * REPLAY is when Linux resends an IRQ that was dropped earlier + * WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ + + /* + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have. + */ + action = NULL; + if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ + } + desc->status = status; + + /* + * If there is no IRQ handler or it was disabled, exit early. + * Since we set PENDING, if another processor is handling + * a different instance of this same irq, the other processor + * will take care of it. + */ + if (unlikely(!action)) + goto out; + + /* + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. + */ + for (;;) { + irqreturn_t action_ret; + + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + if (likely(!(desc->status & IRQ_PENDING))) + break; + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + +out: + /* + * The ->end() handler has to deal with interrupts which got + * disabled while the handler was running. + */ + desc->handler->end(irq); + spin_unlock(&desc->lock); + + return 1; +} + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h new file mode 100644 index 00000000000..46feba63026 --- /dev/null +++ b/kernel/irq/internals.h @@ -0,0 +1,18 @@ +/* + * IRQ subsystem internal functions and variables: + */ + +extern int noirqdebug; + +#ifdef CONFIG_PROC_FS +extern void register_irq_proc(unsigned int irq); +extern void register_handler_proc(unsigned int irq, struct irqaction *action); +extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); +#else +static inline void register_irq_proc(unsigned int irq) { } +static inline void register_handler_proc(unsigned int irq, + struct irqaction *action) { } +static inline void unregister_handler_proc(unsigned int irq, + struct irqaction *action) { } +#endif + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c new file mode 100644 index 00000000000..5202e4c4a5b --- /dev/null +++ b/kernel/irq/manage.c @@ -0,0 +1,349 @@ +/* + * linux/kernel/irq/manage.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains driver APIs to the irq subsystem. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> + +#include "internals.h" + +#ifdef CONFIG_SMP + +cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); +} + +EXPORT_SYMBOL(synchronize_irq); + +#endif + +/** + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + * + * This function may be called from IRQ context. + */ +void disable_irq_nosync(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +EXPORT_SYMBOL(disable_irq_nosync); + +/** + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void disable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + + disable_irq_nosync(irq); + if (desc->action) + synchronize_irq(irq); +} + +EXPORT_SYMBOL(disable_irq); + +/** + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable + * + * Undoes the effect of one call to disable_irq(). If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + * + * This function may be called from IRQ context. + */ +void enable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + switch (desc->depth) { + case 0: + WARN_ON(1); + break; + case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + + desc->status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = status | IRQ_REPLAY; + hw_resend_irq(desc->handler,irq); + } + desc->handler->enable(irq); + /* fall-through */ + } + default: + desc->depth--; + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +EXPORT_SYMBOL(enable_irq); + +/* + * Internal function that tells the architecture code whether a + * particular irq has been exclusively allocated or is available + * for driver use. + */ +int can_request_irq(unsigned int irq, unsigned long irqflags) +{ + struct irqaction *action; + + if (irq >= NR_IRQS) + return 0; + + action = irq_desc[irq].action; + if (action) + if (irqflags & action->flags & SA_SHIRQ) + action = NULL; + + return !action; +} + +/* + * Internal function to register an irqaction - typically used to + * allocate special interrupts that are part of the architecture. + */ +int setup_irq(unsigned int irq, struct irqaction * new) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *old, **p; + unsigned long flags; + int shared = 0; + + if (desc->handler == &no_irq_type) + return -ENOSYS; + /* + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. + */ + if (new->flags & SA_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block. + * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); + } + + /* + * The following block of code has to be executed atomically + */ + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + if ((old = *p) != NULL) { + /* Can't share interrupts unless both agree to */ + if (!(old->flags & new->flags & SA_SHIRQ)) { + spin_unlock_irqrestore(&desc->lock,flags); + return -EBUSY; + } + + /* add new interrupt at end of irq queue */ + do { + p = &old->next; + old = *p; + } while (old); + shared = 1; + } + + *p = new; + + if (!shared) { + desc->depth = 0; + desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | + IRQ_WAITING | IRQ_INPROGRESS); + if (desc->handler->startup) + desc->handler->startup(irq); + else + desc->handler->enable(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + + new->irq = irq; + register_irq_proc(irq); + new->dir = NULL; + register_handler_proc(irq, new); + + return 0; +} + +/** + * free_irq - free an interrupt + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. + * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc; + struct irqaction **p; + unsigned long flags; + + if (irq >= NR_IRQS) + return; + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + for (;;) { + struct irqaction * action = *p; + + if (action) { + struct irqaction **pp = p; + + p = &action->next; + if (action->dev_id != dev_id) + continue; + + /* Found it - now remove it from the list of entries */ + *pp = action->next; + if (!desc->action) { + desc->status |= IRQ_DISABLED; + if (desc->handler->shutdown) + desc->handler->shutdown(irq); + else + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + unregister_handler_proc(irq, action); + + /* Make sure it's not being used on another CPU */ + synchronize_irq(irq); + kfree(action); + return; + } + printk(KERN_ERR "Trying to free free IRQ%d\n",irq); + spin_unlock_irqrestore(&desc->lock,flags); + return; + } +} + +EXPORT_SYMBOL(free_irq); + +/** + * request_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. From the point this + * call is made your handler function may be invoked. Since + * your handler function must clear any interrupt the board + * raises, you must take care both to initialise your hardware + * and to set up the interrupt handler in the right order. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id + * as this is required when freeing the interrupt. + * + * Flags: + * + * SA_SHIRQ Interrupt is shared + * SA_INTERRUPT Disable local interrupts while processing + * SA_SAMPLE_RANDOM The interrupt can be used for entropy + * + */ +int request_irq(unsigned int irq, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, const char * devname, void *dev_id) +{ + struct irqaction * action; + int retval; + + /* + * Sanity-check: shared interrupts must pass in a real dev-ID, + * otherwise we'll have trouble later trying to figure out + * which interrupt is which (messes up the interrupt freeing + * logic etc). + */ + if ((irqflags & SA_SHIRQ) && !dev_id) + return -EINVAL; + if (irq >= NR_IRQS) + return -EINVAL; + if (!handler) + return -EINVAL; + + action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags; + cpus_clear(action->mask); + action->name = devname; + action->next = NULL; + action->dev_id = dev_id; + + retval = setup_irq(irq, action); + if (retval) + kfree(action); + + return retval; +} + +EXPORT_SYMBOL(request_irq); + diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c new file mode 100644 index 00000000000..85d08daa660 --- /dev/null +++ b/kernel/irq/proc.c @@ -0,0 +1,159 @@ +/* + * linux/kernel/irq/proc.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the /proc/irq/ handling code. + */ + +#include <linux/irq.h> +#include <linux/proc_fs.h> +#include <linux/interrupt.h> + +static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; + +#ifdef CONFIG_SMP + +/* + * The /proc/irq/<irq>/smp_affinity values: + */ +static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; + +void __attribute__((weak)) +proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + irq_affinity[irq] = mask_val; + irq_desc[irq].handler->set_affinity(irq, mask_val); +} + +static int irq_affinity_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); + + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +int no_irq_affinity; +static int irq_affinity_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int irq = (int)(long)data, full_count = count, err; + cpumask_t new_value, tmp; + + if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) + return -EIO; + + err = cpumask_parse(buffer, count, new_value); + if (err) + return err; + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. + */ + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) + return -EINVAL; + + proc_set_irq_affinity(irq, new_value); + + return full_count; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_dir[irq] || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_dir[irq]); +} + +#undef MAX_NAMELEN + +#define MAX_NAMELEN 10 + +void register_irq_proc(unsigned int irq) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || + (irq_desc[irq].handler == &no_irq_type) || + irq_dir[irq]) + return; + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%d", irq); + + /* create /proc/irq/1234 */ + irq_dir[irq] = proc_mkdir(name, root_irq_dir); + +#ifdef CONFIG_SMP + { + struct proc_dir_entry *entry; + + /* create /proc/irq/<irq>/smp_affinity */ + entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); + + if (entry) { + entry->nlink = 1; + entry->data = (void *)(long)irq; + entry->read_proc = irq_affinity_read_proc; + entry->write_proc = irq_affinity_write_proc; + } + smp_affinity_entry[irq] = entry; + } +#endif +} + +#undef MAX_NAMELEN + +void unregister_handler_proc(unsigned int irq, struct irqaction *action) +{ + if (action->dir) + remove_proc_entry(action->dir->name, irq_dir[irq]); +} + +void init_irq_proc(void) +{ + int i; + + /* create /proc/irq */ + root_irq_dir = proc_mkdir("irq", NULL); + if (!root_irq_dir) + return; + + /* + * Create entries for all existing IRQs. + */ + for (i = 0; i < NR_IRQS; i++) + register_irq_proc(i); +} + diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c new file mode 100644 index 00000000000..f6297c30690 --- /dev/null +++ b/kernel/irq/spurious.c @@ -0,0 +1,96 @@ +/* + * linux/kernel/irq/spurious.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains spurious interrupt handling. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/interrupt.h> + +/* + * If 99,900 of the previous 100,000 interrupts have not been handled + * then assume that the IRQ is stuck in some manner. Drop a diagnostic + * and try to turn the IRQ off. + * + * (The other 100-of-100,000 interrupts may have been a correctly + * functioning device sharing an IRQ with the failing one) + * + * Called under desc->lock + */ + +static void +__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + struct irqaction *action; + + if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + printk(KERN_ERR "irq event %d: bogus return value %x\n", + irq, action_ret); + } else { + printk(KERN_ERR "irq %d: nobody cared!\n", irq); + } + dump_stack(); + printk(KERN_ERR "handlers:\n"); + action = desc->action; + while (action) { + printk(KERN_ERR "[<%p>]", action->handler); + print_symbol(" (%s)", + (unsigned long)action->handler); + printk("\n"); + action = action->next; + } +} + +void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + static int count = 100; + + if (count > 0) { + count--; + __report_bad_irq(irq, desc, action_ret); + } +} + +void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + if (action_ret != IRQ_HANDLED) { + desc->irqs_unhandled++; + if (action_ret != IRQ_NONE) + report_bad_irq(irq, desc, action_ret); + } + + desc->irq_count++; + if (desc->irq_count < 100000) + return; + + desc->irq_count = 0; + if (desc->irqs_unhandled > 99900) { + /* + * The interrupt is stuck + */ + __report_bad_irq(irq, desc, action_ret); + /* + * Now kill the IRQ + */ + printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + desc->irqs_unhandled = 0; +} + +int noirqdebug; + +int __init noirqdebug_setup(char *str) +{ + noirqdebug = 1; + printk(KERN_INFO "IRQ lockup detection disabled\n"); + return 1; +} + +__setup("noirqdebug", noirqdebug_setup); + diff --git a/kernel/itimer.c b/kernel/itimer.c new file mode 100644 index 00000000000..e9a40e947e0 --- /dev/null +++ b/kernel/itimer.c @@ -0,0 +1,241 @@ +/* + * linux/kernel/itimer.c + * + * Copyright (C) 1992 Darren Senn + */ + +/* These are all the functions necessary to implement itimers */ + +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/syscalls.h> +#include <linux/time.h> +#include <linux/posix-timers.h> + +#include <asm/uaccess.h> + +static unsigned long it_real_value(struct signal_struct *sig) +{ + unsigned long val = 0; + if (timer_pending(&sig->real_timer)) { + val = sig->real_timer.expires - jiffies; + + /* look out for negative/zero itimer.. */ + if ((long) val <= 0) + val = 1; + } + return val; +} + +int do_getitimer(int which, struct itimerval *value) +{ + struct task_struct *tsk = current; + unsigned long interval, val; + cputime_t cinterval, cval; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + interval = tsk->signal->it_real_incr; + val = it_real_value(tsk->signal); + spin_unlock_irq(&tsk->sighand->siglock); + jiffies_to_timeval(val, &value->it_value); + jiffies_to_timeval(interval, &value->it_interval); + break; + case ITIMER_VIRTUAL: + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_virt_expires; + cinterval = tsk->signal->it_virt_incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_struct *t = tsk; + cputime_t utime = tsk->signal->utime; + do { + utime = cputime_add(utime, t->utime); + t = next_thread(t); + } while (t != tsk); + if (cputime_le(cval, utime)) { /* about to fire */ + cval = jiffies_to_cputime(1); + } else { + cval = cputime_sub(cval, utime); + } + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); + break; + case ITIMER_PROF: + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_prof_expires; + cinterval = tsk->signal->it_prof_incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_struct *t = tsk; + cputime_t ptime = cputime_add(tsk->signal->utime, + tsk->signal->stime); + do { + ptime = cputime_add(ptime, + cputime_add(t->utime, + t->stime)); + t = next_thread(t); + } while (t != tsk); + if (cputime_le(cval, ptime)) { /* about to fire */ + cval = jiffies_to_cputime(1); + } else { + cval = cputime_sub(cval, ptime); + } + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); + break; + default: + return(-EINVAL); + } + return 0; +} + +asmlinkage long sys_getitimer(int which, struct itimerval __user *value) +{ + int error = -EFAULT; + struct itimerval get_buffer; + + if (value) { + error = do_getitimer(which, &get_buffer); + if (!error && + copy_to_user(value, &get_buffer, sizeof(get_buffer))) + error = -EFAULT; + } + return error; +} + +/* + * Called with P->sighand->siglock held and P->signal->real_timer inactive. + * If interval is nonzero, arm the timer for interval ticks from now. + */ +static inline void it_real_arm(struct task_struct *p, unsigned long interval) +{ + p->signal->it_real_value = interval; /* XXX unnecessary field?? */ + if (interval == 0) + return; + if (interval > (unsigned long) LONG_MAX) + interval = LONG_MAX; + p->signal->real_timer.expires = jiffies + interval; + add_timer(&p->signal->real_timer); +} + +void it_real_fn(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); + + /* + * Now restart the timer if necessary. We don't need any locking + * here because do_setitimer makes sure we have finished running + * before it touches anything. + */ + it_real_arm(p, p->signal->it_real_incr); +} + +int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +{ + struct task_struct *tsk = current; + unsigned long val, interval; + cputime_t cval, cinterval, nval, ninterval; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + interval = tsk->signal->it_real_incr; + val = it_real_value(tsk->signal); + if (val) + del_timer_sync(&tsk->signal->real_timer); + tsk->signal->it_real_incr = + timeval_to_jiffies(&value->it_interval); + it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); + spin_unlock_irq(&tsk->sighand->siglock); + if (ovalue) { + jiffies_to_timeval(val, &ovalue->it_value); + jiffies_to_timeval(interval, + &ovalue->it_interval); + } + break; + case ITIMER_VIRTUAL: + nval = timeval_to_cputime(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_virt_expires; + cinterval = tsk->signal->it_virt_incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, + jiffies_to_cputime(1)); + set_process_cpu_timer(tsk, CPUCLOCK_VIRT, + &nval, &cval); + } + tsk->signal->it_virt_expires = nval; + tsk->signal->it_virt_incr = ninterval; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } + break; + case ITIMER_PROF: + nval = timeval_to_cputime(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + cval = tsk->signal->it_prof_expires; + cinterval = tsk->signal->it_prof_incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, + jiffies_to_cputime(1)); + set_process_cpu_timer(tsk, CPUCLOCK_PROF, + &nval, &cval); + } + tsk->signal->it_prof_expires = nval; + tsk->signal->it_prof_incr = ninterval; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } + break; + default: + return -EINVAL; + } + return 0; +} + +asmlinkage long sys_setitimer(int which, + struct itimerval __user *value, + struct itimerval __user *ovalue) +{ + struct itimerval set_buffer, get_buffer; + int error; + + if (value) { + if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) + return -EFAULT; + } else + memset((char *) &set_buffer, 0, sizeof(set_buffer)); + + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) + return error; + + if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + return -EFAULT; + return 0; +} diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c new file mode 100644 index 00000000000..1627f8d6e0c --- /dev/null +++ b/kernel/kallsyms.c @@ -0,0 +1,411 @@ +/* + * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. + * + * Rewritten and vastly simplified by Rusty Russell for in-kernel + * module loader: + * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * + * ChangeLog: + * + * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> + * Changed the compression method from stem compression to "table lookup" + * compression (see scripts/kallsyms.c for a more complete description) + */ +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/err.h> +#include <linux/proc_fs.h> +#include <linux/mm.h> + +#include <asm/sections.h> + +#ifdef CONFIG_KALLSYMS_ALL +#define all_var 1 +#else +#define all_var 0 +#endif + +/* These will be re-linked against their real values during the second link stage */ +extern unsigned long kallsyms_addresses[] __attribute__((weak)); +extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); +extern u8 kallsyms_names[] __attribute__((weak)); + +extern u8 kallsyms_token_table[] __attribute__((weak)); +extern u16 kallsyms_token_index[] __attribute__((weak)); + +extern unsigned long kallsyms_markers[] __attribute__((weak)); + +static inline int is_kernel_inittext(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext + && addr <= (unsigned long)_einittext) + return 1; + return 0; +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) + return 1; + return in_gate_area_no_task(addr); +} + +static inline int is_kernel(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) + return 1; + return in_gate_area_no_task(addr); +} + +/* expand a compressed symbol data into the resulting uncompressed string, + given the offset to where the symbol is in the compressed stream */ +static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) +{ + int len, skipped_first = 0; + u8 *tptr, *data; + + /* get the compressed symbol length from the first symbol byte */ + data = &kallsyms_names[off]; + len = *data; + data++; + + /* update the offset to return the offset for the next symbol on + * the compressed stream */ + off += len + 1; + + /* for every byte on the compressed symbol data, copy the table + entry for that byte */ + while(len) { + tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; + data++; + len--; + + while (*tptr) { + if(skipped_first) { + *result = *tptr; + result++; + } else + skipped_first = 1; + tptr++; + } + } + + *result = '\0'; + + /* return to offset to the next symbol */ + return off; +} + +/* get symbol type information. This is encoded as a single char at the + * begining of the symbol name */ +static char kallsyms_get_symbol_type(unsigned int off) +{ + /* get just the first code, look it up in the token table, and return the + * first char from this token */ + return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; +} + + +/* find the offset on the compressed stream given and index in the + * kallsyms array */ +static unsigned int get_symbol_offset(unsigned long pos) +{ + u8 *name; + int i; + + /* use the closest marker we have. We have markers every 256 positions, + * so that should be close enough */ + name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; + + /* sequentially scan all the symbols up to the point we're searching for. + * Every symbol is stored in a [<len>][<len> bytes of data] format, so we + * just need to add the len to the current pointer for every symbol we + * wish to skip */ + for(i = 0; i < (pos&0xFF); i++) + name = name + (*name) + 1; + + return name - kallsyms_names; +} + +/* Lookup the address for this symbol. Returns 0 if not found. */ +unsigned long kallsyms_lookup_name(const char *name) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long i; + unsigned int off; + + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { + off = kallsyms_expand_symbol(off, namebuf); + + if (strcmp(namebuf, name) == 0) + return kallsyms_addresses[i]; + } + return module_kallsyms_lookup_name(name); +} +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); + +/* + * Lookup an address + * - modname is set to NULL if it's in the kernel + * - we guarantee that the returned name is valid until we reschedule even if + * it resides in a module + * - we also guarantee that modname will be valid until rescheduled + */ +const char *kallsyms_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, char *namebuf) +{ + unsigned long i, low, high, mid; + const char *msym; + + /* This kernel should never had been booted. */ + BUG_ON(!kallsyms_addresses); + + namebuf[KSYM_NAME_LEN] = 0; + namebuf[0] = 0; + + if ((all_var && is_kernel(addr)) || + (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) { + unsigned long symbol_end=0; + + /* do a binary search on the sorted kallsyms_addresses array */ + low = 0; + high = kallsyms_num_syms; + + while (high-low > 1) { + mid = (low + high) / 2; + if (kallsyms_addresses[mid] <= addr) low = mid; + else high = mid; + } + + /* search for the first aliased symbol. Aliased symbols are + symbols with the same address */ + while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low]) + --low; + + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(low), namebuf); + + /* Search for next non-aliased symbol */ + for (i = low + 1; i < kallsyms_num_syms; i++) { + if (kallsyms_addresses[i] > kallsyms_addresses[low]) { + symbol_end = kallsyms_addresses[i]; + break; + } + } + + /* if we found no next symbol, we use the end of the section */ + if (!symbol_end) { + if (is_kernel_inittext(addr)) + symbol_end = (unsigned long)_einittext; + else + symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext; + } + + *symbolsize = symbol_end - kallsyms_addresses[low]; + *modname = NULL; + *offset = addr - kallsyms_addresses[low]; + return namebuf; + } + + /* see if it's in a module */ + msym = module_address_lookup(addr, symbolsize, offset, modname); + if (msym) + return strncpy(namebuf, msym, KSYM_NAME_LEN); + + return NULL; +} + +/* Replace "%s" in format with address, or returns -errno. */ +void __print_symbol(const char *fmt, unsigned long address) +{ + char *modname; + const char *name; + unsigned long offset, size; + char namebuf[KSYM_NAME_LEN+1]; + char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN + + 2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1]; + + name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + + if (!name) + sprintf(buffer, "0x%lx", address); + else { + if (modname) + sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, + size, modname); + else + sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); + } + printk(fmt, buffer); +} + +/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ +struct kallsym_iter +{ + loff_t pos; + struct module *owner; + unsigned long value; + unsigned int nameoff; /* If iterating in core kernel symbols */ + char type; + char name[KSYM_NAME_LEN+1]; +}; + +/* Only label it "global" if it is exported. */ +static void upcase_if_global(struct kallsym_iter *iter) +{ + if (is_exported(iter->name, iter->owner)) + iter->type += 'A' - 'a'; +} + +static int get_ksymbol_mod(struct kallsym_iter *iter) +{ + iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, + &iter->type, iter->name); + if (iter->owner == NULL) + return 0; + + upcase_if_global(iter); + return 1; +} + +/* Returns space to next name. */ +static unsigned long get_ksymbol_core(struct kallsym_iter *iter) +{ + unsigned off = iter->nameoff; + + iter->owner = NULL; + iter->value = kallsyms_addresses[iter->pos]; + + iter->type = kallsyms_get_symbol_type(off); + + off = kallsyms_expand_symbol(off, iter->name); + + return off - iter->nameoff; +} + +static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) +{ + iter->name[0] = '\0'; + iter->nameoff = get_symbol_offset(new_pos); + iter->pos = new_pos; +} + +/* Returns false if pos at or past end of file. */ +static int update_iter(struct kallsym_iter *iter, loff_t pos) +{ + /* Module symbols can be accessed randomly. */ + if (pos >= kallsyms_num_syms) { + iter->pos = pos; + return get_ksymbol_mod(iter); + } + + /* If we're not on the desired position, reset to new position. */ + if (pos != iter->pos) + reset_iter(iter, pos); + + iter->nameoff += get_ksymbol_core(iter); + iter->pos++; + + return 1; +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + (*pos)++; + + if (!update_iter(m->private, *pos)) + return NULL; + return p; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + if (!update_iter(m->private, *pos)) + return NULL; + return m->private; +} + +static void s_stop(struct seq_file *m, void *p) +{ +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kallsym_iter *iter = m->private; + + /* Some debugging symbols have no name. Ignore them. */ + if (!iter->name[0]) + return 0; + + if (iter->owner) + seq_printf(m, "%0*lx %c %s\t[%s]\n", + (int)(2*sizeof(void*)), + iter->value, iter->type, iter->name, + module_name(iter->owner)); + else + seq_printf(m, "%0*lx %c %s\n", + (int)(2*sizeof(void*)), + iter->value, iter->type, iter->name); + return 0; +} + +static struct seq_operations kallsyms_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show +}; + +static int kallsyms_open(struct inode *inode, struct file *file) +{ + /* We keep iterator in m->private, since normal case is to + * s_start from where we left off, so we avoid doing + * using get_symbol_offset for every symbol */ + struct kallsym_iter *iter; + int ret; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + reset_iter(iter, 0); + + ret = seq_open(file, &kallsyms_op); + if (ret == 0) + ((struct seq_file *)file->private_data)->private = iter; + else + kfree(iter); + return ret; +} + +static int kallsyms_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + kfree(m->private); + return seq_release(inode, file); +} + +static struct file_operations kallsyms_operations = { + .open = kallsyms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kallsyms_release, +}; + +static int __init kallsyms_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("kallsyms", 0444, NULL); + if (entry) + entry->proc_fops = &kallsyms_operations; + return 0; +} +__initcall(kallsyms_init); + +EXPORT_SYMBOL(__print_symbol); diff --git a/kernel/kfifo.c b/kernel/kfifo.c new file mode 100644 index 00000000000..179baafcdd9 --- /dev/null +++ b/kernel/kfifo.c @@ -0,0 +1,168 @@ +/* + * A simple kernel FIFO implementation. + * + * Copyright (C) 2004 Stelian Pop <stelian@popies.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/kfifo.h> + +/** + * kfifo_init - allocates a new FIFO using a preallocated buffer + * @buffer: the preallocated buffer to be used. + * @size: the size of the internal buffer, this have to be a power of 2. + * @gfp_mask: get_free_pages mask, passed to kmalloc() + * @lock: the lock to be used to protect the fifo buffer + * + * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the + * struct kfifo with kfree(). + */ +struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, + unsigned int __nocast gfp_mask, spinlock_t *lock) +{ + struct kfifo *fifo; + + /* size must be a power of 2 */ + BUG_ON(size & (size - 1)); + + fifo = kmalloc(sizeof(struct kfifo), gfp_mask); + if (!fifo) + return ERR_PTR(-ENOMEM); + + fifo->buffer = buffer; + fifo->size = size; + fifo->in = fifo->out = 0; + fifo->lock = lock; + + return fifo; +} +EXPORT_SYMBOL(kfifo_init); + +/** + * kfifo_alloc - allocates a new FIFO and its internal buffer + * @size: the size of the internal buffer to be allocated. + * @gfp_mask: get_free_pages mask, passed to kmalloc() + * @lock: the lock to be used to protect the fifo buffer + * + * The size will be rounded-up to a power of 2. + */ +struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) +{ + unsigned char *buffer; + struct kfifo *ret; + + /* + * round up to the next power of 2, since our 'let the indices + * wrap' tachnique works only in this case. + */ + if (size & (size - 1)) { + BUG_ON(size > 0x80000000); + size = roundup_pow_of_two(size); + } + + buffer = kmalloc(size, gfp_mask); + if (!buffer) + return ERR_PTR(-ENOMEM); + + ret = kfifo_init(buffer, size, gfp_mask, lock); + + if (IS_ERR(ret)) + kfree(buffer); + + return ret; +} +EXPORT_SYMBOL(kfifo_alloc); + +/** + * kfifo_free - frees the FIFO + * @fifo: the fifo to be freed. + */ +void kfifo_free(struct kfifo *fifo) +{ + kfree(fifo->buffer); + kfree(fifo); +} +EXPORT_SYMBOL(kfifo_free); + +/** + * __kfifo_put - puts some data into the FIFO, no locking version + * @fifo: the fifo to be used. + * @buffer: the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most 'len' bytes from the 'buffer' into + * the FIFO depending on the free space, and returns the number of + * bytes copied. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int __kfifo_put(struct kfifo *fifo, + unsigned char *buffer, unsigned int len) +{ + unsigned int l; + + len = min(len, fifo->size - fifo->in + fifo->out); + + /* first put the data starting from fifo->in to buffer end */ + l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); + memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); + + /* then put the rest (if any) at the beginning of the buffer */ + memcpy(fifo->buffer, buffer + l, len - l); + + fifo->in += len; + + return len; +} +EXPORT_SYMBOL(__kfifo_put); + +/** + * __kfifo_get - gets some data from the FIFO, no locking version + * @fifo: the fifo to be used. + * @buffer: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most 'len' bytes from the FIFO into the + * 'buffer' and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int __kfifo_get(struct kfifo *fifo, + unsigned char *buffer, unsigned int len) +{ + unsigned int l; + + len = min(len, fifo->in - fifo->out); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); + memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + + /* then get the rest (if any) from the beginning of the buffer */ + memcpy(buffer + l, fifo->buffer, len - l); + + fifo->out += len; + + return len; +} +EXPORT_SYMBOL(__kfifo_get); diff --git a/kernel/kmod.c b/kernel/kmod.c new file mode 100644 index 00000000000..eed53d4f523 --- /dev/null +++ b/kernel/kmod.c @@ -0,0 +1,256 @@ +/* + kmod, the new module loader (replaces kerneld) + Kirk Petersen + + Reorganized not to be a daemon by Adam Richter, with guidance + from Greg Zornetzer. + + Modified to avoid chroot and file sharing problems. + Mikael Pettersson + + Limit the concurrent number of kmod modprobes to catch loops from + "modprobe needs a service that is in a module". + Keith Owens <kaos@ocs.com.au> December 1999 + + Unblock all signals when we exec a usermode process. + Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000 + + call_usermodehelper wait flag, and remove exec_usermodehelper. + Rusty Russell <rusty@rustcorp.com.au> Jan 2003 +*/ +#define __KERNEL_SYSCALLS__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/smp_lock.h> +#include <linux/slab.h> +#include <linux/namespace.h> +#include <linux/completion.h> +#include <linux/file.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/uaccess.h> + +extern int max_threads; + +static struct workqueue_struct *khelper_wq; + +#ifdef CONFIG_KMOD + +/* + modprobe_path is set via /proc/sys. +*/ +char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; + +/** + * request_module - try to load a kernel module + * @fmt: printf style format string for the name of the module + * @varargs: arguements as specified in the format string + * + * Load a module using the user mode module loader. The function returns + * zero on success or a negative errno code on failure. Note that a + * successful module load does not mean the module did not then unload + * and exit on an error of its own. Callers must check that the service + * they requested is now available not blindly invoke it. + * + * If module auto-loading support is disabled then this function + * becomes a no-operation. + */ +int request_module(const char *fmt, ...) +{ + va_list args; + char module_name[MODULE_NAME_LEN]; + unsigned int max_modprobes; + int ret; + char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL }; + static atomic_t kmod_concurrent = ATOMIC_INIT(0); +#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ + static int kmod_loop_msg; + + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); + if (ret >= MODULE_NAME_LEN) + return -ENAMETOOLONG; + + /* If modprobe needs a service that is in a module, we get a recursive + * loop. Limit the number of running kmod threads to max_threads/2 or + * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method + * would be to run the parents of this process, counting how many times + * kmod was invoked. That would mean accessing the internals of the + * process tables to get the command line, proc_pid_cmdline is static + * and it is not worth changing the proc code just to handle this case. + * KAO. + * + * "trace the ppid" is simple, but will fail if someone's + * parent exits. I think this is as good as it gets. --RR + */ + max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); + atomic_inc(&kmod_concurrent); + if (atomic_read(&kmod_concurrent) > max_modprobes) { + /* We may be blaming an innocent here, but unlikely */ + if (kmod_loop_msg++ < 5) + printk(KERN_ERR + "request_module: runaway loop modprobe %s\n", + module_name); + atomic_dec(&kmod_concurrent); + return -ENOMEM; + } + + ret = call_usermodehelper(modprobe_path, argv, envp, 1); + atomic_dec(&kmod_concurrent); + return ret; +} +EXPORT_SYMBOL(request_module); +#endif /* CONFIG_KMOD */ + +struct subprocess_info { + struct completion *complete; + char *path; + char **argv; + char **envp; + int wait; + int retval; +}; + +/* + * This is the task which runs the usermode application + */ +static int ____call_usermodehelper(void *data) +{ + struct subprocess_info *sub_info = data; + int retval; + + /* Unblock all signals. */ + flush_signals(current); + spin_lock_irq(¤t->sighand->siglock); + flush_signal_handlers(current, 1); + sigemptyset(¤t->blocked); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + /* We can run anywhere, unlike our parent keventd(). */ + set_cpus_allowed(current, CPU_MASK_ALL); + + retval = -EPERM; + if (current->fs->root) + retval = execve(sub_info->path, sub_info->argv,sub_info->envp); + + /* Exec failed? */ + sub_info->retval = retval; + do_exit(0); +} + +/* Keventd can't block, but this (a child) can. */ +static int wait_for_helper(void *data) +{ + struct subprocess_info *sub_info = data; + pid_t pid; + struct k_sigaction sa; + + /* Install a handler: if SIGCLD isn't handled sys_wait4 won't + * populate the status, but will return -ECHILD. */ + sa.sa.sa_handler = SIG_IGN; + sa.sa.sa_flags = 0; + siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); + do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + allow_signal(SIGCHLD); + + pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + } else { + /* + * Normally it is bogus to call wait4() from in-kernel because + * wait4() wants to write the exit code to a userspace address. + * But wait_for_helper() always runs as keventd, and put_user() + * to a kernel address works OK for kernel threads, due to their + * having an mm_segment_t which spans the entire address space. + * + * Thus the __user pointer cast is valid here. + */ + sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); + } + + complete(sub_info->complete); + return 0; +} + +/* This is run by khelper thread */ +static void __call_usermodehelper(void *data) +{ + struct subprocess_info *sub_info = data; + pid_t pid; + + /* CLONE_VFORK: wait until the usermode helper has execve'd + * successfully We need the data structures to stay around + * until that is done. */ + if (sub_info->wait) + pid = kernel_thread(wait_for_helper, sub_info, + CLONE_FS | CLONE_FILES | SIGCHLD); + else + pid = kernel_thread(____call_usermodehelper, sub_info, + CLONE_VFORK | SIGCHLD); + + if (pid < 0) { + sub_info->retval = pid; + complete(sub_info->complete); + } else if (!sub_info->wait) + complete(sub_info->complete); +} + +/** + * call_usermodehelper - start a usermode application + * @path: pathname for the application + * @argv: null-terminated argument list + * @envp: null-terminated environment list + * @wait: wait for the application to finish and return status. + * + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of keventd. + * (ie. it runs with full root capabilities). + * + * Must be called from process context. Returns a negative error code + * if program was not execed successfully, or 0. + */ +int call_usermodehelper(char *path, char **argv, char **envp, int wait) +{ + DECLARE_COMPLETION(done); + struct subprocess_info sub_info = { + .complete = &done, + .path = path, + .argv = argv, + .envp = envp, + .wait = wait, + .retval = 0, + }; + DECLARE_WORK(work, __call_usermodehelper, &sub_info); + + if (!khelper_wq) + return -EBUSY; + + if (path[0] == '\0') + return 0; + + queue_work(khelper_wq, &work); + wait_for_completion(&done); + return sub_info.retval; +} +EXPORT_SYMBOL(call_usermodehelper); + +void __init usermodehelper_init(void) +{ + khelper_wq = create_singlethread_workqueue("khelper"); + BUG_ON(!khelper_wq); +} diff --git a/kernel/kprobes.c b/kernel/kprobes.c new file mode 100644 index 00000000000..1d5dd1337bd --- /dev/null +++ b/kernel/kprobes.c @@ -0,0 +1,157 @@ +/* + * Kernel Probes (KProbes) + * kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation (includes suggestions from + * Rusty Russell). + * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with + * hlists and exceptions notifier as suggested by Andi Kleen. + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes + * exceptions notifier to be first on the priority list. + */ +#include <linux/kprobes.h> +#include <linux/spinlock.h> +#include <linux/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <asm/cacheflush.h> +#include <asm/errno.h> +#include <asm/kdebug.h> + +#define KPROBE_HASH_BITS 6 +#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; + +unsigned int kprobe_cpu = NR_CPUS; +static DEFINE_SPINLOCK(kprobe_lock); + +/* Locks kprobe: irqs must be disabled */ +void lock_kprobes(void) +{ + spin_lock(&kprobe_lock); + kprobe_cpu = smp_processor_id(); +} + +void unlock_kprobes(void) +{ + kprobe_cpu = NR_CPUS; + spin_unlock(&kprobe_lock); +} + +/* You have to be holding the kprobe_lock */ +struct kprobe *get_kprobe(void *addr) +{ + struct hlist_head *head; + struct hlist_node *node; + + head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; + hlist_for_each(node, head) { + struct kprobe *p = hlist_entry(node, struct kprobe, hlist); + if (p->addr == addr) + return p; + } + return NULL; +} + +int register_kprobe(struct kprobe *p) +{ + int ret = 0; + unsigned long flags = 0; + + if ((ret = arch_prepare_kprobe(p)) != 0) { + goto rm_kprobe; + } + spin_lock_irqsave(&kprobe_lock, flags); + INIT_HLIST_NODE(&p->hlist); + if (get_kprobe(p->addr)) { + ret = -EEXIST; + goto out; + } + arch_copy_kprobe(p); + + hlist_add_head(&p->hlist, + &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + p->opcode = *p->addr; + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +out: + spin_unlock_irqrestore(&kprobe_lock, flags); +rm_kprobe: + if (ret == -EEXIST) + arch_remove_kprobe(p); + return ret; +} + +void unregister_kprobe(struct kprobe *p) +{ + unsigned long flags; + arch_remove_kprobe(p); + spin_lock_irqsave(&kprobe_lock, flags); + *p->addr = p->opcode; + hlist_del(&p->hlist); + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irqrestore(&kprobe_lock, flags); +} + +static struct notifier_block kprobe_exceptions_nb = { + .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ +}; + +int register_jprobe(struct jprobe *jp) +{ + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + + return register_kprobe(&jp->kp); +} + +void unregister_jprobe(struct jprobe *jp) +{ + unregister_kprobe(&jp->kp); +} + +static int __init init_kprobes(void) +{ + int i, err = 0; + + /* FIXME allocate the probe table, currently defined statically */ + /* initialize all list heads */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&kprobe_table[i]); + + err = register_die_notifier(&kprobe_exceptions_nb); + return err; +} + +__initcall(init_kprobes); + +EXPORT_SYMBOL_GPL(register_kprobe); +EXPORT_SYMBOL_GPL(unregister_kprobe); +EXPORT_SYMBOL_GPL(register_jprobe); +EXPORT_SYMBOL_GPL(unregister_jprobe); +EXPORT_SYMBOL_GPL(jprobe_return); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c new file mode 100644 index 00000000000..1f064a63f8c --- /dev/null +++ b/kernel/ksysfs.c @@ -0,0 +1,57 @@ +/* + * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which + * are not related to any other subsystem + * + * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> + * + * This file is release under the GPLv2 + * + */ + +#include <linux/config.h> +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/module.h> +#include <linux/init.h> + +#define KERNEL_ATTR_RO(_name) \ +static struct subsys_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct subsys_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#ifdef CONFIG_HOTPLUG +static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); +} +KERNEL_ATTR_RO(hotplug_seqnum); +#endif + +decl_subsys(kernel, NULL, NULL); +EXPORT_SYMBOL_GPL(kernel_subsys); + +static struct attribute * kernel_attrs[] = { +#ifdef CONFIG_HOTPLUG + &hotplug_seqnum_attr.attr, +#endif + NULL +}; + +static struct attribute_group kernel_attr_group = { + .attrs = kernel_attrs, +}; + +static int __init ksysfs_init(void) +{ + int error = subsystem_register(&kernel_subsys); + if (!error) + error = sysfs_create_group(&kernel_subsys.kset.kobj, + &kernel_attr_group); + + return error; +} + +core_initcall(ksysfs_init); diff --git a/kernel/kthread.c b/kernel/kthread.c new file mode 100644 index 00000000000..e377e224410 --- /dev/null +++ b/kernel/kthread.c @@ -0,0 +1,202 @@ +/* Kernel thread helper functions. + * Copyright (C) 2004 IBM Corporation, Rusty Russell. + * + * Creation is done via keventd, so that we get a clean environment + * even if we're invoked from userspace (think modprobe, hotplug cpu, + * etc.). + */ +#include <linux/sched.h> +#include <linux/kthread.h> +#include <linux/completion.h> +#include <linux/err.h> +#include <linux/unistd.h> +#include <linux/file.h> +#include <linux/module.h> +#include <asm/semaphore.h> + +/* + * We dont want to execute off keventd since it might + * hold a semaphore our callers hold too: + */ +static struct workqueue_struct *helper_wq; + +struct kthread_create_info +{ + /* Information passed to kthread() from keventd. */ + int (*threadfn)(void *data); + void *data; + struct completion started; + + /* Result passed back to kthread_create() from keventd. */ + struct task_struct *result; + struct completion done; +}; + +struct kthread_stop_info +{ + struct task_struct *k; + int err; + struct completion done; +}; + +/* Thread stopping is done by setthing this var: lock serializes + * multiple kthread_stop calls. */ +static DECLARE_MUTEX(kthread_stop_lock); +static struct kthread_stop_info kthread_stop_info; + +int kthread_should_stop(void) +{ + return (kthread_stop_info.k == current); +} +EXPORT_SYMBOL(kthread_should_stop); + +static void kthread_exit_files(void) +{ + struct fs_struct *fs; + struct task_struct *tsk = current; + + exit_fs(tsk); /* current->fs->count--; */ + fs = init_task.fs; + tsk->fs = fs; + atomic_inc(&fs->count); + exit_files(tsk); + current->files = init_task.files; + atomic_inc(&tsk->files->count); +} + +static int kthread(void *_create) +{ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data); + void *data; + sigset_t blocked; + int ret = -EINTR; + + kthread_exit_files(); + + /* Copy data: it's on keventd's stack */ + threadfn = create->threadfn; + data = create->data; + + /* Block and flush all signals (in case we're not from keventd). */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* By default we can run anywhere, unlike keventd. */ + set_cpus_allowed(current, CPU_MASK_ALL); + + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_INTERRUPTIBLE); + complete(&create->started); + schedule(); + + if (!kthread_should_stop()) + ret = threadfn(data); + + /* It might have exited on its own, w/o kthread_stop. Check. */ + if (kthread_should_stop()) { + kthread_stop_info.err = ret; + complete(&kthread_stop_info.done); + } + return 0; +} + +/* We are keventd: create a thread. */ +static void keventd_create_kthread(void *_create) +{ + struct kthread_create_info *create = _create; + int pid; + + /* We want our own signal handler (we take no signals by default). */ + pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); + if (pid < 0) { + create->result = ERR_PTR(pid); + } else { + wait_for_completion(&create->started); + create->result = find_task_by_pid(pid); + } + complete(&create->done); +} + +struct task_struct *kthread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], + ...) +{ + struct kthread_create_info create; + DECLARE_WORK(work, keventd_create_kthread, &create); + + create.threadfn = threadfn; + create.data = data; + init_completion(&create.started); + init_completion(&create.done); + + /* + * The workqueue needs to start up first: + */ + if (!helper_wq) + work.func(work.data); + else { + queue_work(helper_wq, &work); + wait_for_completion(&create.done); + } + if (!IS_ERR(create.result)) { + va_list args; + va_start(args, namefmt); + vsnprintf(create.result->comm, sizeof(create.result->comm), + namefmt, args); + va_end(args); + } + + return create.result; +} +EXPORT_SYMBOL(kthread_create); + +void kthread_bind(struct task_struct *k, unsigned int cpu) +{ + BUG_ON(k->state != TASK_INTERRUPTIBLE); + /* Must have done schedule() in kthread() before we set_task_cpu */ + wait_task_inactive(k); + set_task_cpu(k, cpu); + k->cpus_allowed = cpumask_of_cpu(cpu); +} +EXPORT_SYMBOL(kthread_bind); + +int kthread_stop(struct task_struct *k) +{ + int ret; + + down(&kthread_stop_lock); + + /* It could exit after stop_info.k set, but before wake_up_process. */ + get_task_struct(k); + + /* Must init completion *before* thread sees kthread_stop_info.k */ + init_completion(&kthread_stop_info.done); + wmb(); + + /* Now set kthread_should_stop() to true, and wake it up. */ + kthread_stop_info.k = k; + wake_up_process(k); + put_task_struct(k); + + /* Once it dies, reset stop ptr, gather result and we're done. */ + wait_for_completion(&kthread_stop_info.done); + kthread_stop_info.k = NULL; + ret = kthread_stop_info.err; + up(&kthread_stop_lock); + + return ret; +} +EXPORT_SYMBOL(kthread_stop); + +static __init int helper_init(void) +{ + helper_wq = create_singlethread_workqueue("kthread"); + BUG_ON(!helper_wq); + + return 0; +} +core_initcall(helper_init); + diff --git a/kernel/module.c b/kernel/module.c new file mode 100644 index 00000000000..2dbfa0773fa --- /dev/null +++ b/kernel/module.c @@ -0,0 +1,2108 @@ +/* Rewritten by Rusty Russell, on the backs of many others... + Copyright (C) 2002 Richard Henderson + Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/moduleloader.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/elf.h> +#include <linux/seq_file.h> +#include <linux/syscalls.h> +#include <linux/fcntl.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/moduleparam.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/vermagic.h> +#include <linux/notifier.h> +#include <linux/stop_machine.h> +#include <linux/device.h> +#include <asm/uaccess.h> +#include <asm/semaphore.h> +#include <asm/cacheflush.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , a...) +#endif + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* If this is set, the section belongs in the init part of the module */ +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) + +/* Protects module list */ +static DEFINE_SPINLOCK(modlist_lock); + +/* List of modules, protected by module_mutex AND modlist_lock */ +static DECLARE_MUTEX(module_mutex); +static LIST_HEAD(modules); + +static DECLARE_MUTEX(notify_mutex); +static struct notifier_block * module_notify_list; + +int register_module_notifier(struct notifier_block * nb) +{ + int err; + down(¬ify_mutex); + err = notifier_chain_register(&module_notify_list, nb); + up(¬ify_mutex); + return err; +} +EXPORT_SYMBOL(register_module_notifier); + +int unregister_module_notifier(struct notifier_block * nb) +{ + int err; + down(¬ify_mutex); + err = notifier_chain_unregister(&module_notify_list, nb); + up(¬ify_mutex); + return err; +} +EXPORT_SYMBOL(unregister_module_notifier); + +/* We require a truly strong try_module_get() */ +static inline int strong_try_module_get(struct module *mod) +{ + if (mod && mod->state == MODULE_STATE_COMING) + return 0; + return try_module_get(mod); +} + +/* A thread that wants to hold a reference to a module only while it + * is running can call ths to safely exit. + * nfsd and lockd use this. + */ +void __module_put_and_exit(struct module *mod, long code) +{ + module_put(mod); + do_exit(code); +} +EXPORT_SYMBOL(__module_put_and_exit); + +/* Find a module section: 0 means not found. */ +static unsigned int find_sec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings, + const char *name) +{ + unsigned int i; + + for (i = 1; i < hdr->e_shnum; i++) + /* Alloc bit cleared means "ignore it." */ + if ((sechdrs[i].sh_flags & SHF_ALLOC) + && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + return i; + return 0; +} + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const unsigned long __start___kcrctab[]; +extern const unsigned long __start___kcrctab_gpl[]; + +#ifndef CONFIG_MODVERSIONS +#define symversion(base, idx) NULL +#else +#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) +#endif + +/* Find a symbol, return value, crc and module which owns it */ +static unsigned long __find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + int gplok) +{ + struct module *mod; + unsigned int i; + + /* Core kernel first. */ + *owner = NULL; + for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { + if (strcmp(__start___ksymtab[i].name, name) == 0) { + *crc = symversion(__start___kcrctab, i); + return __start___ksymtab[i].value; + } + } + if (gplok) { + for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) + if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { + *crc = symversion(__start___kcrctab_gpl, i); + return __start___ksymtab_gpl[i].value; + } + } + + /* Now try modules. */ + list_for_each_entry(mod, &modules, list) { + *owner = mod; + for (i = 0; i < mod->num_syms; i++) + if (strcmp(mod->syms[i].name, name) == 0) { + *crc = symversion(mod->crcs, i); + return mod->syms[i].value; + } + + if (gplok) { + for (i = 0; i < mod->num_gpl_syms; i++) { + if (strcmp(mod->gpl_syms[i].name, name) == 0) { + *crc = symversion(mod->gpl_crcs, i); + return mod->gpl_syms[i].value; + } + } + } + } + DEBUGP("Failed to find symbol %s\n", name); + return 0; +} + +/* Find a symbol in this elf symbol table */ +static unsigned long find_local_symbol(Elf_Shdr *sechdrs, + unsigned int symindex, + const char *strtab, + const char *name) +{ + unsigned int i; + Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; + + /* Search (defined) internal symbols first. */ + for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { + if (sym[i].st_shndx != SHN_UNDEF + && strcmp(name, strtab + sym[i].st_name) == 0) + return sym[i].st_value; + } + return 0; +} + +/* Search for module by name: must hold module_mutex. */ +static struct module *find_module(const char *name) +{ + struct module *mod; + + list_for_each_entry(mod, &modules, list) { + if (strcmp(mod->name, name) == 0) + return mod; + } + return NULL; +} + +#ifdef CONFIG_SMP +/* Number of blocks used and allocated. */ +static unsigned int pcpu_num_used, pcpu_num_allocated; +/* Size of each block. -ve means used. */ +static int *pcpu_size; + +static int split_block(unsigned int i, unsigned short size) +{ + /* Reallocation required? */ + if (pcpu_num_used + 1 > pcpu_num_allocated) { + int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, + GFP_KERNEL); + if (!new) + return 0; + + memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); + pcpu_num_allocated *= 2; + kfree(pcpu_size); + pcpu_size = new; + } + + /* Insert a new subblock */ + memmove(&pcpu_size[i+1], &pcpu_size[i], + sizeof(pcpu_size[0]) * (pcpu_num_used - i)); + pcpu_num_used++; + + pcpu_size[i+1] -= size; + pcpu_size[i] = size; + return 1; +} + +static inline unsigned int block_size(int val) +{ + if (val < 0) + return -val; + return val; +} + +/* Created by linker magic */ +extern char __per_cpu_start[], __per_cpu_end[]; + +static void *percpu_modalloc(unsigned long size, unsigned long align) +{ + unsigned long extra; + unsigned int i; + void *ptr; + + BUG_ON(align > SMP_CACHE_BYTES); + + ptr = __per_cpu_start; + for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { + /* Extra for alignment requirement. */ + extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; + BUG_ON(i == 0 && extra != 0); + + if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) + continue; + + /* Transfer extra to previous block. */ + if (pcpu_size[i-1] < 0) + pcpu_size[i-1] -= extra; + else + pcpu_size[i-1] += extra; + pcpu_size[i] -= extra; + ptr += extra; + + /* Split block if warranted */ + if (pcpu_size[i] - size > sizeof(unsigned long)) + if (!split_block(i, size)) + return NULL; + + /* Mark allocated */ + pcpu_size[i] = -pcpu_size[i]; + return ptr; + } + + printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", + size); + return NULL; +} + +static void percpu_modfree(void *freeme) +{ + unsigned int i; + void *ptr = __per_cpu_start + block_size(pcpu_size[0]); + + /* First entry is core kernel percpu data. */ + for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { + if (ptr == freeme) { + pcpu_size[i] = -pcpu_size[i]; + goto free; + } + } + BUG(); + + free: + /* Merge with previous? */ + if (pcpu_size[i-1] >= 0) { + pcpu_size[i-1] += pcpu_size[i]; + pcpu_num_used--; + memmove(&pcpu_size[i], &pcpu_size[i+1], + (pcpu_num_used - i) * sizeof(pcpu_size[0])); + i--; + } + /* Merge with next? */ + if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { + pcpu_size[i] += pcpu_size[i+1]; + pcpu_num_used--; + memmove(&pcpu_size[i+1], &pcpu_size[i+2], + (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); + } +} + +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static int percpu_modinit(void) +{ + pcpu_num_used = 2; + pcpu_num_allocated = 2; + pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, + GFP_KERNEL); + /* Static in-kernel percpu data (used). */ + pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); + /* Free room. */ + pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; + if (pcpu_size[1] < 0) { + printk(KERN_ERR "No per-cpu room for modules.\n"); + pcpu_num_used = 1; + } + + return 0; +} +__initcall(percpu_modinit); +#else /* ... !CONFIG_SMP */ +static inline void *percpu_modalloc(unsigned long size, unsigned long align) +{ + return NULL; +} +static inline void percpu_modfree(void *pcpuptr) +{ + BUG(); +} +static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return 0; +} +static inline void percpu_modcopy(void *pcpudst, const void *src, + unsigned long size) +{ + /* pcpusec should be 0, and size of that section should be 0. */ + BUG_ON(size != 0); +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_MODULE_UNLOAD +/* Init the unload section of the module. */ +static void module_unload_init(struct module *mod) +{ + unsigned int i; + + INIT_LIST_HEAD(&mod->modules_which_use_me); + for (i = 0; i < NR_CPUS; i++) + local_set(&mod->ref[i].count, 0); + /* Hold reference count during initialization. */ + local_set(&mod->ref[_smp_processor_id()].count, 1); + /* Backwards compatibility macros put refcount during init. */ + mod->waiter = current; +} + +/* modules using other modules */ +struct module_use +{ + struct list_head list; + struct module *module_which_uses; +}; + +/* Does a already use b? */ +static int already_uses(struct module *a, struct module *b) +{ + struct module_use *use; + + list_for_each_entry(use, &b->modules_which_use_me, list) { + if (use->module_which_uses == a) { + DEBUGP("%s uses %s!\n", a->name, b->name); + return 1; + } + } + DEBUGP("%s does not use %s!\n", a->name, b->name); + return 0; +} + +/* Module a uses b */ +static int use_module(struct module *a, struct module *b) +{ + struct module_use *use; + if (b == NULL || already_uses(a, b)) return 1; + + if (!strong_try_module_get(b)) + return 0; + + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk("%s: out of memory loading\n", a->name); + module_put(b); + return 0; + } + + use->module_which_uses = a; + list_add(&use->list, &b->modules_which_use_me); + return 1; +} + +/* Clear the unload stuff of the module. */ +static void module_unload_free(struct module *mod) +{ + struct module *i; + + list_for_each_entry(i, &modules, list) { + struct module_use *use; + + list_for_each_entry(use, &i->modules_which_use_me, list) { + if (use->module_which_uses == mod) { + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->list); + kfree(use); + /* There can be at most one match. */ + break; + } + } + } +} + +#ifdef CONFIG_MODULE_FORCE_UNLOAD +static inline int try_force(unsigned int flags) +{ + int ret = (flags & O_TRUNC); + if (ret) + tainted |= TAINT_FORCED_MODULE; + return ret; +} +#else +static inline int try_force(unsigned int flags) +{ + return 0; +} +#endif /* CONFIG_MODULE_FORCE_UNLOAD */ + +struct stopref +{ + struct module *mod; + int flags; + int *forced; +}; + +/* Whole machine is stopped with interrupts off when this runs. */ +static int __try_stop_module(void *_sref) +{ + struct stopref *sref = _sref; + + /* If it's not unused, quit unless we are told to block. */ + if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { + if (!(*sref->forced = try_force(sref->flags))) + return -EWOULDBLOCK; + } + + /* Mark it as dying. */ + sref->mod->state = MODULE_STATE_GOING; + return 0; +} + +static int try_stop_module(struct module *mod, int flags, int *forced) +{ + struct stopref sref = { mod, flags, forced }; + + return stop_machine_run(__try_stop_module, &sref, NR_CPUS); +} + +unsigned int module_refcount(struct module *mod) +{ + unsigned int i, total = 0; + + for (i = 0; i < NR_CPUS; i++) + total += local_read(&mod->ref[i].count); + return total; +} +EXPORT_SYMBOL(module_refcount); + +/* This exists whether we can unload or not */ +static void free_module(struct module *mod); + +static void wait_for_zero_refcount(struct module *mod) +{ + /* Since we might sleep for some time, drop the semaphore first */ + up(&module_mutex); + for (;;) { + DEBUGP("Looking at refcount...\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + if (module_refcount(mod) == 0) + break; + schedule(); + } + current->state = TASK_RUNNING; + down(&module_mutex); +} + +asmlinkage long +sys_delete_module(const char __user *name_user, unsigned int flags) +{ + struct module *mod; + char name[MODULE_NAME_LEN]; + int ret, forced = 0; + + if (!capable(CAP_SYS_MODULE)) + return -EPERM; + + if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) + return -EFAULT; + name[MODULE_NAME_LEN-1] = '\0'; + + if (down_interruptible(&module_mutex) != 0) + return -EINTR; + + mod = find_module(name); + if (!mod) { + ret = -ENOENT; + goto out; + } + + if (!list_empty(&mod->modules_which_use_me)) { + /* Other modules depend on us: get rid of them first. */ + ret = -EWOULDBLOCK; + goto out; + } + + /* Doing init or already dying? */ + if (mod->state != MODULE_STATE_LIVE) { + /* FIXME: if (force), slam module count and wake up + waiter --RR */ + DEBUGP("%s already dying\n", mod->name); + ret = -EBUSY; + goto out; + } + + /* If it has an init func, it must have an exit func to unload */ + if ((mod->init != NULL && mod->exit == NULL) + || mod->unsafe) { + forced = try_force(flags); + if (!forced) { + /* This module can't be removed */ + ret = -EBUSY; + goto out; + } + } + + /* Set this up before setting mod->state */ + mod->waiter = current; + + /* Stop the machine so refcounts can't move and disable module. */ + ret = try_stop_module(mod, flags, &forced); + if (ret != 0) + goto out; + + /* Never wait if forced. */ + if (!forced && module_refcount(mod) != 0) + wait_for_zero_refcount(mod); + + /* Final destruction now noone is using it. */ + if (mod->exit != NULL) { + up(&module_mutex); + mod->exit(); + down(&module_mutex); + } + free_module(mod); + + out: + up(&module_mutex); + return ret; +} + +static void print_unload_info(struct seq_file *m, struct module *mod) +{ + struct module_use *use; + int printed_something = 0; + + seq_printf(m, " %u ", module_refcount(mod)); + + /* Always include a trailing , so userspace can differentiate + between this and the old multi-field proc format. */ + list_for_each_entry(use, &mod->modules_which_use_me, list) { + printed_something = 1; + seq_printf(m, "%s,", use->module_which_uses->name); + } + + if (mod->unsafe) { + printed_something = 1; + seq_printf(m, "[unsafe],"); + } + + if (mod->init != NULL && mod->exit == NULL) { + printed_something = 1; + seq_printf(m, "[permanent],"); + } + + if (!printed_something) + seq_printf(m, "-"); +} + +void __symbol_put(const char *symbol) +{ + struct module *owner; + unsigned long flags; + const unsigned long *crc; + + spin_lock_irqsave(&modlist_lock, flags); + if (!__find_symbol(symbol, &owner, &crc, 1)) + BUG(); + module_put(owner); + spin_unlock_irqrestore(&modlist_lock, flags); +} +EXPORT_SYMBOL(__symbol_put); + +void symbol_put_addr(void *addr) +{ + unsigned long flags; + + spin_lock_irqsave(&modlist_lock, flags); + if (!kernel_text_address((unsigned long)addr)) + BUG(); + + module_put(module_text_address((unsigned long)addr)); + spin_unlock_irqrestore(&modlist_lock, flags); +} +EXPORT_SYMBOL_GPL(symbol_put_addr); + +static ssize_t show_refcnt(struct module_attribute *mattr, + struct module *mod, char *buffer) +{ + /* sysfs holds a reference */ + return sprintf(buffer, "%u\n", module_refcount(mod)-1); +} + +static struct module_attribute refcnt = { + .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, + .show = show_refcnt, +}; + +#else /* !CONFIG_MODULE_UNLOAD */ +static void print_unload_info(struct seq_file *m, struct module *mod) +{ + /* We don't know the usage count, or what modules are using. */ + seq_printf(m, " - -"); +} + +static inline void module_unload_free(struct module *mod) +{ +} + +static inline int use_module(struct module *a, struct module *b) +{ + return strong_try_module_get(b); +} + +static inline void module_unload_init(struct module *mod) +{ +} +#endif /* CONFIG_MODULE_UNLOAD */ + +#ifdef CONFIG_OBSOLETE_MODPARM +/* Bounds checking done below */ +static int obsparm_copy_string(const char *val, struct kernel_param *kp) +{ + strcpy(kp->arg, val); + return 0; +} + +int set_obsolete(const char *val, struct kernel_param *kp) +{ + unsigned int min, max; + unsigned int size, maxsize; + int dummy; + char *endp; + const char *p; + struct obsolete_modparm *obsparm = kp->arg; + + if (!val) { + printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); + return -EINVAL; + } + + /* type is: [min[-max]]{b,h,i,l,s} */ + p = obsparm->type; + min = simple_strtol(p, &endp, 10); + if (endp == obsparm->type) + min = max = 1; + else if (*endp == '-') { + p = endp+1; + max = simple_strtol(p, &endp, 10); + } else + max = min; + switch (*endp) { + case 'b': + return param_array(kp->name, val, min, max, obsparm->addr, + 1, param_set_byte, &dummy); + case 'h': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(short), param_set_short, &dummy); + case 'i': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(int), param_set_int, &dummy); + case 'l': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(long), param_set_long, &dummy); + case 's': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(char *), param_set_charp, &dummy); + + case 'c': + /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, + and the decl is "char xxx[5][50];" */ + p = endp+1; + maxsize = simple_strtol(p, &endp, 10); + /* We check lengths here (yes, this is a hack). */ + p = val; + while (p[size = strcspn(p, ",")]) { + if (size >= maxsize) + goto oversize; + p += size+1; + } + if (size >= maxsize) + goto oversize; + return param_array(kp->name, val, min, max, obsparm->addr, + maxsize, obsparm_copy_string, &dummy); + } + printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); + return -EINVAL; + oversize: + printk(KERN_ERR + "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); + return -EINVAL; +} + +static int obsolete_params(const char *name, + char *args, + struct obsolete_modparm obsparm[], + unsigned int num, + Elf_Shdr *sechdrs, + unsigned int symindex, + const char *strtab) +{ + struct kernel_param *kp; + unsigned int i; + int ret; + + kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); + if (!kp) + return -ENOMEM; + + for (i = 0; i < num; i++) { + char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; + + snprintf(sym_name, sizeof(sym_name), "%s%s", + MODULE_SYMBOL_PREFIX, obsparm[i].name); + + kp[i].name = obsparm[i].name; + kp[i].perm = 000; + kp[i].set = set_obsolete; + kp[i].get = NULL; + obsparm[i].addr + = (void *)find_local_symbol(sechdrs, symindex, strtab, + sym_name); + if (!obsparm[i].addr) { + printk("%s: falsely claims to have parameter %s\n", + name, obsparm[i].name); + ret = -EINVAL; + goto out; + } + kp[i].arg = &obsparm[i]; + } + + ret = parse_args(name, args, kp, num, NULL); + out: + kfree(kp); + return ret; +} +#else +static int obsolete_params(const char *name, + char *args, + struct obsolete_modparm obsparm[], + unsigned int num, + Elf_Shdr *sechdrs, + unsigned int symindex, + const char *strtab) +{ + if (num != 0) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + name); + return 0; +} +#endif /* CONFIG_OBSOLETE_MODPARM */ + +static const char vermagic[] = VERMAGIC_STRING; + +#ifdef CONFIG_MODVERSIONS +static int check_version(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *symname, + struct module *mod, + const unsigned long *crc) +{ + unsigned int i, num_versions; + struct modversion_info *versions; + + /* Exporting module didn't supply crcs? OK, we're already tainted. */ + if (!crc) + return 1; + + versions = (void *) sechdrs[versindex].sh_addr; + num_versions = sechdrs[versindex].sh_size + / sizeof(struct modversion_info); + + for (i = 0; i < num_versions; i++) { + if (strcmp(versions[i].name, symname) != 0) + continue; + + if (versions[i].crc == *crc) + return 1; + printk("%s: disagrees about version of symbol %s\n", + mod->name, symname); + DEBUGP("Found checksum %lX vs module %lX\n", + *crc, versions[i].crc); + return 0; + } + /* Not in module's version table. OK, but that taints the kernel. */ + if (!(tainted & TAINT_FORCED_MODULE)) { + printk("%s: no version for \"%s\" found: kernel tainted.\n", + mod->name, symname); + tainted |= TAINT_FORCED_MODULE; + } + return 1; +} + +static inline int check_modstruct_version(Elf_Shdr *sechdrs, + unsigned int versindex, + struct module *mod) +{ + const unsigned long *crc; + struct module *owner; + + if (!__find_symbol("struct_module", &owner, &crc, 1)) + BUG(); + return check_version(sechdrs, versindex, "struct_module", mod, + crc); +} + +/* First part is kernel version, which we ignore. */ +static inline int same_magic(const char *amagic, const char *bmagic) +{ + amagic += strcspn(amagic, " "); + bmagic += strcspn(bmagic, " "); + return strcmp(amagic, bmagic) == 0; +} +#else +static inline int check_version(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *symname, + struct module *mod, + const unsigned long *crc) +{ + return 1; +} + +static inline int check_modstruct_version(Elf_Shdr *sechdrs, + unsigned int versindex, + struct module *mod) +{ + return 1; +} + +static inline int same_magic(const char *amagic, const char *bmagic) +{ + return strcmp(amagic, bmagic) == 0; +} +#endif /* CONFIG_MODVERSIONS */ + +/* Resolve a symbol for this module. I.e. if we find one, record usage. + Must be holding module_mutex. */ +static unsigned long resolve_symbol(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + struct module *owner; + unsigned long ret; + const unsigned long *crc; + + spin_lock_irq(&modlist_lock); + ret = __find_symbol(name, &owner, &crc, mod->license_gplok); + if (ret) { + /* use_module can fail due to OOM, or module unloading */ + if (!check_version(sechdrs, versindex, name, mod, crc) || + !use_module(mod, owner)) + ret = 0; + } + spin_unlock_irq(&modlist_lock); + return ret; +} + + +/* + * /sys/module/foo/sections stuff + * J. Corbet <corbet@lwn.net> + */ +#ifdef CONFIG_KALLSYMS +static ssize_t module_sect_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_sect_attr *sattr = + container_of(mattr, struct module_sect_attr, mattr); + return sprintf(buf, "0x%lx\n", sattr->address); +} + +static void add_sect_attrs(struct module *mod, unsigned int nsect, + char *secstrings, Elf_Shdr *sechdrs) +{ + unsigned int nloaded = 0, i, size[2]; + struct module_sect_attrs *sect_attrs; + struct module_sect_attr *sattr; + struct attribute **gattr; + + /* Count loaded sections and allocate structures */ + for (i = 0; i < nsect; i++) + if (sechdrs[i].sh_flags & SHF_ALLOC) + nloaded++; + size[0] = ALIGN(sizeof(*sect_attrs) + + nloaded * sizeof(sect_attrs->attrs[0]), + sizeof(sect_attrs->grp.attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) + return; + + /* Setup section attributes. */ + sect_attrs->grp.name = "sections"; + sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + + sattr = §_attrs->attrs[0]; + gattr = §_attrs->grp.attrs[0]; + for (i = 0; i < nsect; i++) { + if (! (sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + sattr->address = sechdrs[i].sh_addr; + strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, + MODULE_SECT_NAME_LEN); + sattr->mattr.show = module_sect_show; + sattr->mattr.store = NULL; + sattr->mattr.attr.name = sattr->name; + sattr->mattr.attr.owner = mod; + sattr->mattr.attr.mode = S_IRUGO; + *(gattr++) = &(sattr++)->mattr.attr; + } + *gattr = NULL; + + if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + goto out; + + mod->sect_attrs = sect_attrs; + return; + out: + kfree(sect_attrs); +} + +static void remove_sect_attrs(struct module *mod) +{ + if (mod->sect_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->sect_attrs->grp); + /* We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. */ + kfree(mod->sect_attrs); + mod->sect_attrs = NULL; + } +} + + +#else +static inline void add_sect_attrs(struct module *mod, unsigned int nsect, + char *sectstrings, Elf_Shdr *sechdrs) +{ +} + +static inline void remove_sect_attrs(struct module *mod) +{ +} +#endif /* CONFIG_KALLSYMS */ + + +#ifdef CONFIG_MODULE_UNLOAD +static inline int module_add_refcnt_attr(struct module *mod) +{ + return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); +} +static void module_remove_refcnt_attr(struct module *mod) +{ + return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); +} +#else +static inline int module_add_refcnt_attr(struct module *mod) +{ + return 0; +} +static void module_remove_refcnt_attr(struct module *mod) +{ +} +#endif + + +static int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); + if (err) + goto out; + kobj_set_kset_s(&mod->mkobj, module_subsys); + mod->mkobj.mod = mod; + err = kobject_register(&mod->mkobj.kobj); + if (err) + goto out; + + err = module_add_refcnt_attr(mod); + if (err) + goto out_unreg; + + err = module_param_sysfs_setup(mod, kparam, num_params); + if (err) + goto out_unreg; + + return 0; + +out_unreg: + kobject_unregister(&mod->mkobj.kobj); +out: + return err; +} + +static void mod_kobject_remove(struct module *mod) +{ + module_remove_refcnt_attr(mod); + module_param_sysfs_remove(mod); + + kobject_unregister(&mod->mkobj.kobj); +} + +/* + * unlink the module with the whole machine is stopped with interrupts off + * - this defends against kallsyms not taking locks + */ +static int __unlink_module(void *_mod) +{ + struct module *mod = _mod; + list_del(&mod->list); + return 0; +} + +/* Free a module, remove from lists, etc (must hold module mutex). */ +static void free_module(struct module *mod) +{ + /* Delete from various lists */ + stop_machine_run(__unlink_module, mod, NR_CPUS); + remove_sect_attrs(mod); + mod_kobject_remove(mod); + + /* Arch-specific cleanup. */ + module_arch_cleanup(mod); + + /* Module unload stuff */ + module_unload_free(mod); + + /* This may be NULL, but that's OK */ + module_free(mod, mod->module_init); + kfree(mod->args); + if (mod->percpu) + percpu_modfree(mod->percpu); + + /* Finally, free the core (containing the module structure) */ + module_free(mod, mod->module_core); +} + +void *__symbol_get(const char *symbol) +{ + struct module *owner; + unsigned long value, flags; + const unsigned long *crc; + + spin_lock_irqsave(&modlist_lock, flags); + value = __find_symbol(symbol, &owner, &crc, 1); + if (value && !strong_try_module_get(owner)) + value = 0; + spin_unlock_irqrestore(&modlist_lock, flags); + + return (void *)value; +} +EXPORT_SYMBOL_GPL(__symbol_get); + +/* Change all symbols so that sh_value encodes the pointer directly. */ +static int simplify_symbols(Elf_Shdr *sechdrs, + unsigned int symindex, + const char *strtab, + unsigned int versindex, + unsigned int pcpuindex, + struct module *mod) +{ + Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; + unsigned long secbase; + unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + int ret = 0; + + for (i = 1; i < n; i++) { + switch (sym[i].st_shndx) { + case SHN_COMMON: + /* We compiled with -fno-common. These are not + supposed to happen. */ + DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); + printk("%s: please compile with -fno-common\n", + mod->name); + ret = -ENOEXEC; + break; + + case SHN_ABS: + /* Don't need to do anything */ + DEBUGP("Absolute symbol: 0x%08lx\n", + (long)sym[i].st_value); + break; + + case SHN_UNDEF: + sym[i].st_value + = resolve_symbol(sechdrs, versindex, + strtab + sym[i].st_name, mod); + + /* Ok if resolved. */ + if (sym[i].st_value != 0) + break; + /* Ok if weak. */ + if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + break; + + printk(KERN_WARNING "%s: Unknown symbol %s\n", + mod->name, strtab + sym[i].st_name); + ret = -ENOENT; + break; + + default: + /* Divert to percpu allocation if a percpu var. */ + if (sym[i].st_shndx == pcpuindex) + secbase = (unsigned long)mod->percpu; + else + secbase = sechdrs[sym[i].st_shndx].sh_addr; + sym[i].st_value += secbase; + break; + } + } + + return ret; +} + +/* Update size with this section: return offset. */ +static long get_offset(unsigned long *size, Elf_Shdr *sechdr) +{ + long ret; + + ret = ALIGN(*size, sechdr->sh_addralign ?: 1); + *size = ret + sechdr->sh_size; + return ret; +} + +/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld + might -- code, read-only data, read-write data, small data. Tally + sizes, and place the offsets into sh_entsize fields: high bit means it + belongs in init. */ +static void layout_sections(struct module *mod, + const Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + static unsigned long const masks[][2] = { + /* NOTE: all executable code must be the first section + * in this array; otherwise modify the text_size + * finder in the two loops below */ + { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, + { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, + { ARCH_SHF_SMALL | SHF_ALLOC, 0 } + }; + unsigned int m, i; + + for (i = 0; i < hdr->e_shnum; i++) + sechdrs[i].sh_entsize = ~0UL; + + DEBUGP("Core section allocation order:\n"); + for (m = 0; m < ARRAY_SIZE(masks); ++m) { + for (i = 0; i < hdr->e_shnum; ++i) { + Elf_Shdr *s = &sechdrs[i]; + + if ((s->sh_flags & masks[m][0]) != masks[m][0] + || (s->sh_flags & masks[m][1]) + || s->sh_entsize != ~0UL + || strncmp(secstrings + s->sh_name, + ".init", 5) == 0) + continue; + s->sh_entsize = get_offset(&mod->core_size, s); + DEBUGP("\t%s\n", secstrings + s->sh_name); + } + if (m == 0) + mod->core_text_size = mod->core_size; + } + + DEBUGP("Init section allocation order:\n"); + for (m = 0; m < ARRAY_SIZE(masks); ++m) { + for (i = 0; i < hdr->e_shnum; ++i) { + Elf_Shdr *s = &sechdrs[i]; + + if ((s->sh_flags & masks[m][0]) != masks[m][0] + || (s->sh_flags & masks[m][1]) + || s->sh_entsize != ~0UL + || strncmp(secstrings + s->sh_name, + ".init", 5) != 0) + continue; + s->sh_entsize = (get_offset(&mod->init_size, s) + | INIT_OFFSET_MASK); + DEBUGP("\t%s\n", secstrings + s->sh_name); + } + if (m == 0) + mod->init_text_size = mod->init_size; + } +} + +static inline int license_is_gpl_compatible(const char *license) +{ + return (strcmp(license, "GPL") == 0 + || strcmp(license, "GPL v2") == 0 + || strcmp(license, "GPL and additional rights") == 0 + || strcmp(license, "Dual BSD/GPL") == 0 + || strcmp(license, "Dual MPL/GPL") == 0); +} + +static void set_license(struct module *mod, const char *license) +{ + if (!license) + license = "unspecified"; + + mod->license_gplok = license_is_gpl_compatible(license); + if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { + printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", + mod->name, license); + tainted |= TAINT_PROPRIETARY_MODULE; + } +} + +/* Parse tag=value strings from .modinfo section */ +static char *next_string(char *string, unsigned long *secsize) +{ + /* Skip non-zero chars */ + while (string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + + /* Skip any zero padding. */ + while (!string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + return string; +} + +static char *get_modinfo(Elf_Shdr *sechdrs, + unsigned int info, + const char *tag) +{ + char *p; + unsigned int taglen = strlen(tag); + unsigned long size = sechdrs[info].sh_size; + + for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { + if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') + return p + taglen + 1; + } + return NULL; +} + +#ifdef CONFIG_KALLSYMS +int is_exported(const char *name, const struct module *mod) +{ + unsigned int i; + + if (!mod) { + for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) + if (strcmp(__start___ksymtab[i].name, name) == 0) + return 1; + return 0; + } + for (i = 0; i < mod->num_syms; i++) + if (strcmp(mod->syms[i].name, name) == 0) + return 1; + return 0; +} + +/* As per nm */ +static char elf_type(const Elf_Sym *sym, + Elf_Shdr *sechdrs, + const char *secstrings, + struct module *mod) +{ + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) + return 'v'; + else + return 'w'; + } + if (sym->st_shndx == SHN_UNDEF) + return 'U'; + if (sym->st_shndx == SHN_ABS) + return 'a'; + if (sym->st_shndx >= SHN_LORESERVE) + return '?'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) + return 't'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC + && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { + if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) + return 'r'; + else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 'g'; + else + return 'd'; + } + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 's'; + else + return 'b'; + } + if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug", strlen(".debug")) == 0) + return 'n'; + return '?'; +} + +static void add_kallsyms(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const char *secstrings) +{ + unsigned int i; + + mod->symtab = (void *)sechdrs[symindex].sh_addr; + mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + mod->strtab = (void *)sechdrs[strindex].sh_addr; + + /* Set types up while we still have access to sections. */ + for (i = 0; i < mod->num_symtab; i++) + mod->symtab[i].st_info + = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); +} +#else +static inline void add_kallsyms(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const char *secstrings) +{ +} +#endif /* CONFIG_KALLSYMS */ + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. */ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + Elf_Ehdr *hdr; + Elf_Shdr *sechdrs; + char *secstrings, *args, *modmagic, *strtab = NULL; + unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, + exportindex, modindex, obsparmindex, infoindex, gplindex, + crcindex, gplcrcindex, versindex, pcpuindex; + long arglen; + struct module *mod; + long err = 0; + void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + struct exception_table_entry *extable; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + if (len < sizeof(*hdr)) + return ERR_PTR(-ENOEXEC); + + /* Suck in entire file: we'll want most of it. */ + /* vmalloc barfs on "unusual" numbers. Check here */ + if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) + return ERR_PTR(-ENOMEM); + if (copy_from_user(hdr, umod, len) != 0) { + err = -EFAULT; + goto free_hdr; + } + + /* Sanity checks against insmoding binaries or wrong arch, + weird elf version */ + if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 + || hdr->e_type != ET_REL + || !elf_check_arch(hdr) + || hdr->e_shentsize != sizeof(*sechdrs)) { + err = -ENOEXEC; + goto free_hdr; + } + + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) + goto truncated; + + /* Convenience variables */ + sechdrs = (void *)hdr + hdr->e_shoff; + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + sechdrs[0].sh_addr = 0; + + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_NOBITS + && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) + goto truncated; + + /* Mark all sections sh_addr with their address in the + temporary image. */ + sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; + + /* Internal symbols and strings. */ + if (sechdrs[i].sh_type == SHT_SYMTAB) { + symindex = i; + strindex = sechdrs[i].sh_link; + strtab = (char *)hdr + sechdrs[strindex].sh_offset; + } +#ifndef CONFIG_MODULE_UNLOAD + /* Don't load .exit sections */ + if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) + sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; +#endif + } + + modindex = find_sec(hdr, sechdrs, secstrings, + ".gnu.linkonce.this_module"); + if (!modindex) { + printk(KERN_WARNING "No module found in object\n"); + err = -ENOEXEC; + goto free_hdr; + } + mod = (void *)sechdrs[modindex].sh_addr; + + if (symindex == 0) { + printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", + mod->name); + err = -ENOEXEC; + goto free_hdr; + } + + /* Optional sections */ + exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); + gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); + crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); + gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); + setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); + exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); + obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); + versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); + infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); + pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); + + /* Don't keep modinfo section */ + sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; +#ifdef CONFIG_KALLSYMS + /* Keep symbol and string tables for decoding later. */ + sechdrs[symindex].sh_flags |= SHF_ALLOC; + sechdrs[strindex].sh_flags |= SHF_ALLOC; +#endif + + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(sechdrs, versindex, mod)) { + err = -ENOEXEC; + goto free_hdr; + } + + modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); + /* This is allowed: modprobe --force will invalidate it. */ + if (!modmagic) { + tainted |= TAINT_FORCED_MODULE; + printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", + mod->name); + } else if (!same_magic(modmagic, vermagic)) { + printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", + mod->name, modmagic, vermagic); + err = -ENOEXEC; + goto free_hdr; + } + + /* Now copy in args */ + arglen = strlen_user(uargs); + if (!arglen) { + err = -EFAULT; + goto free_hdr; + } + args = kmalloc(arglen, GFP_KERNEL); + if (!args) { + err = -ENOMEM; + goto free_hdr; + } + if (copy_from_user(args, uargs, arglen) != 0) { + err = -EFAULT; + goto free_mod; + } + + if (find_module(mod->name)) { + err = -EEXIST; + goto free_mod; + } + + mod->state = MODULE_STATE_COMING; + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); + if (err < 0) + goto free_mod; + + if (pcpuindex) { + /* We have a special allocation for this section. */ + percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, + sechdrs[pcpuindex].sh_addralign); + if (!percpu) { + err = -ENOMEM; + goto free_mod; + } + sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + mod->percpu = percpu; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, hdr, sechdrs, secstrings); + + /* Do the allocs. */ + ptr = module_alloc(mod->core_size); + if (!ptr) { + err = -ENOMEM; + goto free_percpu; + } + memset(ptr, 0, mod->core_size); + mod->module_core = ptr; + + ptr = module_alloc(mod->init_size); + if (!ptr && mod->init_size) { + err = -ENOMEM; + goto free_core; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + + /* Transfer each section which specifies SHF_ALLOC */ + DEBUGP("final section addresses:\n"); + for (i = 0; i < hdr->e_shnum; i++) { + void *dest; + + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) + dest = mod->module_init + + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); + else + dest = mod->module_core + sechdrs[i].sh_entsize; + + if (sechdrs[i].sh_type != SHT_NOBITS) + memcpy(dest, (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size); + /* Update sh_addr to point to copy in image. */ + sechdrs[i].sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); + } + /* Module has been moved. */ + mod = (void *)sechdrs[modindex].sh_addr; + + /* Now we've moved module, initialize linked lists, etc. */ + module_unload_init(mod); + + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, + mod); + if (err < 0) + goto cleanup; + + /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ + mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); + mod->syms = (void *)sechdrs[exportindex].sh_addr; + if (crcindex) + mod->crcs = (void *)sechdrs[crcindex].sh_addr; + mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); + mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; + if (gplcrcindex) + mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; + +#ifdef CONFIG_MODVERSIONS + if ((mod->num_syms && !crcindex) || + (mod->num_gpl_syms && !gplcrcindex)) { + printk(KERN_WARNING "%s: No versions for exported symbols." + " Tainting kernel.\n", mod->name); + tainted |= TAINT_FORCED_MODULE; + } +#endif + + /* Now do relocations. */ + for (i = 1; i < hdr->e_shnum; i++) { + const char *strtab = (char *)sechdrs[strindex].sh_addr; + unsigned int info = sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (info >= hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(sechdrs[info].sh_flags & SHF_ALLOC)) + continue; + + if (sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(sechdrs, strtab, symindex, i,mod); + else if (sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(sechdrs, strtab, symindex, i, + mod); + if (err < 0) + goto cleanup; + } + + /* Set up and sort exception table */ + mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); + mod->extable = extable = (void *)sechdrs[exindex].sh_addr; + sort_extable(extable, extable + mod->num_exentries); + + /* Finally, copy percpu area over. */ + percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, + sechdrs[pcpuindex].sh_size); + + add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + + err = module_finalize(hdr, sechdrs, mod); + if (err < 0) + goto cleanup; + + mod->args = args; + if (obsparmindex) { + err = obsolete_params(mod->name, mod->args, + (struct obsolete_modparm *) + sechdrs[obsparmindex].sh_addr, + sechdrs[obsparmindex].sh_size + / sizeof(struct obsolete_modparm), + sechdrs, symindex, + (char *)sechdrs[strindex].sh_addr); + if (setupindex) + printk(KERN_WARNING "%s: Ignoring new-style " + "parameters in presence of obsolete ones\n", + mod->name); + } else { + /* Size of section 0 is 0, so this works well if no params */ + err = parse_args(mod->name, mod->args, + (struct kernel_param *) + sechdrs[setupindex].sh_addr, + sechdrs[setupindex].sh_size + / sizeof(struct kernel_param), + NULL); + } + if (err < 0) + goto arch_cleanup; + + err = mod_sysfs_setup(mod, + (struct kernel_param *) + sechdrs[setupindex].sh_addr, + sechdrs[setupindex].sh_size + / sizeof(struct kernel_param)); + if (err < 0) + goto arch_cleanup; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + + /* Get rid of temporary copy */ + vfree(hdr); + + /* Done! */ + return mod; + + arch_cleanup: + module_arch_cleanup(mod); + cleanup: + module_unload_free(mod); + module_free(mod, mod->module_init); + free_core: + module_free(mod, mod->module_core); + free_percpu: + if (percpu) + percpu_modfree(percpu); + free_mod: + kfree(args); + free_hdr: + vfree(hdr); + if (err < 0) return ERR_PTR(err); + else return ptr; + + truncated: + printk(KERN_ERR "Module len %lu truncated\n", len); + err = -ENOEXEC; + goto free_hdr; +} + +/* + * link the module with the whole machine is stopped with interrupts off + * - this defends against kallsyms not taking locks + */ +static int __link_module(void *_mod) +{ + struct module *mod = _mod; + list_add(&mod->list, &modules); + return 0; +} + +/* This is where the real work happens */ +asmlinkage long +sys_init_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct module *mod; + int ret = 0; + + /* Must have permission */ + if (!capable(CAP_SYS_MODULE)) + return -EPERM; + + /* Only one module load at a time, please */ + if (down_interruptible(&module_mutex) != 0) + return -EINTR; + + /* Do all the hard work */ + mod = load_module(umod, len, uargs); + if (IS_ERR(mod)) { + up(&module_mutex); + return PTR_ERR(mod); + } + + /* Flush the instruction cache, since we've played with text */ + if (mod->module_init) + flush_icache_range((unsigned long)mod->module_init, + (unsigned long)mod->module_init + + mod->init_size); + flush_icache_range((unsigned long)mod->module_core, + (unsigned long)mod->module_core + mod->core_size); + + /* Now sew it into the lists. They won't access us, since + strong_try_module_get() will fail. */ + stop_machine_run(__link_module, mod, NR_CPUS); + + /* Drop lock so they can recurse */ + up(&module_mutex); + + down(¬ify_mutex); + notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); + up(¬ify_mutex); + + /* Start the module */ + if (mod->init != NULL) + ret = mod->init(); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_kernel(); + if (mod->unsafe) + printk(KERN_ERR "%s: module is now stuck!\n", + mod->name); + else { + module_put(mod); + down(&module_mutex); + free_module(mod); + up(&module_mutex); + } + return ret; + } + + /* Now it's a first class citizen! */ + down(&module_mutex); + mod->state = MODULE_STATE_LIVE; + /* Drop initial reference. */ + module_put(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_text_size = 0; + up(&module_mutex); + + return 0; +} + +static inline int within(unsigned long addr, void *start, unsigned long size) +{ + return ((void *)addr >= start && (void *)addr < start + size); +} + +#ifdef CONFIG_KALLSYMS +/* + * This ignores the intensely annoying "mapping symbols" found + * in ARM ELF files: $a, $t and $d. + */ +static inline int is_arm_mapping_symbol(const char *str) +{ + return str[0] == '$' && strchr("atd", str[1]) + && (str[2] == '\0' || str[2] == '.'); +} + +static const char *get_ksymbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) +{ + unsigned int i, best = 0; + unsigned long nextval; + + /* At worse, next value is at end of module */ + if (within(addr, mod->module_init, mod->init_size)) + nextval = (unsigned long)mod->module_init+mod->init_text_size; + else + nextval = (unsigned long)mod->module_core+mod->core_text_size; + + /* Scan for closest preceeding symbol, and next symbol. (ELF + starts real symbols at 1). */ + for (i = 1; i < mod->num_symtab; i++) { + if (mod->symtab[i].st_shndx == SHN_UNDEF) + continue; + + /* We ignore unnamed symbols: they're uninformative + * and inserted at a whim. */ + if (mod->symtab[i].st_value <= addr + && mod->symtab[i].st_value > mod->symtab[best].st_value + && *(mod->strtab + mod->symtab[i].st_name) != '\0' + && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + best = i; + if (mod->symtab[i].st_value > addr + && mod->symtab[i].st_value < nextval + && *(mod->strtab + mod->symtab[i].st_name) != '\0' + && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + nextval = mod->symtab[i].st_value; + } + + if (!best) + return NULL; + + *size = nextval - mod->symtab[best].st_value; + *offset = addr - mod->symtab[best].st_value; + return mod->strtab + mod->symtab[best].st_name; +} + +/* For kallsyms to ask for address resolution. NULL means not found. + We don't lock, as this is used for oops resolution and races are a + lesser concern. */ +const char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname) +{ + struct module *mod; + + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) + || within(addr, mod->module_core, mod->core_size)) { + *modname = mod->name; + return get_ksymbol(mod, addr, size, offset); + } + } + return NULL; +} + +struct module *module_get_kallsym(unsigned int symnum, + unsigned long *value, + char *type, + char namebuf[128]) +{ + struct module *mod; + + down(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (symnum < mod->num_symtab) { + *value = mod->symtab[symnum].st_value; + *type = mod->symtab[symnum].st_info; + strncpy(namebuf, + mod->strtab + mod->symtab[symnum].st_name, + 127); + up(&module_mutex); + return mod; + } + symnum -= mod->num_symtab; + } + up(&module_mutex); + return NULL; +} + +static unsigned long mod_find_symname(struct module *mod, const char *name) +{ + unsigned int i; + + for (i = 0; i < mod->num_symtab; i++) + if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0) + return mod->symtab[i].st_value; + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + struct module *mod; + char *colon; + unsigned long ret = 0; + + /* Don't lock: we're in enough trouble already. */ + if ((colon = strchr(name, ':')) != NULL) { + *colon = '\0'; + if ((mod = find_module(name)) != NULL) + ret = mod_find_symname(mod, colon+1); + *colon = ':'; + } else { + list_for_each_entry(mod, &modules, list) + if ((ret = mod_find_symname(mod, name)) != 0) + break; + } + return ret; +} +#endif /* CONFIG_KALLSYMS */ + +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct list_head *i; + loff_t n = 0; + + down(&module_mutex); + list_for_each(i, &modules) { + if (n++ == *pos) + break; + } + if (i == &modules) + return NULL; + return i; +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct list_head *i = p; + (*pos)++; + if (i->next == &modules) + return NULL; + return i->next; +} + +static void m_stop(struct seq_file *m, void *p) +{ + up(&module_mutex); +} + +static int m_show(struct seq_file *m, void *p) +{ + struct module *mod = list_entry(p, struct module, list); + seq_printf(m, "%s %lu", + mod->name, mod->init_size + mod->core_size); + print_unload_info(m, mod); + + /* Informative for users. */ + seq_printf(m, " %s", + mod->state == MODULE_STATE_GOING ? "Unloading": + mod->state == MODULE_STATE_COMING ? "Loading": + "Live"); + /* Used by oprofile and other similar tools. */ + seq_printf(m, " 0x%p", mod->module_core); + + seq_printf(m, "\n"); + return 0; +} + +/* Format: modulename size refcount deps address + + Where refcount is a number or -, and deps is a comma-separated list + of depends or -. +*/ +struct seq_operations modules_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show +}; + +/* Given an address, look for it in the module exception tables. */ +const struct exception_table_entry *search_module_extables(unsigned long addr) +{ + unsigned long flags; + const struct exception_table_entry *e = NULL; + struct module *mod; + + spin_lock_irqsave(&modlist_lock, flags); + list_for_each_entry(mod, &modules, list) { + if (mod->num_exentries == 0) + continue; + + e = search_extable(mod->extable, + mod->extable + mod->num_exentries - 1, + addr); + if (e) + break; + } + spin_unlock_irqrestore(&modlist_lock, flags); + + /* Now, if we found one, we are running inside it now, hence + we cannot unload the module, hence no refcnt needed. */ + return e; +} + +/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ +struct module *__module_text_address(unsigned long addr) +{ + struct module *mod; + + list_for_each_entry(mod, &modules, list) + if (within(addr, mod->module_init, mod->init_text_size) + || within(addr, mod->module_core, mod->core_text_size)) + return mod; + return NULL; +} + +struct module *module_text_address(unsigned long addr) +{ + struct module *mod; + unsigned long flags; + + spin_lock_irqsave(&modlist_lock, flags); + mod = __module_text_address(addr); + spin_unlock_irqrestore(&modlist_lock, flags); + + return mod; +} + +/* Don't grab lock, we're oopsing. */ +void print_modules(void) +{ + struct module *mod; + + printk("Modules linked in:"); + list_for_each_entry(mod, &modules, list) + printk(" %s", mod->name); + printk("\n"); +} + +void module_add_driver(struct module *mod, struct device_driver *drv) +{ + if (!mod || !drv) + return; + + /* Don't check return code; this call is idempotent */ + sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); +} +EXPORT_SYMBOL(module_add_driver); + +void module_remove_driver(struct device_driver *drv) +{ + if (!drv) + return; + sysfs_remove_link(&drv->kobj, "module"); +} +EXPORT_SYMBOL(module_remove_driver); + +#ifdef CONFIG_MODVERSIONS +/* Generate the signature for struct module here, too, for modversions. */ +void struct_module(struct module *mod) { return; } +EXPORT_SYMBOL(struct_module); +#endif diff --git a/kernel/panic.c b/kernel/panic.c new file mode 100644 index 00000000000..0fa3f3a66fb --- /dev/null +++ b/kernel/panic.c @@ -0,0 +1,157 @@ +/* + * linux/kernel/panic.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * This function is used through-out the kernel (including mm and fs) + * to indicate a major problem. + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/sysrq.h> +#include <linux/interrupt.h> +#include <linux/nmi.h> + +int panic_timeout; +int panic_on_oops; +int tainted; + +EXPORT_SYMBOL(panic_timeout); + +struct notifier_block *panic_notifier_list; + +EXPORT_SYMBOL(panic_notifier_list); + +static int __init panic_setup(char *str) +{ + panic_timeout = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("panic=", panic_setup); + +static long no_blink(long time) +{ + return 0; +} + +/* Returns how long it waited in ms */ +long (*panic_blink)(long time); +EXPORT_SYMBOL(panic_blink); + +/** + * panic - halt the system + * @fmt: The text string to print + * + * Display a message, then perform cleanups. + * + * This function never returns. + */ + +NORET_TYPE void panic(const char * fmt, ...) +{ + long i; + static char buf[1024]; + va_list args; +#if defined(CONFIG_ARCH_S390) + unsigned long caller = (unsigned long) __builtin_return_address(0); +#endif + + bust_spinlocks(1); + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + bust_spinlocks(0); + +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + notifier_call_chain(&panic_notifier_list, 0, buf); + + if (!panic_blink) + panic_blink = no_blink; + + if (panic_timeout > 0) + { + /* + * Delay timeout seconds before rebooting the machine. + * We can't use the "normal" timers since we just panicked.. + */ + printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); + for (i = 0; i < panic_timeout*1000; ) { + touch_nmi_watchdog(); + i += panic_blink(i); + mdelay(1); + i++; + } + /* + * Should we run the reboot notifier. For the moment Im + * choosing not too. It might crash, be corrupt or do + * more harm than good for other reasons. + */ + machine_restart(NULL); + } +#ifdef __sparc__ + { + extern int stop_a_enabled; + /* Make sure the user can actually press L1-A */ + stop_a_enabled = 1; + printk(KERN_EMERG "Press L1-A to return to the boot prom\n"); + } +#endif +#if defined(CONFIG_ARCH_S390) + disabled_wait(caller); +#endif + local_irq_enable(); + for (i = 0;;) { + i += panic_blink(i); + mdelay(1); + i++; + } +} + +EXPORT_SYMBOL(panic); + +/** + * print_tainted - return a string to represent the kernel taint state. + * + * 'P' - Proprietary module has been loaded. + * 'F' - Module has been forcibly loaded. + * 'S' - SMP with CPUs not designed for SMP. + * 'R' - User forced a module unload. + * 'M' - Machine had a machine check experience. + * 'B' - System has hit bad_page. + * + * The string is overwritten by the next call to print_taint(). + */ + +const char *print_tainted(void) +{ + static char buf[20]; + if (tainted) { + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", + tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', + tainted & TAINT_FORCED_MODULE ? 'F' : ' ', + tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', + tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', + tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', + tainted & TAINT_BAD_PAGE ? 'B' : ' '); + } + else + snprintf(buf, sizeof(buf), "Not tainted"); + return(buf); +} + +void add_taint(unsigned flag) +{ + tainted |= flag; +} +EXPORT_SYMBOL(add_taint); diff --git a/kernel/params.c b/kernel/params.c new file mode 100644 index 00000000000..5538608bd33 --- /dev/null +++ b/kernel/params.c @@ -0,0 +1,721 @@ +/* Helpers for initial module or kernel cmdline parsing + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/config.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/err.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt, a...) +#endif + +static inline int dash2underscore(char c) +{ + if (c == '-') + return '_'; + return c; +} + +static inline int parameq(const char *input, const char *paramname) +{ + unsigned int i; + for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) + if (input[i] == '\0') + return 1; + return 0; +} + +static int parse_one(char *param, + char *val, + struct kernel_param *params, + unsigned num_params, + int (*handle_unknown)(char *param, char *val)) +{ + unsigned int i; + + /* Find parameter */ + for (i = 0; i < num_params; i++) { + if (parameq(param, params[i].name)) { + DEBUGP("They are equal! Calling %p\n", + params[i].set); + return params[i].set(val, ¶ms[i]); + } + } + + if (handle_unknown) { + DEBUGP("Unknown argument: calling %p\n", handle_unknown); + return handle_unknown(param, val); + } + + DEBUGP("Unknown argument `%s'\n", param); + return -ENOENT; +} + +/* You can use " around spaces, but can't escape ". */ +/* Hyphens and underscores equivalent in parameter names. */ +static char *next_arg(char *args, char **param, char **val) +{ + unsigned int i, equals = 0; + int in_quote = 0, quoted = 0; + char *next; + + /* Chew any extra spaces */ + while (*args == ' ') args++; + if (*args == '"') { + args++; + in_quote = 1; + quoted = 1; + } + + for (i = 0; args[i]; i++) { + if (args[i] == ' ' && !in_quote) + break; + if (equals == 0) { + if (args[i] == '=') + equals = i; + } + if (args[i] == '"') + in_quote = !in_quote; + } + + *param = args; + if (!equals) + *val = NULL; + else { + args[equals] = '\0'; + *val = args + equals + 1; + + /* Don't include quotes in value. */ + if (**val == '"') { + (*val)++; + if (args[i-1] == '"') + args[i-1] = '\0'; + } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; + } + + if (args[i]) { + args[i] = '\0'; + next = args + i + 1; + } else + next = args + i; + return next; +} + +/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ +int parse_args(const char *name, + char *args, + struct kernel_param *params, + unsigned num, + int (*unknown)(char *param, char *val)) +{ + char *param, *val; + + DEBUGP("Parsing ARGS: %s\n", args); + + while (*args) { + int ret; + + args = next_arg(args, ¶m, &val); + ret = parse_one(param, val, params, num, unknown); + switch (ret) { + case -ENOENT: + printk(KERN_ERR "%s: Unknown parameter `%s'\n", + name, param); + return ret; + case -ENOSPC: + printk(KERN_ERR + "%s: `%s' too large for parameter `%s'\n", + name, val ?: "", param); + return ret; + case 0: + break; + default: + printk(KERN_ERR + "%s: `%s' invalid for parameter `%s'\n", + name, val ?: "", param); + return ret; + } + } + + /* All parsed OK. */ + return 0; +} + +/* Lazy bastard, eh? */ +#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ + int param_set_##name(const char *val, struct kernel_param *kp) \ + { \ + char *endp; \ + tmptype l; \ + \ + if (!val) return -EINVAL; \ + l = strtolfn(val, &endp, 0); \ + if (endp == val || ((type)l != l)) \ + return -EINVAL; \ + *((type *)kp->arg) = l; \ + return 0; \ + } \ + int param_get_##name(char *buffer, struct kernel_param *kp) \ + { \ + return sprintf(buffer, format, *((type *)kp->arg)); \ + } + +STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); + +int param_set_charp(const char *val, struct kernel_param *kp) +{ + if (!val) { + printk(KERN_ERR "%s: string parameter expected\n", + kp->name); + return -EINVAL; + } + + if (strlen(val) > 1024) { + printk(KERN_ERR "%s: string parameter too long\n", + kp->name); + return -ENOSPC; + } + + *(char **)kp->arg = (char *)val; + return 0; +} + +int param_get_charp(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "%s", *((char **)kp->arg)); +} + +int param_set_bool(const char *val, struct kernel_param *kp) +{ + /* No equals means "set"... */ + if (!val) val = "1"; + + /* One of =[yYnN01] */ + switch (val[0]) { + case 'y': case 'Y': case '1': + *(int *)kp->arg = 1; + return 0; + case 'n': case 'N': case '0': + *(int *)kp->arg = 0; + return 0; + } + return -EINVAL; +} + +int param_get_bool(char *buffer, struct kernel_param *kp) +{ + /* Y and N chosen as being relatively non-coder friendly */ + return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); +} + +int param_set_invbool(const char *val, struct kernel_param *kp) +{ + int boolval, ret; + struct kernel_param dummy = { .arg = &boolval }; + + ret = param_set_bool(val, &dummy); + if (ret == 0) + *(int *)kp->arg = !boolval; + return ret; +} + +int param_get_invbool(char *buffer, struct kernel_param *kp) +{ + int val; + struct kernel_param dummy = { .arg = &val }; + + val = !*(int *)kp->arg; + return param_get_bool(buffer, &dummy); +} + +/* We cheat here and temporarily mangle the string. */ +int param_array(const char *name, + const char *val, + unsigned int min, unsigned int max, + void *elem, int elemsize, + int (*set)(const char *, struct kernel_param *kp), + int *num) +{ + int ret; + struct kernel_param kp; + char save; + + /* Get the name right for errors. */ + kp.name = name; + kp.arg = elem; + + /* No equals sign? */ + if (!val) { + printk(KERN_ERR "%s: expects arguments\n", name); + return -EINVAL; + } + + *num = 0; + /* We expect a comma-separated list of values. */ + do { + int len; + + if (*num == max) { + printk(KERN_ERR "%s: can only take %i arguments\n", + name, max); + return -EINVAL; + } + len = strcspn(val, ","); + + /* nul-terminate and parse */ + save = val[len]; + ((char *)val)[len] = '\0'; + ret = set(val, &kp); + + if (ret != 0) + return ret; + kp.arg += elemsize; + val += len+1; + (*num)++; + } while (save == ','); + + if (*num < min) { + printk(KERN_ERR "%s: needs at least %i arguments\n", + name, min); + return -EINVAL; + } + return 0; +} + +int param_array_set(const char *val, struct kernel_param *kp) +{ + struct kparam_array *arr = kp->arg; + + return param_array(kp->name, val, 1, arr->max, arr->elem, + arr->elemsize, arr->set, arr->num ?: &arr->max); +} + +int param_array_get(char *buffer, struct kernel_param *kp) +{ + int i, off, ret; + struct kparam_array *arr = kp->arg; + struct kernel_param p; + + p = *kp; + for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { + if (i) + buffer[off++] = ','; + p.arg = arr->elem + arr->elemsize * i; + ret = arr->get(buffer + off, &p); + if (ret < 0) + return ret; + off += ret; + } + buffer[off] = '\0'; + return off; +} + +int param_set_copystring(const char *val, struct kernel_param *kp) +{ + struct kparam_string *kps = kp->arg; + + if (strlen(val)+1 > kps->maxlen) { + printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", + kp->name, kps->maxlen-1); + return -ENOSPC; + } + strcpy(kps->string, val); + return 0; +} + +int param_get_string(char *buffer, struct kernel_param *kp) +{ + struct kparam_string *kps = kp->arg; + return strlcpy(buffer, kps->string, kps->maxlen); +} + +/* sysfs output in /sys/modules/XYZ/parameters/ */ + +extern struct kernel_param __start___param[], __stop___param[]; + +#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN + +struct param_attribute +{ + struct module_attribute mattr; + struct kernel_param *param; +}; + +struct module_param_attrs +{ + struct attribute_group grp; + struct param_attribute attrs[0]; +}; + +#define to_param_attr(n) container_of(n, struct param_attribute, mattr); + +static ssize_t param_attr_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + int count; + struct param_attribute *attribute = to_param_attr(mattr); + + if (!attribute->param->get) + return -EPERM; + + count = attribute->param->get(buf, attribute->param); + if (count > 0) { + strcat(buf, "\n"); + ++count; + } + return count; +} + +/* sysfs always hands a nul-terminated string in buf. We rely on that. */ +static ssize_t param_attr_store(struct module_attribute *mattr, + struct module *owner, + const char *buf, size_t len) +{ + int err; + struct param_attribute *attribute = to_param_attr(mattr); + + if (!attribute->param->set) + return -EPERM; + + err = attribute->param->set(buf, attribute->param); + if (!err) + return len; + return err; +} + +#ifdef CONFIG_MODULES +#define __modinit +#else +#define __modinit __init +#endif + +/* + * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME + * @mk: struct module_kobject (contains parent kobject) + * @kparam: array of struct kernel_param, the actual parameter definitions + * @num_params: number of entries in array + * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules" + * + * Create a kobject for a (per-module) group of parameters, and create files + * in sysfs. A pointer to the param_kobject is returned on success, + * NULL if there's no parameter to export, or other ERR_PTR(err). + */ +static __modinit struct module_param_attrs * +param_sysfs_setup(struct module_kobject *mk, + struct kernel_param *kparam, + unsigned int num_params, + unsigned int name_skip) +{ + struct module_param_attrs *mp; + unsigned int valid_attrs = 0; + unsigned int i, size[2]; + struct param_attribute *pattr; + struct attribute **gattr; + int err; + + for (i=0; i<num_params; i++) { + if (kparam[i].perm) + valid_attrs++; + } + + if (!valid_attrs) + return NULL; + + size[0] = ALIGN(sizeof(*mp) + + valid_attrs * sizeof(mp->attrs[0]), + sizeof(mp->grp.attrs[0])); + size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); + + mp = kmalloc(size[0] + size[1], GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + mp->grp.name = "parameters"; + mp->grp.attrs = (void *)mp + size[0]; + + pattr = &mp->attrs[0]; + gattr = &mp->grp.attrs[0]; + for (i = 0; i < num_params; i++) { + struct kernel_param *kp = &kparam[i]; + if (kp->perm) { + pattr->param = kp; + pattr->mattr.show = param_attr_show; + pattr->mattr.store = param_attr_store; + pattr->mattr.attr.name = (char *)&kp->name[name_skip]; + pattr->mattr.attr.owner = mk->mod; + pattr->mattr.attr.mode = kp->perm; + *(gattr++) = &(pattr++)->mattr.attr; + } + } + *gattr = NULL; + + if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { + kfree(mp); + return ERR_PTR(err); + } + return mp; +} + + +#ifdef CONFIG_MODULES + +/* + * module_param_sysfs_setup - setup sysfs support for one module + * @mod: module + * @kparam: module parameters (array) + * @num_params: number of module parameters + * + * Adds sysfs entries for module parameters, and creates a link from + * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ + */ +int module_param_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + struct module_param_attrs *mp; + + mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); + if (IS_ERR(mp)) + return PTR_ERR(mp); + + mod->param_attrs = mp; + return 0; +} + +/* + * module_param_sysfs_remove - remove sysfs support for one module + * @mod: module + * + * Remove sysfs entries for module parameters and the corresponding + * kobject. + */ +void module_param_sysfs_remove(struct module *mod) +{ + if (mod->param_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->param_attrs->grp); + /* We are positive that no one is using any param + * attrs at this point. Deallocate immediately. */ + kfree(mod->param_attrs); + mod->param_attrs = NULL; + } +} +#endif + +/* + * kernel_param_sysfs_setup - wrapper for built-in params support + */ +static void __init kernel_param_sysfs_setup(const char *name, + struct kernel_param *kparam, + unsigned int num_params, + unsigned int name_skip) +{ + struct module_kobject *mk; + + mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); + memset(mk, 0, sizeof(struct module_kobject)); + + mk->mod = THIS_MODULE; + kobj_set_kset_s(mk, module_subsys); + kobject_set_name(&mk->kobj, name); + kobject_register(&mk->kobj); + + /* no need to keep the kobject if no parameter is exported */ + if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { + kobject_unregister(&mk->kobj); + kfree(mk); + } +} + +/* + * param_sysfs_builtin - add contents in /sys/parameters for built-in modules + * + * Add module_parameters to sysfs for "modules" built into the kernel. + * + * The "module" name (KBUILD_MODNAME) is stored before a dot, the + * "parameter" name is stored behind a dot in kernel_param->name. So, + * extract the "module" name for all built-in kernel_param-eters, + * and for all who have the same, call kernel_param_sysfs_setup. + */ +static void __init param_sysfs_builtin(void) +{ + struct kernel_param *kp, *kp_begin = NULL; + unsigned int i, name_len, count = 0; + char modname[MAX_KBUILD_MODNAME + 1] = ""; + + for (i=0; i < __stop___param - __start___param; i++) { + char *dot; + + kp = &__start___param[i]; + + /* We do not handle args without periods. */ + dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME); + if (!dot) { + DEBUGP("couldn't find period in %s\n", kp->name); + continue; + } + name_len = dot - kp->name; + + /* new kbuild_modname? */ + if (strlen(modname) != name_len + || strncmp(modname, kp->name, name_len) != 0) { + /* add a new kobject for previous kernel_params. */ + if (count) + kernel_param_sysfs_setup(modname, + kp_begin, + count, + strlen(modname)+1); + + strncpy(modname, kp->name, name_len); + modname[name_len] = '\0'; + count = 0; + kp_begin = kp; + } + count++; + } + + /* last kernel_params need to be registered as well */ + if (count) + kernel_param_sysfs_setup(modname, kp_begin, count, + strlen(modname)+1); +} + + +/* module-related sysfs stuff */ +#ifdef CONFIG_MODULES + +#define to_module_attr(n) container_of(n, struct module_attribute, attr); +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); + +static ssize_t module_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct module_attribute *attribute; + struct module_kobject *mk; + int ret; + + attribute = to_module_attr(attr); + mk = to_module_kobject(kobj); + + if (!attribute->show) + return -EPERM; + + if (!try_module_get(mk->mod)) + return -ENODEV; + + ret = attribute->show(attribute, mk->mod, buf); + + module_put(mk->mod); + + return ret; +} + +static ssize_t module_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct module_attribute *attribute; + struct module_kobject *mk; + int ret; + + attribute = to_module_attr(attr); + mk = to_module_kobject(kobj); + + if (!attribute->store) + return -EPERM; + + if (!try_module_get(mk->mod)) + return -ENODEV; + + ret = attribute->store(attribute, mk->mod, buf, len); + + module_put(mk->mod); + + return ret; +} + +static struct sysfs_ops module_sysfs_ops = { + .show = module_attr_show, + .store = module_attr_store, +}; + +#else +static struct sysfs_ops module_sysfs_ops = { + .show = NULL, + .store = NULL, +}; +#endif + +static struct kobj_type module_ktype = { + .sysfs_ops = &module_sysfs_ops, +}; + +decl_subsys(module, &module_ktype, NULL); + +/* + * param_sysfs_init - wrapper for built-in params support + */ +static int __init param_sysfs_init(void) +{ + subsystem_register(&module_subsys); + + param_sysfs_builtin(); + + return 0; +} +__initcall(param_sysfs_init); + +EXPORT_SYMBOL(param_set_byte); +EXPORT_SYMBOL(param_get_byte); +EXPORT_SYMBOL(param_set_short); +EXPORT_SYMBOL(param_get_short); +EXPORT_SYMBOL(param_set_ushort); +EXPORT_SYMBOL(param_get_ushort); +EXPORT_SYMBOL(param_set_int); +EXPORT_SYMBOL(param_get_int); +EXPORT_SYMBOL(param_set_uint); +EXPORT_SYMBOL(param_get_uint); +EXPORT_SYMBOL(param_set_long); +EXPORT_SYMBOL(param_get_long); +EXPORT_SYMBOL(param_set_ulong); +EXPORT_SYMBOL(param_get_ulong); +EXPORT_SYMBOL(param_set_charp); +EXPORT_SYMBOL(param_get_charp); +EXPORT_SYMBOL(param_set_bool); +EXPORT_SYMBOL(param_get_bool); +EXPORT_SYMBOL(param_set_invbool); +EXPORT_SYMBOL(param_get_invbool); +EXPORT_SYMBOL(param_array_set); +EXPORT_SYMBOL(param_array_get); +EXPORT_SYMBOL(param_set_copystring); +EXPORT_SYMBOL(param_get_string); diff --git a/kernel/pid.c b/kernel/pid.c new file mode 100644 index 00000000000..edba31c681a --- /dev/null +++ b/kernel/pid.c @@ -0,0 +1,292 @@ +/* + * Generic pidhash and scalable, time-bounded PID allocator + * + * (C) 2002-2003 William Irwin, IBM + * (C) 2004 William Irwin, Oracle + * (C) 2002-2004 Ingo Molnar, Red Hat + * + * pid-structures are backing objects for tasks sharing a given ID to chain + * against. There is very little to them aside from hashing them and + * parking tasks using given ID's on a list. + * + * The hash is always changed with the tasklist_lock write-acquired, + * and the hash is only accessed with the tasklist_lock at least + * read-acquired, so there's no additional SMP locking needed here. + * + * We have a list of bitmap pages, which bitmaps represent the PID space. + * Allocating and freeing PIDs is completely lockless. The worst-case + * allocation scenario when all but one out of 1 million PIDs possible are + * allocated already: the scanning of 32 list entries and at most PAGE_SIZE + * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/hash.h> + +#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) +static struct hlist_head *pid_hash[PIDTYPE_MAX]; +static int pidhash_shift; + +int pid_max = PID_MAX_DEFAULT; +int last_pid; + +#define RESERVED_PIDS 300 + +int pid_max_min = RESERVED_PIDS + 1; +int pid_max_max = PID_MAX_LIMIT; + +#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) +#define BITS_PER_PAGE (PAGE_SIZE*8) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) +#define find_next_offset(map, off) \ + find_next_zero_bit((map)->page, BITS_PER_PAGE, off) + +/* + * PID-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way a low pid_max + * value does not cause lots of bitmaps to be allocated, but + * the scheme scales to up to 4 million PIDs, runtime. + */ +typedef struct pidmap { + atomic_t nr_free; + void *page; +} pidmap_t; + +static pidmap_t pidmap_array[PIDMAP_ENTRIES] = + { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); + +fastcall void free_pidmap(int pid) +{ + pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + int offset = pid & BITS_PER_PAGE_MASK; + + clear_bit(offset, map->page); + atomic_inc(&map->nr_free); +} + +int alloc_pidmap(void) +{ + int i, offset, max_scan, pid, last = last_pid; + pidmap_t *map; + + pid = last + 1; + if (pid >= pid_max) + pid = RESERVED_PIDS; + offset = pid & BITS_PER_PAGE_MASK; + map = &pidmap_array[pid/BITS_PER_PAGE]; + max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; + for (i = 0; i <= max_scan; ++i) { + if (unlikely(!map->page)) { + unsigned long page = get_zeroed_page(GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock(&pidmap_lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&pidmap_lock); + if (unlikely(!map->page)) + break; + } + if (likely(atomic_read(&map->nr_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { + atomic_dec(&map->nr_free); + last_pid = pid; + return pid; + } + offset = find_next_offset(map, offset); + pid = mk_pid(map, offset); + /* + * find_next_offset() found a bit, the pid from it + * is in-bounds, and if we fell back to the last + * bitmap block and the final block was the same + * as the starting point, pid is before last_pid. + */ + } while (offset < BITS_PER_PAGE && pid < pid_max && + (i != max_scan || pid < last || + !((last+1) & BITS_PER_PAGE_MASK))); + } + if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { + ++map; + offset = 0; + } else { + map = &pidmap_array[0]; + offset = RESERVED_PIDS; + if (unlikely(last == offset)) + break; + } + pid = mk_pid(map, offset); + } + return -1; +} + +struct pid * fastcall find_pid(enum pid_type type, int nr) +{ + struct hlist_node *elem; + struct pid *pid; + + hlist_for_each_entry(pid, elem, + &pid_hash[type][pid_hashfn(nr)], pid_chain) { + if (pid->nr == nr) + return pid; + } + return NULL; +} + +int fastcall attach_pid(task_t *task, enum pid_type type, int nr) +{ + struct pid *pid, *task_pid; + + task_pid = &task->pids[type]; + pid = find_pid(type, nr); + if (pid == NULL) { + hlist_add_head(&task_pid->pid_chain, + &pid_hash[type][pid_hashfn(nr)]); + INIT_LIST_HEAD(&task_pid->pid_list); + } else { + INIT_HLIST_NODE(&task_pid->pid_chain); + list_add_tail(&task_pid->pid_list, &pid->pid_list); + } + task_pid->nr = nr; + + return 0; +} + +static fastcall int __detach_pid(task_t *task, enum pid_type type) +{ + struct pid *pid, *pid_next; + int nr = 0; + + pid = &task->pids[type]; + if (!hlist_unhashed(&pid->pid_chain)) { + hlist_del(&pid->pid_chain); + + if (list_empty(&pid->pid_list)) + nr = pid->nr; + else { + pid_next = list_entry(pid->pid_list.next, + struct pid, pid_list); + /* insert next pid from pid_list to hash */ + hlist_add_head(&pid_next->pid_chain, + &pid_hash[type][pid_hashfn(pid_next->nr)]); + } + } + + list_del(&pid->pid_list); + pid->nr = 0; + + return nr; +} + +void fastcall detach_pid(task_t *task, enum pid_type type) +{ + int tmp, nr; + + nr = __detach_pid(task, type); + if (!nr) + return; + + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (tmp != type && find_pid(tmp, nr)) + return; + + free_pidmap(nr); +} + +task_t *find_task_by_pid_type(int type, int nr) +{ + struct pid *pid; + + pid = find_pid(type, nr); + if (!pid) + return NULL; + + return pid_task(&pid->pid_list, type); +} + +EXPORT_SYMBOL(find_task_by_pid_type); + +/* + * This function switches the PIDs if a non-leader thread calls + * sys_execve() - this must be done without releasing the PID. + * (which a detach_pid() would eventually do.) + */ +void switch_exec_pids(task_t *leader, task_t *thread) +{ + __detach_pid(leader, PIDTYPE_PID); + __detach_pid(leader, PIDTYPE_TGID); + __detach_pid(leader, PIDTYPE_PGID); + __detach_pid(leader, PIDTYPE_SID); + + __detach_pid(thread, PIDTYPE_PID); + __detach_pid(thread, PIDTYPE_TGID); + + leader->pid = leader->tgid = thread->pid; + thread->pid = thread->tgid; + + attach_pid(thread, PIDTYPE_PID, thread->pid); + attach_pid(thread, PIDTYPE_TGID, thread->tgid); + attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); + attach_pid(thread, PIDTYPE_SID, thread->signal->session); + list_add_tail(&thread->tasks, &init_task.tasks); + + attach_pid(leader, PIDTYPE_PID, leader->pid); + attach_pid(leader, PIDTYPE_TGID, leader->tgid); + attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp); + attach_pid(leader, PIDTYPE_SID, leader->signal->session); +} + +/* + * The pid hash table is scaled according to the amount of memory in the + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or + * more. + */ +void __init pidhash_init(void) +{ + int i, j, pidhash_size; + unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); + + pidhash_shift = max(4, fls(megabytes * 4)); + pidhash_shift = min(12, pidhash_shift); + pidhash_size = 1 << pidhash_shift; + + printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", + pidhash_size, pidhash_shift, + PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); + + for (i = 0; i < PIDTYPE_MAX; i++) { + pid_hash[i] = alloc_bootmem(pidhash_size * + sizeof(*(pid_hash[i]))); + if (!pid_hash[i]) + panic("Could not alloc pidhash!\n"); + for (j = 0; j < pidhash_size; j++) + INIT_HLIST_HEAD(&pid_hash[i][j]); + } +} + +void __init pidmap_init(void) +{ + int i; + + pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + set_bit(0, pidmap_array->page); + atomic_dec(&pidmap_array->nr_free); + + /* + * Allocate PID 0, and hash it via all PID types: + */ + + for (i = 0; i < PIDTYPE_MAX; i++) + attach_pid(current, i, 0); +} diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c new file mode 100644 index 00000000000..ad85d3f0dcc --- /dev/null +++ b/kernel/posix-cpu-timers.c @@ -0,0 +1,1559 @@ +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include <linux/sched.h> +#include <linux/posix-timers.h> +#include <asm/uaccess.h> +#include <linux/errno.h> + +static int check_clock(clockid_t which_clock) +{ + int error = 0; + struct task_struct *p; + const pid_t pid = CPUCLOCK_PID(which_clock); + + if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + if (pid == 0) + return 0; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? + p->tgid != current->tgid : p->tgid != pid)) { + error = -EINVAL; + } + read_unlock(&tasklist_lock); + + return error; +} + +static inline union cpu_time_count +timespec_to_sample(clockid_t which_clock, const struct timespec *tp) +{ + union cpu_time_count ret; + ret.sched = 0; /* high half always zero when .cpu used */ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + } else { + ret.cpu = timespec_to_cputime(tp); + } + return ret; +} + +static void sample_to_timespec(clockid_t which_clock, + union cpu_time_count cpu, + struct timespec *tp) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + tp->tv_sec = div_long_long_rem(cpu.sched, + NSEC_PER_SEC, &tp->tv_nsec); + } else { + cputime_to_timespec(cpu.cpu, tp); + } +} + +static inline int cpu_time_before(clockid_t which_clock, + union cpu_time_count now, + union cpu_time_count then) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + return now.sched < then.sched; + } else { + return cputime_lt(now.cpu, then.cpu); + } +} +static inline void cpu_time_add(clockid_t which_clock, + union cpu_time_count *acc, + union cpu_time_count val) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + acc->sched += val.sched; + } else { + acc->cpu = cputime_add(acc->cpu, val.cpu); + } +} +static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, + union cpu_time_count a, + union cpu_time_count b) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + a.sched -= b.sched; + } else { + a.cpu = cputime_sub(a.cpu, b.cpu); + } + return a; +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. + */ +static inline void bump_cpu_timer(struct k_itimer *timer, + union cpu_time_count now) +{ + int i; + + if (timer->it.cpu.incr.sched == 0) + return; + + if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { + unsigned long long delta, incr; + + if (now.sched < timer->it.cpu.expires.sched) + return; + incr = timer->it.cpu.incr.sched; + delta = now.sched + incr - timer->it.cpu.expires.sched; + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + for (; i >= 0; incr >>= 1, i--) { + if (delta <= incr) + continue; + timer->it.cpu.expires.sched += incr; + timer->it_overrun += 1 << i; + delta -= incr; + } + } else { + cputime_t delta, incr; + + if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) + return; + incr = timer->it.cpu.incr.cpu; + delta = cputime_sub(cputime_add(now.cpu, incr), + timer->it.cpu.expires.cpu); + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) + incr = cputime_add(incr, incr); + for (; i >= 0; incr = cputime_halve(incr), i--) { + if (cputime_le(delta, incr)) + continue; + timer->it.cpu.expires.cpu = + cputime_add(timer->it.cpu.expires.cpu, incr); + timer->it_overrun += 1 << i; + delta = cputime_sub(delta, incr); + } + } +} + +static inline cputime_t prof_ticks(struct task_struct *p) +{ + return cputime_add(p->utime, p->stime); +} +static inline cputime_t virt_ticks(struct task_struct *p) +{ + return p->utime; +} +static inline unsigned long long sched_ns(struct task_struct *p) +{ + return (p == current) ? current_sched_time(p) : p->sched_time; +} + +int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + int error = check_clock(which_clock); + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. + */ + tp->tv_nsec = 1; + } + } + return error; +} + +int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) +{ + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + int error = check_clock(which_clock); + if (error == 0) { + error = -EPERM; + } + return error; +} + + +/* + * Sample a per-thread clock for the given task. + */ +static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, + union cpu_time_count *cpu) +{ + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = prof_ticks(p); + break; + case CPUCLOCK_VIRT: + cpu->cpu = virt_ticks(p); + break; + case CPUCLOCK_SCHED: + cpu->sched = sched_ns(p); + break; + } + return 0; +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with tasklist_lock held for reading. + * Must be called with tasklist_lock held for reading, and p->sighand->siglock. + */ +static int cpu_clock_sample_group_locked(unsigned int clock_idx, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_struct *t = p; + switch (clock_idx) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); + do { + cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); + t = next_thread(t); + } while (t != p); + break; + case CPUCLOCK_VIRT: + cpu->cpu = p->signal->utime; + do { + cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); + t = next_thread(t); + } while (t != p); + break; + case CPUCLOCK_SCHED: + cpu->sched = p->signal->sched_time; + /* Add in each other live thread. */ + while ((t = next_thread(t)) != p) { + cpu->sched += t->sched_time; + } + if (p->tgid == current->tgid) { + /* + * We're sampling ourselves, so include the + * cycles not yet banked. We still omit + * other threads running on other CPUs, + * so the total can always be behind as + * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). + */ + cpu->sched += current_sched_time(current); + } else { + cpu->sched += p->sched_time; + } + break; + } + return 0; +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_clock_sample_group(clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + int ret; + unsigned long flags; + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, + cpu); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + return ret; +} + + +int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +{ + const pid_t pid = CPUCLOCK_PID(which_clock); + int error = -EINVAL; + union cpu_time_count rtn; + + if (pid == 0) { + /* + * Special case constant value for our own clocks. + * We don't have to do any lookup to find ourselves. + */ + if (CPUCLOCK_PERTHREAD(which_clock)) { + /* + * Sampling just ourselves we can do with no locking. + */ + error = cpu_clock_sample(which_clock, + current, &rtn); + } else { + read_lock(&tasklist_lock); + error = cpu_clock_sample_group(which_clock, + current, &rtn); + read_unlock(&tasklist_lock); + } + } else { + /* + * Find the given PID, and validate that the caller + * should be able to see it. + */ + struct task_struct *p; + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (p) { + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (p->tgid == current->tgid) { + error = cpu_clock_sample(which_clock, + p, &rtn); + } + } else if (p->tgid == pid && p->signal) { + error = cpu_clock_sample_group(which_clock, + p, &rtn); + } + } + read_unlock(&tasklist_lock); + } + + if (error) + return error; + sample_to_timespec(which_clock, rtn, tp); + return 0; +} + + +/* + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create with the new timer already locked. + */ +int posix_cpu_timer_create(struct k_itimer *new_timer) +{ + int ret = 0; + const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + INIT_LIST_HEAD(&new_timer->it.cpu.entry); + new_timer->it.cpu.incr.sched = 0; + new_timer->it.cpu.expires.sched = 0; + + read_lock(&tasklist_lock); + if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { + if (pid == 0) { + p = current; + } else { + p = find_task_by_pid(pid); + if (p && p->tgid != current->tgid) + p = NULL; + } + } else { + if (pid == 0) { + p = current->group_leader; + } else { + p = find_task_by_pid(pid); + if (p && p->tgid != pid) + p = NULL; + } + } + new_timer->it.cpu.task = p; + if (p) { + get_task_struct(p); + } else { + ret = -EINVAL; + } + read_unlock(&tasklist_lock); + + return ret; +} + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +int posix_cpu_timer_del(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + + if (timer->it.cpu.firing) + return TIMER_RETRY; + + if (unlikely(p == NULL)) + return 0; + + if (!list_empty(&timer->it.cpu.entry)) { + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + BUG_ON(!list_empty(&timer->it.cpu.entry)); + } else { + /* + * Take us off the task's timer list. + */ + spin_lock(&p->sighand->siglock); + list_del(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); + } + read_unlock(&tasklist_lock); + } + put_task_struct(p); + + return 0; +} + +/* + * Clean out CPU timers still ticking when a thread exited. The task + * pointer is cleared, and the expiry time is replaced with the residual + * time for later timer_gettime calls to return. + * This must be called with the siglock held. + */ +static void cleanup_timers(struct list_head *head, + cputime_t utime, cputime_t stime, + unsigned long long sched_time) +{ + struct cpu_timer_list *timer, *next; + cputime_t ptime = cputime_add(utime, stime); + + list_for_each_entry_safe(timer, next, head, entry) { + timer->task = NULL; + list_del_init(&timer->entry); + if (cputime_lt(timer->expires.cpu, ptime)) { + timer->expires.cpu = cputime_zero; + } else { + timer->expires.cpu = cputime_sub(timer->expires.cpu, + ptime); + } + } + + ++head; + list_for_each_entry_safe(timer, next, head, entry) { + timer->task = NULL; + list_del_init(&timer->entry); + if (cputime_lt(timer->expires.cpu, utime)) { + timer->expires.cpu = cputime_zero; + } else { + timer->expires.cpu = cputime_sub(timer->expires.cpu, + utime); + } + } + + ++head; + list_for_each_entry_safe(timer, next, head, entry) { + timer->task = NULL; + list_del_init(&timer->entry); + if (timer->expires.sched < sched_time) { + timer->expires.sched = 0; + } else { + timer->expires.sched -= sched_time; + } + } +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. + */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + cleanup_timers(tsk->cpu_timers, + tsk->utime, tsk->stime, tsk->sched_time); + +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + cleanup_timers(tsk->signal->cpu_timers, + cputime_add(tsk->utime, tsk->signal->utime), + cputime_add(tsk->stime, tsk->signal->stime), + tsk->sched_time + tsk->signal->sched_time); +} + + +/* + * Set the expiry times of all the threads in the process so one of them + * will go off before the process cumulative expiry total is reached. + */ +static void process_timer_rebalance(struct task_struct *p, + unsigned int clock_idx, + union cpu_time_count expires, + union cpu_time_count val) +{ + cputime_t ticks, left; + unsigned long long ns, nsleft; + struct task_struct *t = p; + unsigned int nthreads = atomic_read(&p->signal->live); + + switch (clock_idx) { + default: + BUG(); + break; + case CPUCLOCK_PROF: + left = cputime_div(cputime_sub(expires.cpu, val.cpu), + nthreads); + do { + if (!unlikely(t->exit_state)) { + ticks = cputime_add(prof_ticks(t), left); + if (cputime_eq(t->it_prof_expires, + cputime_zero) || + cputime_gt(t->it_prof_expires, ticks)) { + t->it_prof_expires = ticks; + } + } + t = next_thread(t); + } while (t != p); + break; + case CPUCLOCK_VIRT: + left = cputime_div(cputime_sub(expires.cpu, val.cpu), + nthreads); + do { + if (!unlikely(t->exit_state)) { + ticks = cputime_add(virt_ticks(t), left); + if (cputime_eq(t->it_virt_expires, + cputime_zero) || + cputime_gt(t->it_virt_expires, ticks)) { + t->it_virt_expires = ticks; + } + } + t = next_thread(t); + } while (t != p); + break; + case CPUCLOCK_SCHED: + nsleft = expires.sched - val.sched; + do_div(nsleft, nthreads); + do { + if (!unlikely(t->exit_state)) { + ns = t->sched_time + nsleft; + if (t->it_sched_expires == 0 || + t->it_sched_expires > ns) { + t->it_sched_expires = ns; + } + } + t = next_thread(t); + } while (t != p); + break; + } +} + +static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +{ + /* + * That's all for this thread or process. + * We leave our residual in expires to be reported. + */ + put_task_struct(timer->it.cpu.task); + timer->it.cpu.task = NULL; + timer->it.cpu.expires = cpu_time_sub(timer->it_clock, + timer->it.cpu.expires, + now); +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the tasklist_lock held + * for reading, and interrupts disabled. + */ +static void arm_timer(struct k_itimer *timer, union cpu_time_count now) +{ + struct task_struct *p = timer->it.cpu.task; + struct list_head *head, *listpos; + struct cpu_timer_list *const nt = &timer->it.cpu; + struct cpu_timer_list *next; + unsigned long i; + + head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? + p->cpu_timers : p->signal->cpu_timers); + head += CPUCLOCK_WHICH(timer->it_clock); + + BUG_ON(!irqs_disabled()); + spin_lock(&p->sighand->siglock); + + listpos = head; + if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { + list_for_each_entry(next, head, entry) { + if (next->expires.sched > nt->expires.sched) { + listpos = &next->entry; + break; + } + } + } else { + list_for_each_entry(next, head, entry) { + if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { + listpos = &next->entry; + break; + } + } + } + list_add(&nt->entry, listpos); + + if (listpos == head) { + /* + * We are the new earliest-expiring timer. + * If we are a thread timer, there can always + * be a process timer telling us to stop earlier. + */ + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + switch (CPUCLOCK_WHICH(timer->it_clock)) { + default: + BUG(); + case CPUCLOCK_PROF: + if (cputime_eq(p->it_prof_expires, + cputime_zero) || + cputime_gt(p->it_prof_expires, + nt->expires.cpu)) + p->it_prof_expires = nt->expires.cpu; + break; + case CPUCLOCK_VIRT: + if (cputime_eq(p->it_virt_expires, + cputime_zero) || + cputime_gt(p->it_virt_expires, + nt->expires.cpu)) + p->it_virt_expires = nt->expires.cpu; + break; + case CPUCLOCK_SCHED: + if (p->it_sched_expires == 0 || + p->it_sched_expires > nt->expires.sched) + p->it_sched_expires = nt->expires.sched; + break; + } + } else { + /* + * For a process timer, we must balance + * all the live threads' expirations. + */ + switch (CPUCLOCK_WHICH(timer->it_clock)) { + default: + BUG(); + case CPUCLOCK_VIRT: + if (!cputime_eq(p->signal->it_virt_expires, + cputime_zero) && + cputime_lt(p->signal->it_virt_expires, + timer->it.cpu.expires.cpu)) + break; + goto rebalance; + case CPUCLOCK_PROF: + if (!cputime_eq(p->signal->it_prof_expires, + cputime_zero) && + cputime_lt(p->signal->it_prof_expires, + timer->it.cpu.expires.cpu)) + break; + i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + if (i != RLIM_INFINITY && + i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + break; + goto rebalance; + case CPUCLOCK_SCHED: + rebalance: + process_timer_rebalance( + timer->it.cpu.task, + CPUCLOCK_WHICH(timer->it_clock), + timer->it.cpu.expires, now); + break; + } + } + } + + spin_unlock(&p->sighand->siglock); +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + if (unlikely(timer->sigq == NULL)) { + /* + * This a special case for clock_nanosleep, + * not a normal timer from sys_timer_create. + */ + wake_up_process(timer->it_process); + timer->it.cpu.expires.sched = 0; + } else if (timer->it.cpu.incr.sched == 0) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + timer->it.cpu.expires.sched = 0; + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_schedule(timer); + } +} + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) +{ + struct task_struct *p = timer->it.cpu.task; + union cpu_time_count old_expires, new_expires, val; + int ret; + + if (unlikely(p == NULL)) { + /* + * Timer refers to a dead task's clock. + */ + return -ESRCH; + } + + new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + + read_lock(&tasklist_lock); + /* + * We need the tasklist_lock to protect against reaping that + * clears p->signal. If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(p->signal == NULL)) { + read_unlock(&tasklist_lock); + put_task_struct(p); + timer->it.cpu.task = NULL; + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + BUG_ON(!irqs_disabled()); + spin_lock(&p->sighand->siglock); + old_expires = timer->it.cpu.expires; + list_del_init(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &val); + } else { + cpu_clock_sample_group(timer->it_clock, p, &val); + } + + if (old) { + if (old_expires.sched == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has + * overrun already. If it has, + * we'll report it as having overrun + * and with the next reloaded timer + * already ticking, though we are + * swallowing that pending + * notification here to install the + * new setting. + */ + bump_cpu_timer(timer, val); + if (cpu_time_before(timer->it_clock, val, + timer->it.cpu.expires)) { + old_expires = cpu_time_sub( + timer->it_clock, + timer->it.cpu.expires, val); + sample_to_timespec(timer->it_clock, + old_expires, + &old->it_value); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(timer->it.cpu.firing)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + read_unlock(&tasklist_lock); + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + goto out; + } + + if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { + cpu_time_add(timer->it_clock, &new_expires, val); + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + timer->it.cpu.expires = new_expires; + if (new_expires.sched != 0 && + (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && + cpu_time_before(timer->it_clock, val, new_expires)) { + arm_timer(timer, val); + } + + read_unlock(&tasklist_lock); + + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. + */ + timer->it.cpu.incr = timespec_to_sample(timer->it_clock, + &new->it_interval); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. + */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (new_expires.sched != 0 && + (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && + !cpu_time_before(timer->it_clock, val, new_expires)) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + + ret = 0; + out: + if (old) { + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &old->it_interval); + } + return ret; +} + +void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +{ + union cpu_time_count now; + struct task_struct *p = timer->it.cpu.task; + int clear_dead; + + /* + * Easy part: convert the reload time. + */ + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &itp->it_interval); + + if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + + if (unlikely(p == NULL)) { + /* + * This task already died and the timer will never fire. + * In this case, expires is actually the dead value. + */ + dead: + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); + return; + } + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + clear_dead = p->exit_state; + } else { + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + * Call the timer disarmed, nothing else to do. + */ + put_task_struct(p); + timer->it.cpu.task = NULL; + timer->it.cpu.expires.sched = 0; + read_unlock(&tasklist_lock); + goto dead; + } else { + cpu_clock_sample_group(timer->it_clock, p, &now); + clear_dead = (unlikely(p->exit_state) && + thread_group_empty(p)); + } + read_unlock(&tasklist_lock); + } + + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + if (timer->it.cpu.incr.sched == 0 && + cpu_time_before(timer->it_clock, + timer->it.cpu.expires, now)) { + /* + * Do-nothing timer expired and has no reload, + * so it's as if it was never set. + */ + timer->it.cpu.expires.sched = 0; + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + /* + * Account for any expirations and reloads that should + * have happened. + */ + bump_cpu_timer(timer, now); + } + + if (unlikely(clear_dead)) { + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. + */ + clear_dead_task(timer, now); + goto dead; + } + + if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + sample_to_timespec(timer->it_clock, + cpu_time_sub(timer->it_clock, + timer->it.cpu.expires, now), + &itp->it_value); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. + */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct list_head *timers = tsk->cpu_timers; + + tsk->it_prof_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + tsk->it_prof_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + tsk->it_virt_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + tsk->it_virt_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + tsk->it_sched_expires = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (tsk->sched_time < t->expires.sched) { + tsk->it_sched_expires = t->expires.sched; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. + */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct signal_struct *const sig = tsk->signal; + cputime_t utime, stime, ptime, virt_expires, prof_expires; + unsigned long long sched_time, sched_expires; + struct task_struct *t; + struct list_head *timers = sig->cpu_timers; + + /* + * Don't sample the current process CPU clocks if there are no timers. + */ + if (list_empty(&timers[CPUCLOCK_PROF]) && + cputime_eq(sig->it_prof_expires, cputime_zero) && + sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && + list_empty(&timers[CPUCLOCK_VIRT]) && + cputime_eq(sig->it_virt_expires, cputime_zero) && + list_empty(&timers[CPUCLOCK_SCHED])) + return; + + /* + * Collect the current process totals. + */ + utime = sig->utime; + stime = sig->stime; + sched_time = sig->sched_time; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + sched_time += t->sched_time; + t = next_thread(t); + } while (t != tsk); + ptime = cputime_add(utime, stime); + + prof_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(ptime, t->expires.cpu)) { + prof_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + virt_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (cputime_lt(utime, t->expires.cpu)) { + virt_expires = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + sched_expires = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list, + entry); + if (sched_time < t->expires.sched) { + sched_expires = t->expires.sched; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + /* + * Check for the special case process timers. + */ + if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { + if (cputime_ge(ptime, sig->it_prof_expires)) { + /* ITIMER_PROF fires and reloads. */ + sig->it_prof_expires = sig->it_prof_incr; + if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { + sig->it_prof_expires = cputime_add( + sig->it_prof_expires, ptime); + } + __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); + } + if (!cputime_eq(sig->it_prof_expires, cputime_zero) && + (cputime_eq(prof_expires, cputime_zero) || + cputime_lt(sig->it_prof_expires, prof_expires))) { + prof_expires = sig->it_prof_expires; + } + } + if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { + if (cputime_ge(utime, sig->it_virt_expires)) { + /* ITIMER_VIRTUAL fires and reloads. */ + sig->it_virt_expires = sig->it_virt_incr; + if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { + sig->it_virt_expires = cputime_add( + sig->it_virt_expires, utime); + } + __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); + } + if (!cputime_eq(sig->it_virt_expires, cputime_zero) && + (cputime_eq(virt_expires, cputime_zero) || + cputime_lt(sig->it_virt_expires, virt_expires))) { + virt_expires = sig->it_virt_expires; + } + } + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + unsigned long psecs = cputime_to_secs(ptime); + cputime_t x; + if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + if (sig->rlim[RLIMIT_CPU].rlim_cur + < sig->rlim[RLIMIT_CPU].rlim_max) { + sig->rlim[RLIMIT_CPU].rlim_cur++; + } + } + x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + if (cputime_eq(prof_expires, cputime_zero) || + cputime_lt(x, prof_expires)) { + prof_expires = x; + } + } + + if (!cputime_eq(prof_expires, cputime_zero) || + !cputime_eq(virt_expires, cputime_zero) || + sched_expires != 0) { + /* + * Rebalance the threads' expiry times for the remaining + * process CPU timers. + */ + + cputime_t prof_left, virt_left, ticks; + unsigned long long sched_left, sched; + const unsigned int nthreads = atomic_read(&sig->live); + + prof_left = cputime_sub(prof_expires, utime); + prof_left = cputime_sub(prof_left, stime); + prof_left = cputime_div(prof_left, nthreads); + virt_left = cputime_sub(virt_expires, utime); + virt_left = cputime_div(virt_left, nthreads); + if (sched_expires) { + sched_left = sched_expires - sched_time; + do_div(sched_left, nthreads); + } else { + sched_left = 0; + } + t = tsk; + do { + ticks = cputime_add(cputime_add(t->utime, t->stime), + prof_left); + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(t->it_prof_expires, cputime_zero) || + cputime_gt(t->it_prof_expires, ticks))) { + t->it_prof_expires = ticks; + } + + ticks = cputime_add(t->utime, virt_left); + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(t->it_virt_expires, cputime_zero) || + cputime_gt(t->it_virt_expires, ticks))) { + t->it_virt_expires = ticks; + } + + sched = t->sched_time + sched_left; + if (sched_expires && (t->it_sched_expires == 0 || + t->it_sched_expires > sched)) { + t->it_sched_expires = sched; + } + + do { + t = next_thread(t); + } while (unlikely(t->exit_state)); + } while (t != tsk); + } +} + +/* + * This is called from the signal code (via do_schedule_next_timer) + * when the last timer signal was delivered and we have to reload the timer. + */ +void posix_cpu_timer_schedule(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + union cpu_time_count now; + + if (unlikely(p == NULL)) + /* + * The task was cleaned up already, no future firings. + */ + return; + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + if (unlikely(p->exit_state)) { + clear_dead_task(timer, now); + return; + } + read_lock(&tasklist_lock); /* arm_timer needs it. */ + } else { + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + */ + put_task_struct(p); + timer->it.cpu.task = p = NULL; + timer->it.cpu.expires.sched = 0; + read_unlock(&tasklist_lock); + return; + } else if (unlikely(p->exit_state) && thread_group_empty(p)) { + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. + */ + clear_dead_task(timer, now); + read_unlock(&tasklist_lock); + return; + } + cpu_clock_sample_group(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + /* Leave the tasklist_lock locked for the call below. */ + } + + /* + * Now re-arm for the new expiry time. + */ + arm_timer(timer, now); + + read_unlock(&tasklist_lock); +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + LIST_HEAD(firing); + struct k_itimer *timer, *next; + + BUG_ON(!irqs_disabled()); + +#define UNEXPIRED(clock) \ + (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ + cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) + + if (UNEXPIRED(prof) && UNEXPIRED(virt) && + (tsk->it_sched_expires == 0 || + tsk->sched_time < tsk->it_sched_expires)) + return; + +#undef UNEXPIRED + + BUG_ON(tsk->exit_state); + + /* + * Double-check with locks held. + */ + read_lock(&tasklist_lock); + spin_lock(&tsk->sighand->siglock); + + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); + + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + + /* + * Now that all the timers on our list have the firing flag, + * noone will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + int firing; + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.entry); + firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(firing >= 0)) { + cpu_timer_fire(timer); + } + spin_unlock(&timer->it_lock); + } +} + +/* + * Set one of the process-wide special case CPU timers. + * The tasklist_lock and tsk->sighand->siglock must be held by the caller. + * The oldval argument is null for the RLIMIT_CPU timer, where *newval is + * absolute; non-null for ITIMER_*, where *newval is relative and we update + * it to be absolute, *oldval is absolute and we update it to be relative. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, + cputime_t *newval, cputime_t *oldval) +{ + union cpu_time_count now; + struct list_head *head; + + BUG_ON(clock_idx == CPUCLOCK_SCHED); + cpu_clock_sample_group_locked(clock_idx, tsk, &now); + + if (oldval) { + if (!cputime_eq(*oldval, cputime_zero)) { + if (cputime_le(*oldval, now.cpu)) { + /* Just about to fire. */ + *oldval = jiffies_to_cputime(1); + } else { + *oldval = cputime_sub(*oldval, now.cpu); + } + } + + if (cputime_eq(*newval, cputime_zero)) + return; + *newval = cputime_add(*newval, now.cpu); + + /* + * If the RLIMIT_CPU timer will expire before the + * ITIMER_PROF timer, we have nothing else to do. + */ + if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur + < cputime_to_secs(*newval)) + return; + } + + /* + * Check whether there are any process timers already set to fire + * before this one. If so, we don't have anything more to do. + */ + head = &tsk->signal->cpu_timers[clock_idx]; + if (list_empty(head) || + cputime_ge(list_entry(head->next, + struct cpu_timer_list, entry)->expires.cpu, + *newval)) { + /* + * Rejigger each thread's expiry time so that one will + * notice before we hit the process-cumulative expiry time. + */ + union cpu_time_count expires = { .sched = 0 }; + expires.cpu = *newval; + process_timer_rebalance(tsk, clock_idx, expires, now); + } +} + +static long posix_cpu_clock_nanosleep_restart(struct restart_block *); + +int posix_cpu_nsleep(clockid_t which_clock, int flags, + struct timespec *rqtp) +{ + struct restart_block *restart_block = + ¤t_thread_info()->restart_block; + struct k_itimer timer; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + /* + * Set up a temporary timer and then wait for it to go off. + */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + if (!error) { + struct timespec __user *rmtp; + static struct itimerspec zero_it; + struct itimerspec it = { .it_value = *rqtp, + .it_interval = {} }; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (timer.it.cpu.expires.sched == 0) { + /* + * Our timer fired and was reset. + */ + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + posix_cpu_timer_set(&timer, 0, &zero_it, &it); + spin_unlock_irq(&timer.it_lock); + + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + /* + * It actually did fire already. + */ + return 0; + } + + /* + * Report back to the user the time still remaining. + */ + rmtp = (struct timespec __user *) restart_block->arg1; + if (rmtp != NULL && !(flags & TIMER_ABSTIME) && + copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_clock_nanosleep_restart; + /* Caller already set restart_block->arg1 */ + restart_block->arg0 = which_clock; + restart_block->arg2 = rqtp->tv_sec; + restart_block->arg3 = rqtp->tv_nsec; + + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +static long +posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->arg0; + struct timespec t = { .tv_sec = restart_block->arg2, + .tv_nsec = restart_block->arg3 }; + restart_block->fn = do_no_restart_syscall; + return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); +} + + +#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(clockid_t which_clock, int flags, + struct timespec *rqtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); +} +static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} +static int thread_cpu_nsleep(clockid_t which_clock, int flags, + struct timespec *rqtp) +{ + return -EINVAL; +} + +static __init int init_posix_cpu_timers(void) +{ + struct k_clock process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + }; + struct k_clock thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = thread_cpu_timer_create, + .nsleep = thread_cpu_nsleep, + }; + + register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + + return 0; +} +__initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c new file mode 100644 index 00000000000..fd316c27226 --- /dev/null +++ b/kernel/posix-timers.c @@ -0,0 +1,1584 @@ +/* + * linux/kernel/posix_timers.c + * + * + * 2002-10-15 Posix Clocks & timers + * by George Anzinger george@mvista.com + * + * Copyright (C) 2002 2003 by MontaVista Software. + * + * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. + * Copyright (C) 2004 Boris Hu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA + */ + +/* These are all the functions necessary to implement + * POSIX clocks & timers + */ +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/time.h> + +#include <asm/uaccess.h> +#include <asm/semaphore.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/idr.h> +#include <linux/posix-timers.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/module.h> + +#ifndef div_long_long_rem +#include <asm/div64.h> + +#define div_long_long_rem(dividend,divisor,remainder) ({ \ + u64 result = dividend; \ + *remainder = do_div(result,divisor); \ + result; }) + +#endif +#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */ + +static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) +{ + return (u64)mpy1 * mpy2; +} +/* + * Management arrays for POSIX timers. Timers are kept in slab memory + * Timer ids are allocated by an external routine that keeps track of the + * id and the timer. The external interface is: + * + * void *idr_find(struct idr *idp, int id); to find timer_id <id> + * int idr_get_new(struct idr *idp, void *ptr); to get a new id and + * related it to <ptr> + * void idr_remove(struct idr *idp, int id); to release <id> + * void idr_init(struct idr *idp); to initialize <idp> + * which we supply. + * The idr_get_new *may* call slab for more memory so it must not be + * called under a spin lock. Likewise idr_remore may release memory + * (but it may be ok to do this under a lock...). + * idr_find is just a memory look up and is quite fast. A -1 return + * indicates that the requested id does not exist. + */ + +/* + * Lets keep our timers in a slab cache :-) + */ +static kmem_cache_t *posix_timers_cache; +static struct idr posix_timers_id; +static DEFINE_SPINLOCK(idr_lock); + +/* + * Just because the timer is not in the timer list does NOT mean it is + * inactive. It could be in the "fire" routine getting a new expire time. + */ +#define TIMER_INACTIVE 1 + +#ifdef CONFIG_SMP +# define timer_active(tmr) \ + ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) +# define set_timer_inactive(tmr) \ + do { \ + (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ + } while (0) +#else +# define timer_active(tmr) BARFY // error to use outside of SMP +# define set_timer_inactive(tmr) do { } while (0) +#endif +/* + * we assume that the new SIGEV_THREAD_ID shares no bits with the other + * SIGEV values. Here we put out an error if this assumption fails. + */ +#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) +#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" +#endif + + +/* + * The timer ID is turned into a timer address by idr_find(). + * Verifying a valid ID consists of: + * + * a) checking that idr_find() returns other than -1. + * b) checking that the timer id matches the one in the timer itself. + * c) that the timer owner is in the callers thread group. + */ + +/* + * CLOCKs: The POSIX standard calls for a couple of clocks and allows us + * to implement others. This structure defines the various + * clocks and allows the possibility of adding others. We + * provide an interface to add clocks to the table and expect + * the "arch" code to add at least one clock that is high + * resolution. Here we define the standard CLOCK_REALTIME as a + * 1/HZ resolution clock. + * + * RESOLUTION: Clock resolution is used to round up timer and interval + * times, NOT to report clock times, which are reported with as + * much resolution as the system can muster. In some cases this + * resolution may depend on the underlying clock hardware and + * may not be quantifiable until run time, and only then is the + * necessary code is written. The standard says we should say + * something about this issue in the documentation... + * + * FUNCTIONS: The CLOCKs structure defines possible functions to handle + * various clock functions. For clocks that use the standard + * system timer code these entries should be NULL. This will + * allow dispatch without the overhead of indirect function + * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) + * must supply functions here, even if the function just returns + * ENOSYS. The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for the + * timer. 2.) The list, it_lock, it_clock, it_id and it_process + * fields are not modified by timer code. + * + * At this time all functions EXCEPT clock_nanosleep can be + * redirected by the CLOCKS structure. Clock_nanosleep is in + * there, but the code ignores it. + * + * Permissions: It is assumed that the clock_settime() function defined + * for each clock will take care of permission checks. Some + * clocks may be set able by any user (i.e. local process + * clocks) others not. Currently the only set able clock we + * have is CLOCK_REALTIME and its high res counter part, both of + * which we beg off on and pass to do_sys_settimeofday(). + */ + +static struct k_clock posix_clocks[MAX_CLOCKS]; +/* + * We only have one real clock that can be set so we need only one abs list, + * even if we should want to have several clocks with differing resolutions. + */ +static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), + .lock = SPIN_LOCK_UNLOCKED}; + +static void posix_timer_fn(unsigned long); +static u64 do_posix_clock_monotonic_gettime_parts( + struct timespec *tp, struct timespec *mo); +int do_posix_clock_monotonic_gettime(struct timespec *tp); +static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); + +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); + +static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +{ + spin_unlock_irqrestore(&timr->it_lock, flags); +} + +/* + * Call the k_clock hook function if non-null, or the default function. + */ +#define CLOCK_DISPATCH(clock, call, arglist) \ + ((clock) < 0 ? posix_cpu_##call arglist : \ + (posix_clocks[clock].call != NULL \ + ? (*posix_clocks[clock].call) arglist : common_##call arglist)) + +/* + * Default clock hook functions when the struct k_clock passed + * to register_posix_clock leaves a function pointer null. + * + * The function common_CALL is the default implementation for + * the function pointer CALL in struct k_clock. + */ + +static inline int common_clock_getres(clockid_t which_clock, + struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = posix_clocks[which_clock].res; + return 0; +} + +static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) +{ + getnstimeofday(tp); + return 0; +} + +static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + +static inline int common_timer_create(struct k_itimer *new_timer) +{ + INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); + init_timer(&new_timer->it.real.timer); + new_timer->it.real.timer.data = (unsigned long) new_timer; + new_timer->it.real.timer.function = posix_timer_fn; + set_timer_inactive(new_timer); + return 0; +} + +/* + * These ones are defined below. + */ +static int common_nsleep(clockid_t, int flags, struct timespec *t); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); + +/* + * Return nonzero iff we know a priori this clockid_t value is bogus. + */ +static inline int invalid_clockid(clockid_t which_clock) +{ + if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ + return 0; + if ((unsigned) which_clock >= MAX_CLOCKS) + return 1; + if (posix_clocks[which_clock].clock_getres != NULL) + return 0; +#ifndef CLOCK_DISPATCH_DIRECT + if (posix_clocks[which_clock].res != 0) + return 0; +#endif + return 1; +} + + +/* + * Initialize everything, well, just everything in Posix clocks/timers ;) + */ +static __init int init_posix_timers(void) +{ + struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, + .abs_struct = &abs_list + }; + struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, + .abs_struct = NULL, + .clock_get = do_posix_clock_monotonic_get, + .clock_set = do_posix_clock_nosettime + }; + + register_posix_clock(CLOCK_REALTIME, &clock_realtime); + register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof (struct k_itimer), 0, 0, NULL, NULL); + idr_init(&posix_timers_id); + return 0; +} + +__initcall(init_posix_timers); + +static void tstojiffie(struct timespec *tp, int res, u64 *jiff) +{ + long sec = tp->tv_sec; + long nsec = tp->tv_nsec + res - 1; + + if (nsec > NSEC_PER_SEC) { + sec++; + nsec -= NSEC_PER_SEC; + } + + /* + * The scaling constants are defined in <linux/time.h> + * The difference between there and here is that we do the + * res rounding and compute a 64-bit result (well so does that + * but it then throws away the high bits). + */ + *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + + (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} + +/* + * This function adjusts the timer as needed as a result of the clock + * being set. It should only be called for absolute timers, and then + * under the abs_list lock. It computes the time difference and sets + * the new jiffies value in the timer. It also updates the timers + * reference wall_to_monotonic value. It is complicated by the fact + * that tstojiffies() only handles positive times and it needs to work + * with both positive and negative times. Also, for negative offsets, + * we need to defeat the res round up. + * + * Return is true if there is a new time, else false. + */ +static long add_clockset_delta(struct k_itimer *timr, + struct timespec *new_wall_to) +{ + struct timespec delta; + int sign = 0; + u64 exp; + + set_normalized_timespec(&delta, + new_wall_to->tv_sec - + timr->it.real.wall_to_prev.tv_sec, + new_wall_to->tv_nsec - + timr->it.real.wall_to_prev.tv_nsec); + if (likely(!(delta.tv_sec | delta.tv_nsec))) + return 0; + if (delta.tv_sec < 0) { + set_normalized_timespec(&delta, + -delta.tv_sec, + 1 - delta.tv_nsec - + posix_clocks[timr->it_clock].res); + sign++; + } + tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); + timr->it.real.wall_to_prev = *new_wall_to; + timr->it.real.timer.expires += (sign ? -exp : exp); + return 1; +} + +static void remove_from_abslist(struct k_itimer *timr) +{ + if (!list_empty(&timr->it.real.abs_timer_entry)) { + spin_lock(&abs_list.lock); + list_del_init(&timr->it.real.abs_timer_entry); + spin_unlock(&abs_list.lock); + } +} + +static void schedule_next_timer(struct k_itimer *timr) +{ + struct timespec new_wall_to; + struct now_struct now; + unsigned long seq; + + /* + * Set up the timer for the next interval (if there is one). + * Note: this code uses the abs_timer_lock to protect + * it.real.wall_to_prev and must hold it until exp is set, not exactly + * obvious... + + * This function is used for CLOCK_REALTIME* and + * CLOCK_MONOTONIC* timers. If we ever want to handle other + * CLOCKs, the calling code (do_schedule_next_timer) would need + * to pull the "clock" info from the timer and dispatch the + * "other" CLOCKs "next timer" code (which, I suppose should + * also be added to the k_clock structure). + */ + if (!timr->it.real.incr) + return; + + do { + seq = read_seqbegin(&xtime_lock); + new_wall_to = wall_to_monotonic; + posix_get_now(&now); + } while (read_seqretry(&xtime_lock, seq)); + + if (!list_empty(&timr->it.real.abs_timer_entry)) { + spin_lock(&abs_list.lock); + add_clockset_delta(timr, &new_wall_to); + + posix_bump_timer(timr, now); + + spin_unlock(&abs_list.lock); + } else { + posix_bump_timer(timr, now); + } + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; + add_timer(&timr->it.real.timer); +} + +/* + * This function is exported for use by the signal deliver code. It is + * called just prior to the info block being released and passes that + * block to us. It's function is to update the overrun entry AND to + * restart the timer. It should only be called if the timer is to be + * restarted (i.e. we have flagged this in the sys_private entry of the + * info block). + * + * To protect aginst the timer going away while the interrupt is queued, + * we require that the it_requeue_pending flag be set. + */ +void do_schedule_next_timer(struct siginfo *info) +{ + struct k_itimer *timr; + unsigned long flags; + + timr = lock_timer(info->si_tid, &flags); + + if (!timr || timr->it_requeue_pending != info->si_sys_private) + goto exit; + + if (timr->it_clock < 0) /* CPU clock */ + posix_cpu_timer_schedule(timr); + else + schedule_next_timer(timr); + info->si_overrun = timr->it_overrun_last; +exit: + if (timr) + unlock_timer(timr, flags); +} + +int posix_timer_event(struct k_itimer *timr,int si_private) +{ + memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + timr->sigq->info.si_sys_private = si_private; + /* + * Send signal to the process that owns this timer. + + * This code assumes that all the possible abs_lists share the + * same lock (there is only one list at this time). If this is + * not the case, the CLOCK info would need to be used to find + * the proper abs list lock. + */ + + timr->sigq->info.si_signo = timr->it_sigev_signo; + timr->sigq->info.si_errno = 0; + timr->sigq->info.si_code = SI_TIMER; + timr->sigq->info.si_tid = timr->it_id; + timr->sigq->info.si_value = timr->it_sigev_value; + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { + if (unlikely(timr->it_process->flags & PF_EXITING)) { + timr->it_sigev_notify = SIGEV_SIGNAL; + put_task_struct(timr->it_process); + timr->it_process = timr->it_process->group_leader; + goto group; + } + return send_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + } + else { + group: + return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + } +} +EXPORT_SYMBOL_GPL(posix_timer_event); + +/* + * This function gets called when a POSIX.1b interval timer expires. It + * is used as a callback from the kernel internal timer. The + * run_timer_list code ALWAYS calls with interrupts on. + + * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + */ +static void posix_timer_fn(unsigned long __data) +{ + struct k_itimer *timr = (struct k_itimer *) __data; + unsigned long flags; + unsigned long seq; + struct timespec delta, new_wall_to; + u64 exp = 0; + int do_notify = 1; + + spin_lock_irqsave(&timr->it_lock, flags); + set_timer_inactive(timr); + if (!list_empty(&timr->it.real.abs_timer_entry)) { + spin_lock(&abs_list.lock); + do { + seq = read_seqbegin(&xtime_lock); + new_wall_to = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&delta, + new_wall_to.tv_sec - + timr->it.real.wall_to_prev.tv_sec, + new_wall_to.tv_nsec - + timr->it.real.wall_to_prev.tv_nsec); + if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { + /* do nothing, timer is on time */ + } else if (delta.tv_sec < 0) { + /* do nothing, timer is already late */ + } else { + /* timer is early due to a clock set */ + tstojiffie(&delta, + posix_clocks[timr->it_clock].res, + &exp); + timr->it.real.wall_to_prev = new_wall_to; + timr->it.real.timer.expires += exp; + add_timer(&timr->it.real.timer); + do_notify = 0; + } + spin_unlock(&abs_list.lock); + + } + if (do_notify) { + int si_private=0; + + if (timr->it.real.incr) + si_private = ++timr->it_requeue_pending; + else { + remove_from_abslist(timr); + } + + if (posix_timer_event(timr, si_private)) + /* + * signal was not sent because of sig_ignor + * we will not get a call back to restart it AND + * it should be restarted. + */ + schedule_next_timer(timr); + } + unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ +} + + +static inline struct task_struct * good_sigevent(sigevent_t * event) +{ + struct task_struct *rtn = current->group_leader; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && + (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || + rtn->tgid != current->tgid || + (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + return NULL; + + if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && + ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + return NULL; + + return rtn; +} + +void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) +{ + if ((unsigned) clock_id >= MAX_CLOCKS) { + printk("POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + posix_clocks[clock_id] = *new_clock; +} +EXPORT_SYMBOL_GPL(register_posix_clock); + +static struct k_itimer * alloc_posix_timer(void) +{ + struct k_itimer *tmr; + tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) + return tmr; + memset(tmr, 0, sizeof (struct k_itimer)); + if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + kmem_cache_free(posix_timers_cache, tmr); + tmr = NULL; + } + return tmr; +} + +#define IT_ID_SET 1 +#define IT_ID_NOT_SET 0 +static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +{ + if (it_id_set) { + unsigned long flags; + spin_lock_irqsave(&idr_lock, flags); + idr_remove(&posix_timers_id, tmr->it_id); + spin_unlock_irqrestore(&idr_lock, flags); + } + sigqueue_free(tmr->sigq); + if (unlikely(tmr->it_process) && + tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(tmr->it_process); + kmem_cache_free(posix_timers_cache, tmr); +} + +/* Create a POSIX.1b interval timer. */ + +asmlinkage long +sys_timer_create(clockid_t which_clock, + struct sigevent __user *timer_event_spec, + timer_t __user * created_timer_id) +{ + int error = 0; + struct k_itimer *new_timer = NULL; + int new_timer_id; + struct task_struct *process = NULL; + unsigned long flags; + sigevent_t event; + int it_id_set = IT_ID_NOT_SET; + + if (invalid_clockid(which_clock)) + return -EINVAL; + + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + retry: + if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { + error = -EAGAIN; + goto out; + } + spin_lock_irq(&idr_lock); + error = idr_get_new(&posix_timers_id, + (void *) new_timer, + &new_timer_id); + spin_unlock_irq(&idr_lock); + if (error == -EAGAIN) + goto retry; + else if (error) { + /* + * Wierd looking, but we return EAGAIN if the IDR is + * full (proper POSIX return value for this) + */ + error = -EAGAIN; + goto out; + } + + it_id_set = IT_ID_SET; + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; + new_timer->it_overrun = -1; + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + if (error) + goto out; + + /* + * return the timer_id now. The next step is hard to + * back out if there is an error. + */ + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + if (timer_event_spec) { + if (copy_from_user(&event, timer_event_spec, sizeof (event))) { + error = -EFAULT; + goto out; + } + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->it_sigev_signo = event.sigev_signo; + new_timer->it_sigev_value = event.sigev_value; + + read_lock(&tasklist_lock); + if ((process = good_sigevent(&event))) { + /* + * We may be setting up this process for another + * thread. It may be exiting. To catch this + * case the we check the PF_EXITING flag. If + * the flag is not set, the siglock will catch + * him before it is too late (in exit_itimers). + * + * The exec case is a bit more invloved but easy + * to code. If the process is in our thread + * group (and it must be or we would not allow + * it here) and is doing an exec, it will cause + * us to be killed. In this case it will wait + * for us to die which means we can finish this + * linkage with our last gasp. I.e. no code :) + */ + spin_lock_irqsave(&process->sighand->siglock, flags); + if (!(process->flags & PF_EXITING)) { + new_timer->it_process = process; + list_add(&new_timer->list, + &process->signal->posix_timers); + spin_unlock_irqrestore(&process->sighand->siglock, flags); + if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + get_task_struct(process); + } else { + spin_unlock_irqrestore(&process->sighand->siglock, flags); + process = NULL; + } + } + read_unlock(&tasklist_lock); + if (!process) { + error = -EINVAL; + goto out; + } + } else { + new_timer->it_sigev_notify = SIGEV_SIGNAL; + new_timer->it_sigev_signo = SIGALRM; + new_timer->it_sigev_value.sival_int = new_timer->it_id; + process = current->group_leader; + spin_lock_irqsave(&process->sighand->siglock, flags); + new_timer->it_process = process; + list_add(&new_timer->list, &process->signal->posix_timers); + spin_unlock_irqrestore(&process->sighand->siglock, flags); + } + + /* + * In the case of the timer belonging to another task, after + * the task is unlocked, the timer is owned by the other task + * and may cease to exist at any time. Don't use or modify + * new_timer after the unlock call. + */ + +out: + if (error) + release_posix_timer(new_timer, it_id_set); + + return error; +} + +/* + * good_timespec + * + * This function checks the elements of a timespec structure. + * + * Arguments: + * ts : Pointer to the timespec structure to check + * + * Return value: + * If a NULL pointer was passed in, or the tv_nsec field was less than 0 + * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0, + * this function returns 0. Otherwise it returns 1. + */ +static int good_timespec(const struct timespec *ts) +{ + if ((!ts) || (ts->tv_sec < 0) || + ((unsigned) ts->tv_nsec >= NSEC_PER_SEC)) + return 0; + return 1; +} + +/* + * Locking issues: We need to protect the result of the id look up until + * we get the timer locked down so it is not deleted under us. The + * removal is done under the idr spinlock so we use that here to bridge + * the find to the timer lock. To avoid a dead lock, the timer id MUST + * be release with out holding the timer lock. + */ +static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +{ + struct k_itimer *timr; + /* + * Watch out here. We do a irqsave on the idr_lock and pass the + * flags part over to the timer lock. Must not let interrupts in + * while we are moving the lock. + */ + + spin_lock_irqsave(&idr_lock, *flags); + timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); + if (timr) { + spin_lock(&timr->it_lock); + spin_unlock(&idr_lock); + + if ((timr->it_id != timer_id) || !(timr->it_process) || + timr->it_process->tgid != current->tgid) { + unlock_timer(timr, *flags); + timr = NULL; + } + } else + spin_unlock_irqrestore(&idr_lock, *flags); + + return timr; +} + +/* + * Get the time remaining on a POSIX.1b interval timer. This function + * is ALWAYS called with spin_lock_irq on the timer, thus it must not + * mess with irq. + * + * We have a couple of messes to clean up here. First there is the case + * of a timer that has a requeue pending. These timers should appear to + * be in the timer list with an expiry as if we were to requeue them + * now. + * + * The second issue is the SIGEV_NONE timer which may be active but is + * not really ever put in the timer list (to save system resources). + * This timer may be expired, and if so, we will do it here. Otherwise + * it is the same as a requeue pending timer WRT to what we should + * report. + */ +static void +common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +{ + unsigned long expires; + struct now_struct now; + + do + expires = timr->it.real.timer.expires; + while ((volatile long) (timr->it.real.timer.expires) != expires); + + posix_get_now(&now); + + if (expires && + ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && + !timr->it.real.incr && + posix_time_before(&timr->it.real.timer, &now)) + timr->it.real.timer.expires = expires = 0; + if (expires) { + if (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + posix_bump_timer(timr, now); + expires = timr->it.real.timer.expires; + } + else + if (!timer_pending(&timr->it.real.timer)) + expires = 0; + if (expires) + expires -= now.jiffies; + } + jiffies_to_timespec(expires, &cur_setting->it_value); + jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); + + if (cur_setting->it_value.tv_sec < 0) { + cur_setting->it_value.tv_nsec = 1; + cur_setting->it_value.tv_sec = 0; + } +} + +/* Get the time remaining on a POSIX.1b interval timer. */ +asmlinkage long +sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) +{ + struct k_itimer *timr; + struct itimerspec cur_setting; + unsigned long flags; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + + unlock_timer(timr, flags); + + if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + return -EFAULT; + + return 0; +} +/* + * Get the number of overruns of a POSIX.1b interval timer. This is to + * be the overrun of the timer last delivered. At the same time we are + * accumulating overruns on the next timer. The overrun is frozen when + * the signal is delivered, either at the notify time (if the info block + * is not queued) or at the actual delivery time (as we are informed by + * the call back to do_schedule_next_timer(). So all we need to do is + * to pick up the frozen overrun. + */ + +asmlinkage long +sys_timer_getoverrun(timer_t timer_id) +{ + struct k_itimer *timr; + int overrun; + long flags; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + overrun = timr->it_overrun_last; + unlock_timer(timr, flags); + + return overrun; +} +/* + * Adjust for absolute time + * + * If absolute time is given and it is not CLOCK_MONOTONIC, we need to + * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and + * what ever clock he is using. + * + * If it is relative time, we need to add the current (CLOCK_MONOTONIC) + * time to it to get the proper time for the timer. + */ +static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, + int abs, u64 *exp, struct timespec *wall_to) +{ + struct timespec now; + struct timespec oc = *tp; + u64 jiffies_64_f; + int rtn =0; + + if (abs) { + /* + * The mask pick up the 4 basic clocks + */ + if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) { + jiffies_64_f = do_posix_clock_monotonic_gettime_parts( + &now, wall_to); + /* + * If we are doing a MONOTONIC clock + */ + if((clock - &posix_clocks[0]) & CLOCKS_MONO){ + now.tv_sec += wall_to->tv_sec; + now.tv_nsec += wall_to->tv_nsec; + } + } else { + /* + * Not one of the basic clocks + */ + clock->clock_get(clock - posix_clocks, &now); + jiffies_64_f = get_jiffies_64(); + } + /* + * Take away now to get delta + */ + oc.tv_sec -= now.tv_sec; + oc.tv_nsec -= now.tv_nsec; + /* + * Normalize... + */ + while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { + oc.tv_nsec -= NSEC_PER_SEC; + oc.tv_sec++; + } + while ((oc.tv_nsec) < 0) { + oc.tv_nsec += NSEC_PER_SEC; + oc.tv_sec--; + } + }else{ + jiffies_64_f = get_jiffies_64(); + } + /* + * Check if the requested time is prior to now (if so set now) + */ + if (oc.tv_sec < 0) + oc.tv_sec = oc.tv_nsec = 0; + + if (oc.tv_sec | oc.tv_nsec) + set_normalized_timespec(&oc, oc.tv_sec, + oc.tv_nsec + clock->res); + tstojiffie(&oc, clock->res, exp); + + /* + * Check if the requested time is more than the timer code + * can handle (if so we error out but return the value too). + */ + if (*exp > ((u64)MAX_JIFFY_OFFSET)) + /* + * This is a considered response, not exactly in + * line with the standard (in fact it is silent on + * possible overflows). We assume such a large + * value is ALMOST always a programming error and + * try not to compound it by setting a really dumb + * value. + */ + rtn = -EINVAL; + /* + * return the actual jiffies expire time, full 64 bits + */ + *exp += jiffies_64_f; + return rtn; +} + +/* Set a POSIX.1b interval timer. */ +/* timr->it_lock is taken. */ +static inline int +common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, struct itimerspec *old_setting) +{ + struct k_clock *clock = &posix_clocks[timr->it_clock]; + u64 expire_64; + + if (old_setting) + common_timer_get(timr, old_setting); + + /* disable the timer */ + timr->it.real.incr = 0; + /* + * careful here. If smp we could be in the "fire" routine which will + * be spinning as we hold the lock. But this is ONLY an SMP issue. + */ +#ifdef CONFIG_SMP + if (timer_active(timr) && !del_timer(&timr->it.real.timer)) + /* + * It can only be active if on an other cpu. Since + * we have cleared the interval stuff above, it should + * clear once we release the spin lock. Of course once + * we do that anything could happen, including the + * complete melt down of the timer. So return with + * a "retry" exit status. + */ + return TIMER_RETRY; + + set_timer_inactive(timr); +#else + del_timer(&timr->it.real.timer); +#endif + remove_from_abslist(timr); + + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timr->it_overrun_last = 0; + timr->it_overrun = -1; + /* + *switch off the timer when it_value is zero + */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { + timr->it.real.timer.expires = 0; + return 0; + } + + if (adjust_abs_time(clock, + &new_setting->it_value, flags & TIMER_ABSTIME, + &expire_64, &(timr->it.real.wall_to_prev))) { + return -EINVAL; + } + timr->it.real.timer.expires = (unsigned long)expire_64; + tstojiffie(&new_setting->it_interval, clock->res, &expire_64); + timr->it.real.incr = (unsigned long)expire_64; + + /* + * We do not even queue SIGEV_NONE timers! But we do put them + * in the abs list so we can do that right. + */ + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) + add_timer(&timr->it.real.timer); + + if (flags & TIMER_ABSTIME && clock->abs_struct) { + spin_lock(&clock->abs_struct->lock); + list_add_tail(&(timr->it.real.abs_timer_entry), + &(clock->abs_struct->list)); + spin_unlock(&clock->abs_struct->lock); + } + return 0; +} + +/* Set a POSIX.1b interval timer */ +asmlinkage long +sys_timer_settime(timer_t timer_id, int flags, + const struct itimerspec __user *new_setting, + struct itimerspec __user *old_setting) +{ + struct k_itimer *timr; + struct itimerspec new_spec, old_spec; + int error = 0; + long flag; + struct itimerspec *rtn = old_setting ? &old_spec : NULL; + + if (!new_setting) + return -EINVAL; + + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + + if ((!good_timespec(&new_spec.it_interval)) || + (!good_timespec(&new_spec.it_value))) + return -EINVAL; +retry: + timr = lock_timer(timer_id, &flag); + if (!timr) + return -EINVAL; + + error = CLOCK_DISPATCH(timr->it_clock, timer_set, + (timr, flags, &new_spec, rtn)); + + unlock_timer(timr, flag); + if (error == TIMER_RETRY) { + rtn = NULL; // We already got the old time... + goto retry; + } + + if (old_setting && !error && copy_to_user(old_setting, + &old_spec, sizeof (old_spec))) + error = -EFAULT; + + return error; +} + +static inline int common_timer_del(struct k_itimer *timer) +{ + timer->it.real.incr = 0; +#ifdef CONFIG_SMP + if (timer_active(timer) && !del_timer(&timer->it.real.timer)) + /* + * It can only be active if on an other cpu. Since + * we have cleared the interval stuff above, it should + * clear once we release the spin lock. Of course once + * we do that anything could happen, including the + * complete melt down of the timer. So return with + * a "retry" exit status. + */ + return TIMER_RETRY; +#else + del_timer(&timer->it.real.timer); +#endif + remove_from_abslist(timer); + + return 0; +} + +static inline int timer_delete_hook(struct k_itimer *timer) +{ + return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); +} + +/* Delete a POSIX.1b interval timer. */ +asmlinkage long +sys_timer_delete(timer_t timer_id) +{ + struct k_itimer *timer; + long flags; + +#ifdef CONFIG_SMP + int error; +retry_delete: +#endif + timer = lock_timer(timer_id, &flags); + if (!timer) + return -EINVAL; + +#ifdef CONFIG_SMP + error = timer_delete_hook(timer); + + if (error == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } +#else + timer_delete_hook(timer); +#endif + spin_lock(¤t->sighand->siglock); + list_del(&timer->list); + spin_unlock(¤t->sighand->siglock); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + if (timer->it_process) { + if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(timer->it_process); + timer->it_process = NULL; + } + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); + return 0; +} +/* + * return timer owned by the process, used by exit_itimers + */ +static inline void itimer_delete(struct k_itimer *timer) +{ + unsigned long flags; + +#ifdef CONFIG_SMP + int error; +retry_delete: +#endif + spin_lock_irqsave(&timer->it_lock, flags); + +#ifdef CONFIG_SMP + error = timer_delete_hook(timer); + + if (error == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } +#else + timer_delete_hook(timer); +#endif + list_del(&timer->list); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + if (timer->it_process) { + if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(timer->it_process); + timer->it_process = NULL; + } + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); +} + +/* + * This is called by __exit_signal, only when there are no more + * references to the shared signal_struct. + */ +void exit_itimers(struct signal_struct *sig) +{ + struct k_itimer *tmr; + + while (!list_empty(&sig->posix_timers)) { + tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + itimer_delete(tmr); + } +} + +/* + * And now for the "clock" calls + * + * These functions are called both from timer functions (with the timer + * spin_lock_irq() held and from clock calls with no locking. They must + * use the save flags versions of locks. + */ + +/* + * We do ticks here to avoid the irq lock ( they take sooo long). + * The seqlock is great here. Since we a reader, we don't really care + * if we are interrupted since we don't take lock that will stall us or + * any other cpu. Voila, no irq lock is needed. + * + */ + +static u64 do_posix_clock_monotonic_gettime_parts( + struct timespec *tp, struct timespec *mo) +{ + u64 jiff; + unsigned int seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(tp); + *mo = wall_to_monotonic; + jiff = jiffies_64; + + } while(read_seqretry(&xtime_lock, seq)); + + return jiff; +} + +static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) +{ + struct timespec wall_to_mono; + + do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); + + tp->tv_sec += wall_to_mono.tv_sec; + tp->tv_nsec += wall_to_mono.tv_nsec; + + if ((tp->tv_nsec - NSEC_PER_SEC) > 0) { + tp->tv_nsec -= NSEC_PER_SEC; + tp->tv_sec++; + } + return 0; +} + +int do_posix_clock_monotonic_gettime(struct timespec *tp) +{ + return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); +} + +int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); + +int do_posix_clock_notimer_create(struct k_itimer *timer) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); + +int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) +{ +#ifndef ENOTSUP + return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ +#else /* parisc does define it separately. */ + return -ENOTSUP; +#endif +} +EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); + +asmlinkage long +sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) +{ + struct timespec new_tp; + + if (invalid_clockid(which_clock)) + return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + + return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); +} + +asmlinkage long +sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) +{ + struct timespec kernel_tp; + int error; + + if (invalid_clockid(which_clock)) + return -EINVAL; + error = CLOCK_DISPATCH(which_clock, clock_get, + (which_clock, &kernel_tp)); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + error = -EFAULT; + + return error; + +} + +asmlinkage long +sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) +{ + struct timespec rtn_tp; + int error; + + if (invalid_clockid(which_clock)) + return -EINVAL; + + error = CLOCK_DISPATCH(which_clock, clock_getres, + (which_clock, &rtn_tp)); + + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + error = -EFAULT; + } + + return error; +} + +static void nanosleep_wake_up(unsigned long __data) +{ + struct task_struct *p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * The standard says that an absolute nanosleep call MUST wake up at + * the requested time in spite of clock settings. Here is what we do: + * For each nanosleep call that needs it (only absolute and not on + * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure + * into the "nanosleep_abs_list". All we need is the task_struct pointer. + * When ever the clock is set we just wake up all those tasks. The rest + * is done by the while loop in clock_nanosleep(). + * + * On locking, clock_was_set() is called from update_wall_clock which + * holds (or has held for it) a write_lock_irq( xtime_lock) and is + * called from the timer bh code. Thus we need the irq save locks. + * + * Also, on the call from update_wall_clock, that is done as part of a + * softirq thing. We don't want to delay the system that much (possibly + * long list of timers to fix), so we defer that work to keventd. + */ + +static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); +static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); + +static DECLARE_MUTEX(clock_was_set_lock); + +void clock_was_set(void) +{ + struct k_itimer *timr; + struct timespec new_wall_to; + LIST_HEAD(cws_list); + unsigned long seq; + + + if (unlikely(in_interrupt())) { + schedule_work(&clock_was_set_work); + return; + } + wake_up_all(&nanosleep_abs_wqueue); + + /* + * Check if there exist TIMER_ABSTIME timers to correct. + * + * Notes on locking: This code is run in task context with irq + * on. We CAN be interrupted! All other usage of the abs list + * lock is under the timer lock which holds the irq lock as + * well. We REALLY don't want to scan the whole list with the + * interrupt system off, AND we would like a sequence lock on + * this code as well. Since we assume that the clock will not + * be set often, it seems ok to take and release the irq lock + * for each timer. In fact add_timer will do this, so this is + * not an issue. So we know when we are done, we will move the + * whole list to a new location. Then as we process each entry, + * we will move it to the actual list again. This way, when our + * copy is empty, we are done. We are not all that concerned + * about preemption so we will use a semaphore lock to protect + * aginst reentry. This way we will not stall another + * processor. It is possible that this may delay some timers + * that should have expired, given the new clock, but even this + * will be minimal as we will always update to the current time, + * even if it was set by a task that is waiting for entry to + * this code. Timers that expire too early will be caught by + * the expire code and restarted. + + * Absolute timers that repeat are left in the abs list while + * waiting for the task to pick up the signal. This means we + * may find timers that are not in the "add_timer" list, but are + * in the abs list. We do the same thing for these, save + * putting them back in the "add_timer" list. (Note, these are + * left in the abs list mainly to indicate that they are + * ABSOLUTE timers, a fact that is used by the re-arm code, and + * for which we have no other flag.) + + */ + + down(&clock_was_set_lock); + spin_lock_irq(&abs_list.lock); + list_splice_init(&abs_list.list, &cws_list); + spin_unlock_irq(&abs_list.lock); + do { + do { + seq = read_seqbegin(&xtime_lock); + new_wall_to = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + spin_lock_irq(&abs_list.lock); + if (list_empty(&cws_list)) { + spin_unlock_irq(&abs_list.lock); + break; + } + timr = list_entry(cws_list.next, struct k_itimer, + it.real.abs_timer_entry); + + list_del_init(&timr->it.real.abs_timer_entry); + if (add_clockset_delta(timr, &new_wall_to) && + del_timer(&timr->it.real.timer)) /* timer run yet? */ + add_timer(&timr->it.real.timer); + list_add(&timr->it.real.abs_timer_entry, &abs_list.list); + spin_unlock_irq(&abs_list.lock); + } while (1); + + up(&clock_was_set_lock); +} + +long clock_nanosleep_restart(struct restart_block *restart_block); + +asmlinkage long +sys_clock_nanosleep(clockid_t which_clock, int flags, + const struct timespec __user *rqtp, + struct timespec __user *rmtp) +{ + struct timespec t; + struct restart_block *restart_block = + &(current_thread_info()->restart_block); + int ret; + + if (invalid_clockid(which_clock)) + return -EINVAL; + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + + if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) + return -EINVAL; + + /* + * Do this here as nsleep function does not have the real address. + */ + restart_block->arg1 = (unsigned long)rmtp; + + ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); + + if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && + copy_to_user(rmtp, &t, sizeof (t))) + return -EFAULT; + return ret; +} + + +static int common_nsleep(clockid_t which_clock, + int flags, struct timespec *tsave) +{ + struct timespec t, dum; + struct timer_list new_timer; + DECLARE_WAITQUEUE(abs_wqueue, current); + u64 rq_time = (u64)0; + s64 left; + int abs; + struct restart_block *restart_block = + ¤t_thread_info()->restart_block; + + abs_wqueue.flags = 0; + init_timer(&new_timer); + new_timer.expires = 0; + new_timer.data = (unsigned long) current; + new_timer.function = nanosleep_wake_up; + abs = flags & TIMER_ABSTIME; + + if (restart_block->fn == clock_nanosleep_restart) { + /* + * Interrupted by a non-delivered signal, pick up remaining + * time and continue. Remaining time is in arg2 & 3. + */ + restart_block->fn = do_no_restart_syscall; + + rq_time = restart_block->arg3; + rq_time = (rq_time << 32) + restart_block->arg2; + if (!rq_time) + return -EINTR; + left = rq_time - get_jiffies_64(); + if (left <= (s64)0) + return 0; /* Already passed */ + } + + if (abs && (posix_clocks[which_clock].clock_get != + posix_clocks[CLOCK_MONOTONIC].clock_get)) + add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue); + + do { + t = *tsave; + if (abs || !rq_time) { + adjust_abs_time(&posix_clocks[which_clock], &t, abs, + &rq_time, &dum); + } + + left = rq_time - get_jiffies_64(); + if (left >= (s64)MAX_JIFFY_OFFSET) + left = (s64)MAX_JIFFY_OFFSET; + if (left < (s64)0) + break; + + new_timer.expires = jiffies + left; + __set_current_state(TASK_INTERRUPTIBLE); + add_timer(&new_timer); + + schedule(); + + del_timer_sync(&new_timer); + left = rq_time - get_jiffies_64(); + } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); + + if (abs_wqueue.task_list.next) + finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); + + if (left > (s64)0) { + + /* + * Always restart abs calls from scratch to pick up any + * clock shifting that happened while we are away. + */ + if (abs) + return -ERESTARTNOHAND; + + left *= TICK_NSEC; + tsave->tv_sec = div_long_long_rem(left, + NSEC_PER_SEC, + &tsave->tv_nsec); + /* + * Restart works by saving the time remaing in + * arg2 & 3 (it is 64-bits of jiffies). The other + * info we need is the clock_id (saved in arg0). + * The sys_call interface needs the users + * timespec return address which _it_ saves in arg1. + * Since we have cast the nanosleep call to a clock_nanosleep + * both can be restarted with the same code. + */ + restart_block->fn = clock_nanosleep_restart; + restart_block->arg0 = which_clock; + /* + * Caller sets arg1 + */ + restart_block->arg2 = rq_time & 0xffffffffLL; + restart_block->arg3 = rq_time >> 32; + + return -ERESTART_RESTARTBLOCK; + } + + return 0; +} +/* + * This will restart clock_nanosleep. + */ +long +clock_nanosleep_restart(struct restart_block *restart_block) +{ + struct timespec t; + int ret = common_nsleep(restart_block->arg0, 0, &t); + + if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && + copy_to_user((struct timespec __user *)(restart_block->arg1), &t, + sizeof (t))) + return -EFAULT; + return ret; +} diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig new file mode 100644 index 00000000000..696387ffe49 --- /dev/null +++ b/kernel/power/Kconfig @@ -0,0 +1,74 @@ +config PM + bool "Power Management support" + ---help--- + "Power Management" means that parts of your computer are shut + off or put into a power conserving "sleep" mode if they are not + being used. There are two competing standards for doing this: APM + and ACPI. If you want to use either one, say Y here and then also + to the requisite support below. + + Power Management is most important for battery powered laptop + computers; if you have a laptop, check out the Linux Laptop home + page on the WWW at <http://www.linux-on-laptops.com/> or + Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> + and the Battery Powered Linux mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. + + Note that, even if you say N here, Linux on the x86 architecture + will issue the hlt instruction if nothing is to be done, thereby + sending the processor to sleep and saving power. + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables verbose debugging support in the Power Management + code. This is helpful when debugging and reporting various PM bugs, + like suspend support. + +config SOFTWARE_SUSPEND + bool "Software Suspend (EXPERIMENTAL)" + depends on EXPERIMENTAL && PM && SWAP + ---help--- + Enable the possibility of suspending the machine. + It doesn't need APM. + You may suspend your machine by 'swsusp' or 'shutdown -z <time>' + (patch for sysvinit needed). + + It creates an image which is saved in your active swap. Upon next + boot, pass the 'resume=/dev/swappartition' argument to the kernel to + have it detect the saved image, restore memory state from it, and + continue to run as before. If you do not want the previous state to + be reloaded, then use the 'noresume' kernel argument. However, note + that your partitions will be fsck'd and you must re-mkswap your swap + partitions. It does not work with swap files. + + Right now you may boot without resuming and then later resume but + in meantime you cannot use those swap partitions/files which were + involved in suspending. Also in this case there is a risk that buffers + on disk won't match with saved ones. + + For more information take a look at <file:Documentation/power/swsusp.txt>. + +config PM_STD_PARTITION + string "Default resume partition" + depends on SOFTWARE_SUSPEND + default "" + ---help--- + The default resume partition is the partition that the suspend- + to-disk implementation will look for a suspended disk image. + + The partition specified here will be different for almost every user. + It should be a valid swap partition (at least for now) that is turned + on before suspending. + + The partition specified can be overridden by specifying: + + resume=/dev/<other device> + + which will set the resume partition to the device specified. + + Note there is currently not a way to specify which device to save the + suspended image to. It will simply pick the first available swap + device. + diff --git a/kernel/power/Makefile b/kernel/power/Makefile new file mode 100644 index 00000000000..fbdc634135a --- /dev/null +++ b/kernel/power/Makefile @@ -0,0 +1,11 @@ + +ifeq ($(CONFIG_PM_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif + +swsusp-smp-$(CONFIG_SMP) += smp.o + +obj-y := main.o process.o console.o pm.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o + +obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/console.c b/kernel/power/console.c new file mode 100644 index 00000000000..7ff375e7c95 --- /dev/null +++ b/kernel/power/console.c @@ -0,0 +1,58 @@ +/* + * drivers/power/process.c - Functions for saving/restoring console. + * + * Originally from swsusp. + */ + +#include <linux/vt_kern.h> +#include <linux/kbd_kern.h> +#include <linux/console.h> +#include "power.h" + +static int new_loglevel = 10; +static int orig_loglevel; +#ifdef SUSPEND_CONSOLE +static int orig_fgconsole, orig_kmsg; +#endif + +int pm_prepare_console(void) +{ + orig_loglevel = console_loglevel; + console_loglevel = new_loglevel; + +#ifdef SUSPEND_CONSOLE + acquire_console_sem(); + + orig_fgconsole = fg_console; + + if (vc_allocate(SUSPEND_CONSOLE)) { + /* we can't have a free VC for now. Too bad, + * we don't want to mess the screen for now. */ + release_console_sem(); + return 1; + } + + set_console(SUSPEND_CONSOLE); + release_console_sem(); + + if (vt_waitactive(SUSPEND_CONSOLE)) { + pr_debug("Suspend: Can't switch VCs."); + return 1; + } + orig_kmsg = kmsg_redirect; + kmsg_redirect = SUSPEND_CONSOLE; +#endif + return 0; +} + +void pm_restore_console(void) +{ + console_loglevel = orig_loglevel; +#ifdef SUSPEND_CONSOLE + acquire_console_sem(); + set_console(orig_fgconsole); + release_console_sem(); + kmsg_redirect = orig_kmsg; +#endif + return; +} diff --git a/kernel/power/disk.c b/kernel/power/disk.c new file mode 100644 index 00000000000..02b6764034d --- /dev/null +++ b/kernel/power/disk.c @@ -0,0 +1,431 @@ +/* + * kernel/power/disk.c - Suspend-to-disk support. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * + * This file is released under the GPLv2. + * + */ + +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/fs.h> +#include "power.h" + + +extern suspend_disk_method_t pm_disk_mode; +extern struct pm_ops * pm_ops; + +extern int swsusp_suspend(void); +extern int swsusp_write(void); +extern int swsusp_check(void); +extern int swsusp_read(void); +extern void swsusp_close(void); +extern int swsusp_resume(void); +extern int swsusp_free(void); + + +static int noresume = 0; +char resume_file[256] = CONFIG_PM_STD_PARTITION; +dev_t swsusp_resume_device; + +/** + * power_down - Shut machine down for hibernate. + * @mode: Suspend-to-disk mode + * + * Use the platform driver, if configured so, and return gracefully if it + * fails. + * Otherwise, try to power off and reboot. If they fail, halt the machine, + * there ain't no turning back. + */ + +static void power_down(suspend_disk_method_t mode) +{ + unsigned long flags; + int error = 0; + + local_irq_save(flags); + switch(mode) { + case PM_DISK_PLATFORM: + device_shutdown(); + error = pm_ops->enter(PM_SUSPEND_DISK); + break; + case PM_DISK_SHUTDOWN: + printk("Powering off system\n"); + device_shutdown(); + machine_power_off(); + break; + case PM_DISK_REBOOT: + device_shutdown(); + machine_restart(NULL); + break; + } + machine_halt(); + /* Valid image is on the disk, if we continue we risk serious data corruption + after resume. */ + printk(KERN_CRIT "Please power me down manually\n"); + while(1); +} + + +static int in_suspend __nosavedata = 0; + + +/** + * free_some_memory - Try to free as much memory as possible + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped at this point, or + * livelock is possible. + */ + +static void free_some_memory(void) +{ + unsigned int i = 0; + unsigned int tmp; + unsigned long pages = 0; + char *p = "-\\|/"; + + printk("Freeing memory... "); + while ((tmp = shrink_all_memory(10000))) { + pages += tmp; + printk("\b%c", p[i]); + i++; + if (i > 3) + i = 0; + } + printk("\bdone (%li pages freed)\n", pages); +} + + +static inline void platform_finish(void) +{ + if (pm_disk_mode == PM_DISK_PLATFORM) { + if (pm_ops && pm_ops->finish) + pm_ops->finish(PM_SUSPEND_DISK); + } +} + +static void finish(void) +{ + device_resume(); + platform_finish(); + enable_nonboot_cpus(); + thaw_processes(); + pm_restore_console(); +} + + +static int prepare_processes(void) +{ + int error; + + pm_prepare_console(); + + sys_sync(); + + if (freeze_processes()) { + error = -EBUSY; + return error; + } + + if (pm_disk_mode == PM_DISK_PLATFORM) { + if (pm_ops && pm_ops->prepare) { + if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) + return error; + } + } + + /* Free memory before shutting down devices. */ + free_some_memory(); + + return 0; +} + +static void unprepare_processes(void) +{ + enable_nonboot_cpus(); + thaw_processes(); + pm_restore_console(); +} + +static int prepare_devices(void) +{ + int error; + + disable_nonboot_cpus(); + if ((error = device_suspend(PMSG_FREEZE))) { + printk("Some devices failed to suspend\n"); + platform_finish(); + enable_nonboot_cpus(); + return error; + } + + return 0; +} + +/** + * pm_suspend_disk - The granpappy of power management. + * + * If we're going through the firmware, then get it over with quickly. + * + * If not, then call swsusp to do its thing, then figure out how + * to power down the system. + */ + +int pm_suspend_disk(void) +{ + int error; + + error = prepare_processes(); + if (!error) { + error = prepare_devices(); + } + + if (error) { + unprepare_processes(); + return error; + } + + pr_debug("PM: Attempting to suspend to disk.\n"); + if (pm_disk_mode == PM_DISK_FIRMWARE) + return pm_ops->enter(PM_SUSPEND_DISK); + + pr_debug("PM: snapshotting memory.\n"); + in_suspend = 1; + if ((error = swsusp_suspend())) + goto Done; + + if (in_suspend) { + pr_debug("PM: writing image.\n"); + error = swsusp_write(); + if (!error) + power_down(pm_disk_mode); + } else + pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); + Done: + finish(); + return error; +} + + +/** + * software_resume - Resume from a saved image. + * + * Called as a late_initcall (so all devices are discovered and + * initialized), we call swsusp to see if we have a saved image or not. + * If so, we quiesce devices, the restore the saved image. We will + * return above (in pm_suspend_disk() ) if everything goes well. + * Otherwise, we fail gracefully and return to the normally + * scheduled program. + * + */ + +static int software_resume(void) +{ + int error; + + if (noresume) { + /** + * FIXME: If noresume is specified, we need to find the partition + * and reset it back to normal swap space. + */ + return 0; + } + + pr_debug("PM: Checking swsusp image.\n"); + + if ((error = swsusp_check())) + goto Done; + + pr_debug("PM: Preparing processes for restore.\n"); + + if ((error = prepare_processes())) { + swsusp_close(); + goto Cleanup; + } + + pr_debug("PM: Reading swsusp image.\n"); + + if ((error = swsusp_read())) + goto Cleanup; + + pr_debug("PM: Preparing devices for restore.\n"); + + if ((error = prepare_devices())) + goto Free; + + mb(); + + pr_debug("PM: Restoring saved image.\n"); + swsusp_resume(); + pr_debug("PM: Restore failed, recovering.n"); + finish(); + Free: + swsusp_free(); + Cleanup: + unprepare_processes(); + Done: + pr_debug("PM: Resume from disk failed.\n"); + return 0; +} + +late_initcall(software_resume); + + +static char * pm_disk_modes[] = { + [PM_DISK_FIRMWARE] = "firmware", + [PM_DISK_PLATFORM] = "platform", + [PM_DISK_SHUTDOWN] = "shutdown", + [PM_DISK_REBOOT] = "reboot", +}; + +/** + * disk - Control suspend-to-disk mode + * + * Suspend-to-disk can be handled in several ways. The greatest + * distinction is who writes memory to disk - the firmware or the OS. + * If the firmware does it, we assume that it also handles suspending + * the system. + * If the OS does it, then we have three options for putting the system + * to sleep - using the platform driver (e.g. ACPI or other PM registers), + * powering off the system or rebooting the system (for testing). + * + * The system will support either 'firmware' or 'platform', and that is + * known a priori (and encoded in pm_ops). But, the user may choose + * 'shutdown' or 'reboot' as alternatives. + * + * show() will display what the mode is currently set to. + * store() will accept one of + * + * 'firmware' + * 'platform' + * 'shutdown' + * 'reboot' + * + * It will only change to 'firmware' or 'platform' if the system + * supports it (as determined from pm_ops->pm_disk_mode). + */ + +static ssize_t disk_show(struct subsystem * subsys, char * buf) +{ + return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); +} + + +static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) +{ + int error = 0; + int i; + int len; + char *p; + suspend_disk_method_t mode = 0; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + down(&pm_sem); + for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { + if (!strncmp(buf, pm_disk_modes[i], len)) { + mode = i; + break; + } + } + if (mode) { + if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) + pm_disk_mode = mode; + else { + if (pm_ops && pm_ops->enter && + (mode == pm_ops->pm_disk_mode)) + pm_disk_mode = mode; + else + error = -EINVAL; + } + } else + error = -EINVAL; + + pr_debug("PM: suspend-to-disk mode set to '%s'\n", + pm_disk_modes[mode]); + up(&pm_sem); + return error ? error : n; +} + +power_attr(disk); + +static ssize_t resume_show(struct subsystem * subsys, char *buf) +{ + return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); +} + +static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) +{ + int len; + char *p; + unsigned int maj, min; + int error = -EINVAL; + dev_t res; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + if (sscanf(buf, "%u:%u", &maj, &min) == 2) { + res = MKDEV(maj,min); + if (maj == MAJOR(res) && min == MINOR(res)) { + swsusp_resume_device = res; + printk("Attempting manual resume\n"); + noresume = 0; + software_resume(); + } + } + + return error >= 0 ? n : error; +} + +power_attr(resume); + +static struct attribute * g[] = { + &disk_attr.attr, + &resume_attr.attr, + NULL, +}; + + +static struct attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_disk_init(void) +{ + return sysfs_create_group(&power_subsys.kset.kobj,&attr_group); +} + +core_initcall(pm_disk_init); + + +static int __init resume_setup(char *str) +{ + if (noresume) + return 1; + + strncpy( resume_file, str, 255 ); + return 1; +} + +static int __init noresume_setup(char *str) +{ + noresume = 1; + return 1; +} + +__setup("noresume", noresume_setup); +__setup("resume=", resume_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c new file mode 100644 index 00000000000..7960ddf04a5 --- /dev/null +++ b/kernel/power/main.c @@ -0,0 +1,269 @@ +/* + * kernel/power/main.c - PM subsystem core functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * + * This file is released under the GPLv2 + * + */ + +#include <linux/suspend.h> +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/pm.h> + + +#include "power.h" + +DECLARE_MUTEX(pm_sem); + +struct pm_ops * pm_ops = NULL; +suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; + +/** + * pm_set_ops - Set the global power method table. + * @ops: Pointer to ops structure. + */ + +void pm_set_ops(struct pm_ops * ops) +{ + down(&pm_sem); + pm_ops = ops; + up(&pm_sem); +} + + +/** + * suspend_prepare - Do prep work before entering low-power state. + * @state: State we're entering. + * + * This is common code that is called for each state that we're + * entering. Allocate a console, stop all processes, then make sure + * the platform can enter the requested state. + */ + +static int suspend_prepare(suspend_state_t state) +{ + int error = 0; + + if (!pm_ops || !pm_ops->enter) + return -EPERM; + + pm_prepare_console(); + + if (freeze_processes()) { + error = -EAGAIN; + goto Thaw; + } + + if (pm_ops->prepare) { + if ((error = pm_ops->prepare(state))) + goto Thaw; + } + + if ((error = device_suspend(PMSG_SUSPEND))) { + printk(KERN_ERR "Some devices failed to suspend\n"); + goto Finish; + } + return 0; + Finish: + if (pm_ops->finish) + pm_ops->finish(state); + Thaw: + thaw_processes(); + pm_restore_console(); + return error; +} + + +static int suspend_enter(suspend_state_t state) +{ + int error = 0; + unsigned long flags; + + local_irq_save(flags); + + if ((error = device_power_down(PMSG_SUSPEND))) { + printk(KERN_ERR "Some devices failed to power down\n"); + goto Done; + } + error = pm_ops->enter(state); + device_power_up(); + Done: + local_irq_restore(flags); + return error; +} + + +/** + * suspend_finish - Do final work before exiting suspend sequence. + * @state: State we're coming out of. + * + * Call platform code to clean up, restart processes, and free the + * console that we've allocated. This is not called for suspend-to-disk. + */ + +static void suspend_finish(suspend_state_t state) +{ + device_resume(); + if (pm_ops && pm_ops->finish) + pm_ops->finish(state); + thaw_processes(); + pm_restore_console(); +} + + + + +static char * pm_states[] = { + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", + [PM_SUSPEND_DISK] = "disk", + NULL, +}; + + +/** + * enter_state - Do common work of entering low-power state. + * @state: pm_state structure for state we're entering. + * + * Make sure we're the only ones trying to enter a sleep state. Fail + * if someone has beat us to it, since we don't want anything weird to + * happen when we wake up. + * Then, do the setup for suspend, enter the state, and cleaup (after + * we've woken up). + */ + +static int enter_state(suspend_state_t state) +{ + int error; + + if (down_trylock(&pm_sem)) + return -EBUSY; + + if (state == PM_SUSPEND_DISK) { + error = pm_suspend_disk(); + goto Unlock; + } + + /* Suspend is hard to get right on SMP. */ + if (num_online_cpus() != 1) { + error = -EPERM; + goto Unlock; + } + + pr_debug("PM: Preparing system for suspend\n"); + if ((error = suspend_prepare(state))) + goto Unlock; + + pr_debug("PM: Entering state.\n"); + error = suspend_enter(state); + + pr_debug("PM: Finishing up.\n"); + suspend_finish(state); + Unlock: + up(&pm_sem); + return error; +} + +/* + * This is main interface to the outside world. It needs to be + * called from process context. + */ +int software_suspend(void) +{ + return enter_state(PM_SUSPEND_DISK); +} + + +/** + * pm_suspend - Externally visible function for suspending system. + * @state: Enumarted value of state to enter. + * + * Determine whether or not value is within range, get state + * structure, and enter (above). + */ + +int pm_suspend(suspend_state_t state) +{ + if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) + return enter_state(state); + return -EINVAL; +} + + + +decl_subsys(power,NULL,NULL); + + +/** + * state - control system power state. + * + * show() returns what states are supported, which is hard-coded to + * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and + * 'disk' (Suspend-to-Disk). + * + * store() accepts one of those strings, translates it into the + * proper enumerated value, and initiates a suspend transition. + */ + +static ssize_t state_show(struct subsystem * subsys, char * buf) +{ + int i; + char * s = buf; + + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (pm_states[i]) + s += sprintf(s,"%s ",pm_states[i]); + } + s += sprintf(s,"\n"); + return (s - buf); +} + +static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) +{ + suspend_state_t state = PM_SUSPEND_STANDBY; + char ** s; + char *p; + int error; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { + if (*s && !strncmp(buf, *s, len)) + break; + } + if (*s) + error = enter_state(state); + else + error = -EINVAL; + return error ? error : n; +} + +power_attr(state); + +static struct attribute * g[] = { + &state_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_init(void) +{ + int error = subsystem_register(&power_subsys); + if (!error) + error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + return error; +} + +core_initcall(pm_init); diff --git a/kernel/power/pm.c b/kernel/power/pm.c new file mode 100644 index 00000000000..61deda04e39 --- /dev/null +++ b/kernel/power/pm.c @@ -0,0 +1,265 @@ +/* + * pm.c - Power management interface + * + * Copyright (C) 2000 Andrew Henroid + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/pm.h> +#include <linux/interrupt.h> + +int pm_active; + +/* + * Locking notes: + * pm_devs_lock can be a semaphore providing pm ops are not called + * from an interrupt handler (already a bad idea so no change here). Each + * change must be protected so that an unlink of an entry doesn't clash + * with a pm send - which is permitted to sleep in the current architecture + * + * Module unloads clashing with pm events now work out safely, the module + * unload path will block until the event has been sent. It may well block + * until a resume but that will be fine. + */ + +static DECLARE_MUTEX(pm_devs_lock); +static LIST_HEAD(pm_devs); + +/** + * pm_register - register a device with power management + * @type: device type + * @id: device ID + * @callback: callback function + * + * Add a device to the list of devices that wish to be notified about + * power management events. A &pm_dev structure is returned on success, + * on failure the return is %NULL. + * + * The callback function will be called in process context and + * it may sleep. + */ + +struct pm_dev *pm_register(pm_dev_t type, + unsigned long id, + pm_callback callback) +{ + struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); + if (dev) { + memset(dev, 0, sizeof(*dev)); + dev->type = type; + dev->id = id; + dev->callback = callback; + + down(&pm_devs_lock); + list_add(&dev->entry, &pm_devs); + up(&pm_devs_lock); + } + return dev; +} + +/** + * pm_unregister - unregister a device with power management + * @dev: device to unregister + * + * Remove a device from the power management notification lists. The + * dev passed must be a handle previously returned by pm_register. + */ + +void pm_unregister(struct pm_dev *dev) +{ + if (dev) { + down(&pm_devs_lock); + list_del(&dev->entry); + up(&pm_devs_lock); + + kfree(dev); + } +} + +static void __pm_unregister(struct pm_dev *dev) +{ + if (dev) { + list_del(&dev->entry); + kfree(dev); + } +} + +/** + * pm_unregister_all - unregister all devices with matching callback + * @callback: callback function pointer + * + * Unregister every device that would call the callback passed. This + * is primarily meant as a helper function for loadable modules. It + * enables a module to give up all its managed devices without keeping + * its own private list. + */ + +void pm_unregister_all(pm_callback callback) +{ + struct list_head *entry; + + if (!callback) + return; + + down(&pm_devs_lock); + entry = pm_devs.next; + while (entry != &pm_devs) { + struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); + entry = entry->next; + if (dev->callback == callback) + __pm_unregister(dev); + } + up(&pm_devs_lock); +} + +/** + * pm_send - send request to a single device + * @dev: device to send to + * @rqst: power management request + * @data: data for the callback + * + * Issue a power management request to a given device. The + * %PM_SUSPEND and %PM_RESUME events are handled specially. The + * data field must hold the intended next state. No call is made + * if the state matches. + * + * BUGS: what stops two power management requests occurring in parallel + * and conflicting. + * + * WARNING: Calling pm_send directly is not generally recommended, in + * particular there is no locking against the pm_dev going away. The + * caller must maintain all needed locking or have 'inside knowledge' + * on the safety. Also remember that this function is not locked against + * pm_unregister. This means that you must handle SMP races on callback + * execution and unload yourself. + */ + +static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) +{ + int status = 0; + unsigned long prev_state, next_state; + + if (in_interrupt()) + BUG(); + + switch (rqst) { + case PM_SUSPEND: + case PM_RESUME: + prev_state = dev->state; + next_state = (unsigned long) data; + if (prev_state != next_state) { + if (dev->callback) + status = (*dev->callback)(dev, rqst, data); + if (!status) { + dev->state = next_state; + dev->prev_state = prev_state; + } + } + else { + dev->prev_state = prev_state; + } + break; + default: + if (dev->callback) + status = (*dev->callback)(dev, rqst, data); + break; + } + return status; +} + +/* + * Undo incomplete request + */ +static void pm_undo_all(struct pm_dev *last) +{ + struct list_head *entry = last->entry.prev; + while (entry != &pm_devs) { + struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); + if (dev->state != dev->prev_state) { + /* previous state was zero (running) resume or + * previous state was non-zero (suspended) suspend + */ + pm_request_t undo = (dev->prev_state + ? PM_SUSPEND:PM_RESUME); + pm_send(dev, undo, (void*) dev->prev_state); + } + entry = entry->prev; + } +} + +/** + * pm_send_all - send request to all managed devices + * @rqst: power management request + * @data: data for the callback + * + * Issue a power management request to a all devices. The + * %PM_SUSPEND events are handled specially. Any device is + * permitted to fail a suspend by returning a non zero (error) + * value from its callback function. If any device vetoes a + * suspend request then all other devices that have suspended + * during the processing of this request are restored to their + * previous state. + * + * WARNING: This function takes the pm_devs_lock. The lock is not dropped until + * the callbacks have completed. This prevents races against pm locking + * functions, races against module unload pm_unregister code. It does + * mean however that you must not issue pm_ functions within the callback + * or you will deadlock and users will hate you. + * + * Zero is returned on success. If a suspend fails then the status + * from the device that vetoes the suspend is returned. + * + * BUGS: what stops two power management requests occurring in parallel + * and conflicting. + */ + +int pm_send_all(pm_request_t rqst, void *data) +{ + struct list_head *entry; + + down(&pm_devs_lock); + entry = pm_devs.next; + while (entry != &pm_devs) { + struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); + if (dev->callback) { + int status = pm_send(dev, rqst, data); + if (status) { + /* return devices to previous state on + * failed suspend request + */ + if (rqst == PM_SUSPEND) + pm_undo_all(dev); + up(&pm_devs_lock); + return status; + } + } + entry = entry->next; + } + up(&pm_devs_lock); + return 0; +} + +EXPORT_SYMBOL(pm_register); +EXPORT_SYMBOL(pm_unregister); +EXPORT_SYMBOL(pm_unregister_all); +EXPORT_SYMBOL(pm_send_all); +EXPORT_SYMBOL(pm_active); + + diff --git a/kernel/power/power.h b/kernel/power/power.h new file mode 100644 index 00000000000..cd6a3493cc0 --- /dev/null +++ b/kernel/power/power.h @@ -0,0 +1,52 @@ +#include <linux/suspend.h> +#include <linux/utsname.h> + +/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but + we probably do not take enough locks for switching consoles, etc, + so bad things might happen. +*/ +#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) +#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) +#endif + + +struct swsusp_info { + struct new_utsname uts; + u32 version_code; + unsigned long num_physpages; + int cpus; + unsigned long image_pages; + unsigned long pagedir_pages; + suspend_pagedir_t * suspend_pagedir; + swp_entry_t pagedir[768]; +} __attribute__((aligned(PAGE_SIZE))); + + + +#ifdef CONFIG_SOFTWARE_SUSPEND +extern int pm_suspend_disk(void); + +#else +static inline int pm_suspend_disk(void) +{ + return -EPERM; +} +#endif +extern struct semaphore pm_sem; +#define power_attr(_name) \ +static struct subsys_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = 0644, \ + }, \ + .show = _name##_show, \ + .store = _name##_store, \ +} + +extern struct subsystem power_subsys; + +extern int freeze_processes(void); +extern void thaw_processes(void); + +extern int pm_prepare_console(void); +extern void pm_restore_console(void); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c new file mode 100644 index 00000000000..715081b2d82 --- /dev/null +++ b/kernel/power/poweroff.c @@ -0,0 +1,45 @@ +/* + * poweroff.c - sysrq handler to gracefully power down machine. + * + * This file is released under the GPL v2 + */ + +#include <linux/kernel.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/workqueue.h> + +/* + * When the user hits Sys-Rq o to power down the machine this is the + * callback we use. + */ + +static void do_poweroff(void *dummy) +{ + if (pm_power_off) + pm_power_off(); +} + +static DECLARE_WORK(poweroff_work, do_poweroff, NULL); + +static void handle_poweroff(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + schedule_work(&poweroff_work); +} + +static struct sysrq_key_op sysrq_poweroff_op = { + .handler = handle_poweroff, + .help_msg = "powerOff", + .action_msg = "Power Off", + .enable_mask = SYSRQ_ENABLE_BOOT, +}; + +static int pm_sysrq_init(void) +{ + register_sysrq_key('o', &sysrq_poweroff_op); + return 0; +} + +subsys_initcall(pm_sysrq_init); diff --git a/kernel/power/process.c b/kernel/power/process.c new file mode 100644 index 00000000000..78d92dc6a1e --- /dev/null +++ b/kernel/power/process.c @@ -0,0 +1,121 @@ +/* + * drivers/power/process.c - Functions for starting/stopping processes on + * suspend transitions. + * + * Originally from swsusp. + */ + + +#undef DEBUG + +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/suspend.h> +#include <linux/module.h> + +/* + * Timeout for stopping processes + */ +#define TIMEOUT (6 * HZ) + + +static inline int freezeable(struct task_struct * p) +{ + if ((p == current) || + (p->flags & PF_NOFREEZE) || + (p->exit_state == EXIT_ZOMBIE) || + (p->exit_state == EXIT_DEAD) || + (p->state == TASK_STOPPED) || + (p->state == TASK_TRACED)) + return 0; + return 1; +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(unsigned long flag) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? */ + long save; + save = current->state; + current->state = TASK_UNINTERRUPTIBLE; + pr_debug("%s entered refrigerator\n", current->comm); + printk("="); + current->flags &= ~PF_FREEZE; + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); + + current->flags |= PF_FROZEN; + while (current->flags & PF_FROZEN) + schedule(); + pr_debug("%s left refrigerator\n", current->comm); + current->state = save; +} + +/* 0 = success, else # of processes that we failed to stop */ +int freeze_processes(void) +{ + int todo; + unsigned long start_time; + struct task_struct *g, *p; + + printk( "Stopping tasks: " ); + start_time = jiffies; + do { + todo = 0; + read_lock(&tasklist_lock); + do_each_thread(g, p) { + unsigned long flags; + if (!freezeable(p)) + continue; + if ((p->flags & PF_FROZEN) || + (p->state == TASK_TRACED) || + (p->state == TASK_STOPPED)) + continue; + + /* FIXME: smp problem here: we may not access other process' flags + without locking */ + p->flags |= PF_FREEZE; + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + todo++; + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + yield(); /* Yield is okay here */ + if (time_after(jiffies, start_time + TIMEOUT)) { + printk( "\n" ); + printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); + return todo; + } + } while(todo); + + printk( "|\n" ); + BUG_ON(in_atomic()); + return 0; +} + +void thaw_processes(void) +{ + struct task_struct *g, *p; + + printk( "Restarting tasks..." ); + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (!freezeable(p)) + continue; + if (p->flags & PF_FROZEN) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + } else + printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); + schedule(); + printk( " done\n" ); +} + +EXPORT_SYMBOL(refrigerator); diff --git a/kernel/power/smp.c b/kernel/power/smp.c new file mode 100644 index 00000000000..7fa7f6e2b7f --- /dev/null +++ b/kernel/power/smp.c @@ -0,0 +1,85 @@ +/* + * drivers/power/smp.c - Functions for stopping other CPUs. + * + * Copyright 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz> + * + * This file is released under the GPLv2. + */ + +#undef DEBUG + +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/suspend.h> +#include <linux/module.h> +#include <asm/atomic.h> +#include <asm/tlbflush.h> + +static atomic_t cpu_counter, freeze; + + +static void smp_pause(void * data) +{ + struct saved_context ctxt; + __save_processor_state(&ctxt); + printk("Sleeping in:\n"); + dump_stack(); + atomic_inc(&cpu_counter); + while (atomic_read(&freeze)) { + /* FIXME: restore takes place at random piece inside this. + This should probably be written in assembly, and + preserve general-purpose registers, too + + What about stack? We may need to move to new stack here. + + This should better be ran with interrupts disabled. + */ + cpu_relax(); + barrier(); + } + atomic_dec(&cpu_counter); + __restore_processor_state(&ctxt); +} + +static cpumask_t oldmask; + +void disable_nonboot_cpus(void) +{ + printk("Freezing CPUs (at %d)", smp_processor_id()); + oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(0)); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); + printk("..."); + BUG_ON(smp_processor_id() != 0); + + /* FIXME: for this to work, all the CPUs must be running + * "idle" thread (or we deadlock). Is that guaranteed? */ + + atomic_set(&cpu_counter, 0); + atomic_set(&freeze, 1); + smp_call_function(smp_pause, NULL, 0, 0); + while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { + cpu_relax(); + barrier(); + } + printk("ok\n"); +} + +void enable_nonboot_cpus(void) +{ + printk("Restarting CPUs"); + atomic_set(&freeze, 0); + while (atomic_read(&cpu_counter)) { + cpu_relax(); + barrier(); + } + printk("..."); + set_cpus_allowed(current, oldmask); + schedule(); + printk("ok\n"); + +} + + diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c new file mode 100644 index 00000000000..ae5bebc3b18 --- /dev/null +++ b/kernel/power/swsusp.c @@ -0,0 +1,1433 @@ +/* + * linux/kernel/power/swsusp.c + * + * This file is to realize architecture-independent + * machine suspend feature using pretty near only high-level routines + * + * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> + * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> + * + * This file is released under the GPLv2. + * + * I'd like to thank the following people for their work: + * + * Pavel Machek <pavel@ucw.cz>: + * Modifications, defectiveness pointing, being with me at the very beginning, + * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. + * + * Steve Doddi <dirk@loth.demon.co.uk>: + * Support the possibility of hardware state restoring. + * + * Raph <grey.havens@earthling.net>: + * Support for preserving states of network devices and virtual console + * (including X and svgatextmode) + * + * Kurt Garloff <garloff@suse.de>: + * Straightened the critical function in order to prevent compilers from + * playing tricks with local variables. + * + * Andreas Mohr <a.mohr@mailto.de> + * + * Alex Badea <vampire@go.ro>: + * Fixed runaway init + * + * More state savers are welcome. Especially for the scsi layer... + * + * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/suspend.h> +#include <linux/smp_lock.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <linux/version.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/bitops.h> +#include <linux/vt_kern.h> +#include <linux/kbd_kern.h> +#include <linux/keyboard.h> +#include <linux/spinlock.h> +#include <linux/genhd.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/swap.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/buffer_head.h> +#include <linux/swapops.h> +#include <linux/bootmem.h> +#include <linux/syscalls.h> +#include <linux/console.h> +#include <linux/highmem.h> +#include <linux/bio.h> + +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +#include "power.h" + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* Variables to be preserved over suspend */ +static int nr_copy_pages_check; + +extern char resume_file[]; + +/* Local variables that should not be affected by save */ +unsigned int nr_copy_pages __nosavedata = 0; + +/* Suspend pagedir is allocated before final copy, therefore it + must be freed after resume + + Warning: this is evil. There are actually two pagedirs at time of + resume. One is "pagedir_save", which is empty frame allocated at + time of suspend, that must be freed. Second is "pagedir_nosave", + allocated at time of resume, that travels through memory not to + collide with anything. + + Warning: this is even more evil than it seems. Pagedirs this file + talks about are completely different from page directories used by + MMU hardware. + */ +suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; +static suspend_pagedir_t *pagedir_save; + +#define SWSUSP_SIG "S1SUSPEND" + +static struct swsusp_header { + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + swp_entry_t swsusp_info; + char orig_sig[10]; + char sig[10]; +} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; + +static struct swsusp_info swsusp_info; + +/* + * XXX: We try to keep some more pages free so that I/O operations succeed + * without paging. Might this be more? + */ +#define PAGES_FOR_IO 512 + +/* + * Saving part... + */ + +/* We memorize in swapfile_used what swap devices are used for suspension */ +#define SWAPFILE_UNUSED 0 +#define SWAPFILE_SUSPEND 1 /* This is the suspending device */ +#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ + +static unsigned short swapfile_used[MAX_SWAPFILES]; +static unsigned short root_swap; + +static int mark_swapfiles(swp_entry_t prev) +{ + int error; + + rw_swap_page_sync(READ, + swp_entry(root_swap, 0), + virt_to_page((unsigned long)&swsusp_header)); + if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { + memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); + memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + swsusp_header.swsusp_info = prev; + error = rw_swap_page_sync(WRITE, + swp_entry(root_swap, 0), + virt_to_page((unsigned long) + &swsusp_header)); + } else { + pr_debug("swsusp: Partition is not swap space.\n"); + error = -ENODEV; + } + return error; +} + +/* + * Check whether the swap device is the specified resume + * device, irrespective of whether they are specified by + * identical names. + * + * (Thus, device inode aliasing is allowed. You can say /dev/hda4 + * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] + * and they'll be considered the same device. This is *necessary* for + * devfs, since the resume code can only recognize the form /dev/hda4, + * but the suspend code would see the long name.) + */ +static int is_resume_device(const struct swap_info_struct *swap_info) +{ + struct file *file = swap_info->swap_file; + struct inode *inode = file->f_dentry->d_inode; + + return S_ISBLK(inode->i_mode) && + swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); +} + +static int swsusp_swap_check(void) /* This is called before saving image */ +{ + int i, len; + + len=strlen(resume_file); + root_swap = 0xFFFF; + + swap_list_lock(); + for(i=0; i<MAX_SWAPFILES; i++) { + if (swap_info[i].flags == 0) { + swapfile_used[i]=SWAPFILE_UNUSED; + } else { + if(!len) { + printk(KERN_WARNING "resume= option should be used to set suspend device" ); + if(root_swap == 0xFFFF) { + swapfile_used[i] = SWAPFILE_SUSPEND; + root_swap = i; + } else + swapfile_used[i] = SWAPFILE_IGNORED; + } else { + /* we ignore all swap devices that are not the resume_file */ + if (is_resume_device(&swap_info[i])) { + swapfile_used[i] = SWAPFILE_SUSPEND; + root_swap = i; + } else { + swapfile_used[i] = SWAPFILE_IGNORED; + } + } + } + } + swap_list_unlock(); + return (root_swap != 0xffff) ? 0 : -ENODEV; +} + +/** + * This is called after saving image so modification + * will be lost after resume... and that's what we want. + * we make the device unusable. A new call to + * lock_swapdevices can unlock the devices. + */ +static void lock_swapdevices(void) +{ + int i; + + swap_list_lock(); + for(i = 0; i< MAX_SWAPFILES; i++) + if(swapfile_used[i] == SWAPFILE_IGNORED) { + swap_info[i].flags ^= 0xFF; + } + swap_list_unlock(); +} + +/** + * write_swap_page - Write one page to a fresh swap location. + * @addr: Address we're writing. + * @loc: Place to store the entry we used. + * + * Allocate a new swap entry and 'sync' it. Note we discard -EIO + * errors. That is an artifact left over from swsusp. It did not + * check the return of rw_swap_page_sync() at all, since most pages + * written back to swap would return -EIO. + * This is a partial improvement, since we will at least return other + * errors, though we need to eventually fix the damn code. + */ +static int write_page(unsigned long addr, swp_entry_t * loc) +{ + swp_entry_t entry; + int error = 0; + + entry = get_swap_page(); + if (swp_offset(entry) && + swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { + error = rw_swap_page_sync(WRITE, entry, + virt_to_page(addr)); + if (error == -EIO) + error = 0; + if (!error) + *loc = entry; + } else + error = -ENOSPC; + return error; +} + +/** + * data_free - Free the swap entries used by the saved image. + * + * Walk the list of used swap entries and free each one. + * This is only used for cleanup when suspend fails. + */ +static void data_free(void) +{ + swp_entry_t entry; + int i; + + for (i = 0; i < nr_copy_pages; i++) { + entry = (pagedir_nosave + i)->swap_address; + if (entry.val) + swap_free(entry); + else + break; + (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; + } +} + +/** + * data_write - Write saved image to swap. + * + * Walk the list of pages in the image and sync each one to swap. + */ +static int data_write(void) +{ + int error = 0, i = 0; + unsigned int mod = nr_copy_pages / 100; + struct pbe *p; + + if (!mod) + mod = 1; + + printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); + for_each_pbe(p, pagedir_nosave) { + if (!(i%mod)) + printk( "\b\b\b\b%3d%%", i / mod ); + if ((error = write_page(p->address, &(p->swap_address)))) + return error; + i++; + } + printk("\b\b\b\bdone\n"); + return error; +} + +static void dump_info(void) +{ + pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); + pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); + pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); + pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); + pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); + pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); + pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); + pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); + pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); + pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); + pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); +} + +static void init_header(void) +{ + memset(&swsusp_info, 0, sizeof(swsusp_info)); + swsusp_info.version_code = LINUX_VERSION_CODE; + swsusp_info.num_physpages = num_physpages; + memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); + + swsusp_info.suspend_pagedir = pagedir_nosave; + swsusp_info.cpus = num_online_cpus(); + swsusp_info.image_pages = nr_copy_pages; +} + +static int close_swap(void) +{ + swp_entry_t entry; + int error; + + dump_info(); + error = write_page((unsigned long)&swsusp_info, &entry); + if (!error) { + printk( "S" ); + error = mark_swapfiles(entry); + printk( "|\n" ); + } + return error; +} + +/** + * free_pagedir_entries - Free pages used by the page directory. + * + * This is used during suspend for error recovery. + */ + +static void free_pagedir_entries(void) +{ + int i; + + for (i = 0; i < swsusp_info.pagedir_pages; i++) + swap_free(swsusp_info.pagedir[i]); +} + + +/** + * write_pagedir - Write the array of pages holding the page directory. + * @last: Last swap entry we write (needed for header). + */ + +static int write_pagedir(void) +{ + int error = 0; + unsigned n = 0; + struct pbe * pbe; + + printk( "Writing pagedir..."); + for_each_pb_page(pbe, pagedir_nosave) { + if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) + return error; + } + + swsusp_info.pagedir_pages = n; + printk("done (%u pages)\n", n); + return error; +} + +/** + * write_suspend_image - Write entire image and metadata. + * + */ + +static int write_suspend_image(void) +{ + int error; + + init_header(); + if ((error = data_write())) + goto FreeData; + + if ((error = write_pagedir())) + goto FreePagedir; + + if ((error = close_swap())) + goto FreePagedir; + Done: + return error; + FreePagedir: + free_pagedir_entries(); + FreeData: + data_free(); + goto Done; +} + + +#ifdef CONFIG_HIGHMEM +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. + */ + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); + continue; + } + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} +#endif /* CONFIG_HIGHMEM */ + + +static int save_highmem(void) +{ +#ifdef CONFIG_HIGHMEM + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone(zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } +#endif + return 0; +} + +static int restore_highmem(void) +{ +#ifdef CONFIG_HIGHMEM + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } +#endif + return 0; +} + + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. + */ + +static int saveable(struct zone * zone, unsigned long * zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page * page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) { + pr_debug("[nosave pfn 0x%lx]", pfn); + return 0; + } + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +static void count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + nr_copy_pages = 0; + + for_each_zone(zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + nr_copy_pages += saveable(zone, &zone_pfn); + } +} + + +static void copy_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe * pbe = pagedir_nosave; + + pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); + for_each_zone(zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (long) page_address(page); + /* copy_page is not usable for copying task structs. */ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * calc_nr - Determine the number of pages needed for a pbe list. + */ + +static int calc_nr(int nr_copy) +{ + int extra = 0; + int mod = !!(nr_copy % PBES_PER_PAGE); + int diff = (nr_copy / PBES_PER_PAGE) + mod; + + do { + extra += diff; + nr_copy += diff; + mod = !!(nr_copy % PBES_PER_PAGE); + diff = (nr_copy / PBES_PER_PAGE) + mod - extra; + } while (diff > 0); + + return nr_copy; +} + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +static inline void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) +{ + struct pbe *pbpage, *p; + unsigned num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. + */ + +static struct pbe * alloc_pagedir(unsigned nr_pages) +{ + unsigned num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + return pblist; +} + +/** + * free_image_pages - Free pages allocated for snapshot + */ + +static void free_image_pages(void) +{ + struct pbe * p; + + for_each_pbe(p, pagedir_save) { + if (p->address) { + ClearPageNosave(virt_to_page(p->address)); + free_page(p->address); + p->address = 0; + } + } +} + +/** + * alloc_image_pages - Allocate pages for the snapshot. + */ + +static int alloc_image_pages(void) +{ + struct pbe * p; + + for_each_pbe(p, pagedir_save) { + p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (!p->address) + return -ENOMEM; + SetPageNosave(virt_to_page(p->address)); + } + return 0; +} + +void swsusp_free(void) +{ + BUG_ON(PageNosave(virt_to_page(pagedir_save))); + BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); + free_image_pages(); + free_pagedir(pagedir_save); +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. + */ + +static int enough_free_mem(void) +{ + if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { + pr_debug("swsusp: Not enough free pages: Have %d\n", + nr_free_pages()); + return 0; + } + return 1; +} + + +/** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space avaiable. + * + * FIXME: si_swapinfo(&i) returns all swap devices information. + * We should only consider resume_device. + */ + +static int enough_swap(void) +{ + struct sysinfo i; + + si_swapinfo(&i); + if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { + pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); + return 0; + } + return 1; +} + +static int swsusp_alloc(void) +{ + int error; + + pr_debug("suspend: (pages needed: %d + %d free: %d)\n", + nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); + + pagedir_nosave = NULL; + if (!enough_free_mem()) + return -ENOMEM; + + if (!enough_swap()) + return -ENOSPC; + + nr_copy_pages = calc_nr(nr_copy_pages); + + if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return -ENOMEM; + } + create_pbe_list(pagedir_save, nr_copy_pages); + pagedir_nosave = pagedir_save; + if ((error = alloc_image_pages())) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return error; + } + + nr_copy_pages_check = nr_copy_pages; + return 0; +} + +static int suspend_prepare_image(void) +{ + int error; + + pr_debug("swsusp: critical section: \n"); + if (save_highmem()) { + printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); + restore_highmem(); + return -ENOMEM; + } + + drain_local_pages(); + count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + + error = swsusp_alloc(); + if (error) + return error; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. + */ + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); + return 0; +} + + +/* It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) + */ +int swsusp_write(void) +{ + int error; + device_resume(); + lock_swapdevices(); + error = write_suspend_image(); + /* This will unlock ignored swap devices since writing is finished */ + lock_swapdevices(); + return error; + +} + + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); + + +asmlinkage int swsusp_save(void) +{ + int error = 0; + + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " + "swapon -a!\n"); + return error; + } + return suspend_prepare_image(); +} + +int swsusp_suspend(void) +{ + int error; + if ((error = arch_prepare_suspend())) + return error; + local_irq_disable(); + /* At this point, device_suspend() has been called, but *not* + * device_power_down(). We *must* device_power_down() now. + * Otherwise, drivers for some devices (e.g. interrupt controllers) + * become desynchronized with the actual state of the hardware + * at resume time, and evil weirdness ensues. + */ + if ((error = device_power_down(PMSG_FREEZE))) { + printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); + local_irq_enable(); + swsusp_free(); + return error; + } + save_processor_state(); + if ((error = swsusp_arch_suspend())) + swsusp_free(); + /* Restore control flow magically appears here */ + restore_processor_state(); + BUG_ON (nr_copy_pages_check != nr_copy_pages); + restore_highmem(); + device_power_up(); + local_irq_enable(); + return error; +} + +int swsusp_resume(void) +{ + int error; + local_irq_disable(); + if (device_power_down(PMSG_FREEZE)) + printk(KERN_ERR "Some devices failed to power down, very bad\n"); + /* We'll ignore saved state, but this gets preempt count (etc) right */ + save_processor_state(); + error = swsusp_arch_resume(); + /* Code below is only ever reached in case of failure. Otherwise + * execution continues at place where swsusp_arch_suspend was called + */ + BUG_ON(!error); + restore_processor_state(); + restore_highmem(); + device_power_up(); + local_irq_enable(); + return error; +} + +/* More restore stuff */ + +/* + * Returns true if given address/order collides with any orig_address + */ +static int does_collide_order(unsigned long addr, int order) +{ + int i; + + for (i=0; i < (1<<order); i++) + if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE))) + return 1; + return 0; +} + +/** + * On resume, for storing the PBE list and the image, + * we can only use memory pages that do not conflict with the pages + * which had been used before suspend. + * + * We don't know which pages are usable until we allocate them. + * + * Allocated but unusable (ie eaten) memory pages are linked together + * to create a list, so that we can free them easily + * + * We could have used a type other than (void *) + * for this purpose, but ... + */ +static void **eaten_memory = NULL; + +static inline void eat_page(void *page) +{ + void **c; + + c = eaten_memory; + eaten_memory = page; + *eaten_memory = c; +} + +static unsigned long get_usable_page(unsigned gfp_mask) +{ + unsigned long m; + + m = get_zeroed_page(gfp_mask); + while (does_collide_order(m, 0)) { + eat_page((void *)m); + m = get_zeroed_page(gfp_mask); + if (!m) + break; + } + return m; +} + +static void free_eaten_memory(void) +{ + unsigned long m; + void **c; + int i = 0; + + c = eaten_memory; + while (c) { + m = (unsigned long)c; + c = *c; + free_page(m); + i++; + } + eaten_memory = NULL; + pr_debug("swsusp: %d unused pages freed\n", i); +} + +/** + * check_pagedir - We ensure here that pages that the PBEs point to + * won't collide with pages where we're going to restore from the loaded + * pages later + */ + +static int check_pagedir(struct pbe *pblist) +{ + struct pbe *p; + + /* This is necessary, so that we can free allocated pages + * in case of failure + */ + for_each_pbe (p, pblist) + p->address = 0UL; + + for_each_pbe (p, pblist) { + p->address = get_usable_page(GFP_ATOMIC); + if (!p->address) + return -ENOMEM; + } + return 0; +} + +/** + * swsusp_pagedir_relocate - It is possible, that some memory pages + * occupied by the list of PBEs collide with pages where we're going to + * restore from the loaded pages later. We relocate them here. + */ + +static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbpage, *tail, *p; + void *m; + int rel = 0, error = 0; + + if (!pblist) /* a sanity check */ + return NULL; + + pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", + swsusp_info.pagedir_pages); + + /* Set page flags */ + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + SetPageNosaveFree(pfn_to_page(zone_pfn + + zone->zone_start_pfn)); + } + + /* Clear orig addresses */ + + for_each_pbe (p, pblist) + ClearPageNosaveFree(virt_to_page(p->orig_address)); + + tail = pblist + PB_PAGE_SKIP; + + /* Relocate colliding pages */ + + for_each_pb_page (pbpage, pblist) { + if (does_collide_order((unsigned long)pbpage, 0)) { + m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); + if (!m) { + error = -ENOMEM; + break; + } + memcpy(m, (void *)pbpage, PAGE_SIZE); + if (pbpage == pblist) + pblist = (struct pbe *)m; + else + tail->next = (struct pbe *)m; + + eat_page((void *)pbpage); + pbpage = (struct pbe *)m; + + /* We have to link the PBEs again */ + + for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) + if (p->next) /* needed to save the end */ + p->next = p + 1; + + rel++; + } + tail = pbpage + PB_PAGE_SKIP; + } + + if (error) { + printk("\nswsusp: Out of memory\n\n"); + free_pagedir(pblist); + free_eaten_memory(); + pblist = NULL; + } + else + printk("swsusp: Relocated %d pages\n", rel); + + return pblist; +} + +/** + * Using bio to read from swap. + * This code requires a bit more work than just using buffer heads + * but, it is the recommended way for 2.5/2.6. + * The following are to signal the beginning and end of I/O. Bios + * finish asynchronously, while we want them to happen synchronously. + * A simple atomic_t, and a wait loop take care of this problem. + */ + +static atomic_t io_done = ATOMIC_INIT(0); + +static int end_io(struct bio * bio, unsigned int num, int err) +{ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + panic("I/O error reading memory image"); + atomic_set(&io_done, 0); + return 0; +} + +static struct block_device * resume_bdev; + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * + * Straight from the textbook - allocate and initialize the bio. + * If we're writing, make sure the page is marked as dirty. + * Then submit it and wait. + */ + +static int submit(int rw, pgoff_t page_off, void * page) +{ + int error = 0; + struct bio * bio; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + bio->bi_sector = page_off * (PAGE_SIZE >> 9); + bio_get(bio); + bio->bi_bdev = resume_bdev; + bio->bi_end_io = end_io; + + if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); + error = -EFAULT; + goto Done; + } + + if (rw == WRITE) + bio_set_pages_dirty(bio); + + atomic_set(&io_done, 1); + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + while (atomic_read(&io_done)) + yield(); + + Done: + bio_put(bio); + return error; +} + +static int bio_read_page(pgoff_t page_off, void * page) +{ + return submit(READ, page_off, page); +} + +static int bio_write_page(pgoff_t page_off, void * page) +{ + return submit(WRITE, page_off, page); +} + +/* + * Sanity check if this image makes sense with this kernel/swap context + * I really don't think that it's foolproof but more than nothing.. + */ + +static const char * sanity_check(void) +{ + dump_info(); + if(swsusp_info.version_code != LINUX_VERSION_CODE) + return "kernel version"; + if(swsusp_info.num_physpages != num_physpages) + return "memory size"; + if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) + return "system type"; + if (strcmp(swsusp_info.uts.release,system_utsname.release)) + return "kernel release"; + if (strcmp(swsusp_info.uts.version,system_utsname.version)) + return "version"; + if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) + return "machine"; + if(swsusp_info.cpus != num_online_cpus()) + return "number of cpus"; + return NULL; +} + + +static int check_header(void) +{ + const char * reason = NULL; + int error; + + if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) + return error; + + /* Is this same machine? */ + if ((reason = sanity_check())) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); + return -EPERM; + } + nr_copy_pages = swsusp_info.image_pages; + return error; +} + +static int check_sig(void) +{ + int error; + + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + + /* + * Reset swap signature now. + */ + error = bio_write_page(0, &swsusp_header); + } else { + printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); + return -EINVAL; + } + if (!error) + pr_debug("swsusp: Signature found, resuming\n"); + return error; +} + +/** + * data_read - Read image pages from swap. + * + * You do not need to check for overlaps, check_pagedir() + * already did that. + */ + +static int data_read(struct pbe *pblist) +{ + struct pbe * p; + int error = 0; + int i = 0; + int mod = swsusp_info.image_pages / 100; + + if (!mod) + mod = 1; + + printk("swsusp: Reading image data (%lu pages): ", + swsusp_info.image_pages); + + for_each_pbe (p, pblist) { + if (!(i % mod)) + printk("\b\b\b\b%3d%%", i / mod); + + error = bio_read_page(swp_offset(p->swap_address), + (void *)p->address); + if (error) + return error; + + i++; + } + printk("\b\b\b\bdone\n"); + return error; +} + +extern dev_t name_to_dev_t(const char *line); + +/** + * read_pagedir - Read page backup list pages from swap + */ + +static int read_pagedir(struct pbe *pblist) +{ + struct pbe *pbpage, *p; + unsigned i = 0; + int error; + + if (!pblist) + return -EFAULT; + + printk("swsusp: Reading pagedir (%lu pages)\n", + swsusp_info.pagedir_pages); + + for_each_pb_page (pbpage, pblist) { + unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); + + error = -EFAULT; + if (offset) { + p = (pbpage + PB_PAGE_SKIP)->next; + error = bio_read_page(offset, (void *)pbpage); + (pbpage + PB_PAGE_SKIP)->next = p; + } + if (error) + break; + } + + if (error) + free_page((unsigned long)pblist); + + BUG_ON(i != swsusp_info.pagedir_pages); + + return error; +} + + +static int check_suspend_image(void) +{ + int error = 0; + + if ((error = check_sig())) + return error; + + if ((error = check_header())) + return error; + + return 0; +} + +static int read_suspend_image(void) +{ + int error = 0; + struct pbe *p; + + if (!(p = alloc_pagedir(nr_copy_pages))) + return -ENOMEM; + + if ((error = read_pagedir(p))) + return error; + + create_pbe_list(p, nr_copy_pages); + + if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) + return -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + + error = check_pagedir(pagedir_nosave); + free_eaten_memory(); + if (!error) + error = data_read(pagedir_nosave); + + if (error) { /* We fail cleanly */ + for_each_pbe (p, pagedir_nosave) + if (p->address) { + free_page(p->address); + p->address = 0UL; + } + free_pagedir(pagedir_nosave); + } + return error; +} + +/** + * swsusp_check - Check for saved image in swap + */ + +int swsusp_check(void) +{ + int error; + + if (!swsusp_resume_device) { + if (!strlen(resume_file)) + return -ENOENT; + swsusp_resume_device = name_to_dev_t(resume_file); + pr_debug("swsusp: Resume From Partition %s\n", resume_file); + } else { + pr_debug("swsusp: Resume From Partition %d:%d\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + } + + resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(resume_bdev)) { + set_blocksize(resume_bdev, PAGE_SIZE); + error = check_suspend_image(); + if (error) + blkdev_put(resume_bdev); + } else + error = PTR_ERR(resume_bdev); + + if (!error) + pr_debug("swsusp: resume file found\n"); + else + pr_debug("swsusp: Error %d check for resume file\n", error); + return error; +} + +/** + * swsusp_read - Read saved image from swap. + */ + +int swsusp_read(void) +{ + int error; + + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + + error = read_suspend_image(); + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); + return error; +} + +/** + * swsusp_close - close swap device. + */ + +void swsusp_close(void) +{ + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return; + } + + blkdev_put(resume_bdev); +} diff --git a/kernel/printk.c b/kernel/printk.c new file mode 100644 index 00000000000..1498689548d --- /dev/null +++ b/kernel/printk.c @@ -0,0 +1,996 @@ +/* + * linux/kernel/printk.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Modified to make sys_syslog() more flexible: added commands to + * return the last 4k of kernel messages, regardless of whether + * they've been read or not. Added option to suppress kernel printk's + * to the console. Added hook for sending the console messages + * elsewhere, in preparation for a serial line console (someday). + * Ted Ts'o, 2/11/93. + * Modified for sysctl support, 1/8/97, Chris Horn. + * Fixed SMP synchronization, 08/08/99, Manfred Spraul + * manfreds@colorfullife.com + * Rewrote bits to get rid of console_lock + * 01Mar01 Andrew Morton <andrewm@uow.edu.au> + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/tty.h> +#include <linux/tty_driver.h> +#include <linux/smp_lock.h> +#include <linux/console.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/interrupt.h> /* For in_interrupt() */ +#include <linux/config.h> +#include <linux/delay.h> +#include <linux/smp.h> +#include <linux/security.h> +#include <linux/bootmem.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> + +#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) + +/* printk's without a loglevel use this.. */ +#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ + +/* We show everything that is MORE important than this.. */ +#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ +#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ + +DECLARE_WAIT_QUEUE_HEAD(log_wait); + +int console_printk[4] = { + DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ + DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ +}; + +EXPORT_SYMBOL(console_printk); + +/* + * Low lever drivers may need that to know if they can schedule in + * their unblank() callback or not. So let's export it. + */ +int oops_in_progress; +EXPORT_SYMBOL(oops_in_progress); + +/* + * console_sem protects the console_drivers list, and also + * provides serialisation for access to the entire console + * driver system. + */ +static DECLARE_MUTEX(console_sem); +struct console *console_drivers; +/* + * This is used for debugging the mess that is the VT code by + * keeping track if we have the console semaphore held. It's + * definitely not the perfect debug tool (we don't know if _WE_ + * hold it are racing, but it helps tracking those weird code + * path in the console code where we end up in places I want + * locked without the console sempahore held + */ +static int console_locked; + +/* + * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars + * It is also used in interesting ways to provide interlocking in + * release_console_sem(). + */ +static DEFINE_SPINLOCK(logbuf_lock); + +static char __log_buf[__LOG_BUF_LEN]; +static char *log_buf = __log_buf; +static int log_buf_len = __LOG_BUF_LEN; + +#define LOG_BUF_MASK (log_buf_len-1) +#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) + +/* + * The indices into log_buf are not constrained to log_buf_len - they + * must be masked before subscripting + */ +static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */ +static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */ +static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */ +static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ + +/* + * Array of consoles built from command line options (console=) + */ +struct console_cmdline +{ + char name[8]; /* Name of the driver */ + int index; /* Minor dev. to use */ + char *options; /* Options for the driver */ +}; + +#define MAX_CMDLINECONSOLES 8 + +static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; +static int selected_console = -1; +static int preferred_console = -1; + +/* Flag: console code may call schedule() */ +static int console_may_schedule; + +/* + * Setup a list of consoles. Called from init/main.c + */ +static int __init console_setup(char *str) +{ + char name[sizeof(console_cmdline[0].name)]; + char *s, *options; + int idx; + + /* + * Decode str into name, index, options. + */ + if (str[0] >= '0' && str[0] <= '9') { + strcpy(name, "ttyS"); + strncpy(name + 4, str, sizeof(name) - 5); + } else + strncpy(name, str, sizeof(name) - 1); + name[sizeof(name) - 1] = 0; + if ((options = strchr(str, ',')) != NULL) + *(options++) = 0; +#ifdef __sparc__ + if (!strcmp(str, "ttya")) + strcpy(name, "ttyS0"); + if (!strcmp(str, "ttyb")) + strcpy(name, "ttyS1"); +#endif + for(s = name; *s; s++) + if ((*s >= '0' && *s <= '9') || *s == ',') + break; + idx = simple_strtoul(s, NULL, 10); + *s = 0; + + add_preferred_console(name, idx, options); + return 1; +} + +__setup("console=", console_setup); + +/** + * add_preferred_console - add a device to the list of preferred consoles. + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. + */ +int __init add_preferred_console(char *name, int idx, char *options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. + */ + for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + selected_console = i; + c = &console_cmdline[i]; + memcpy(c->name, name, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx; + return 0; +} + +static int __init log_buf_len_setup(char *str) +{ + unsigned long size = memparse(str, &str); + unsigned long flags; + + if (size) + size = roundup_pow_of_two(size); + if (size > log_buf_len) { + unsigned long start, dest_idx, offset; + char * new_log_buf; + + new_log_buf = alloc_bootmem(size); + if (!new_log_buf) { + printk("log_buf_len: allocation failed\n"); + goto out; + } + + spin_lock_irqsave(&logbuf_lock, flags); + log_buf_len = size; + log_buf = new_log_buf; + + offset = start = min(con_start, log_start); + dest_idx = 0; + while (start != log_end) { + log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; + start++; + dest_idx++; + } + log_start -= offset; + con_start -= offset; + log_end -= offset; + spin_unlock_irqrestore(&logbuf_lock, flags); + + printk("log_buf_len: %d\n", log_buf_len); + } +out: + + return 1; +} + +__setup("log_buf_len=", log_buf_len_setup); + +/* + * Commands to do_syslog: + * + * 0 -- Close the log. Currently a NOP. + * 1 -- Open the log. Currently a NOP. + * 2 -- Read from the log. + * 3 -- Read all messages remaining in the ring buffer. + * 4 -- Read and clear all messages remaining in the ring buffer + * 5 -- Clear ring buffer. + * 6 -- Disable printk's to console + * 7 -- Enable printk's to console + * 8 -- Set level of messages printed to console + * 9 -- Return number of unread characters in the log buffer + * 10 -- Return size of the log buffer + */ +int do_syslog(int type, char __user * buf, int len) +{ + unsigned long i, j, limit, count; + int do_clear = 0; + char c; + int error = 0; + + error = security_syslog(type); + if (error) + return error; + + switch (type) { + case 0: /* Close log */ + break; + case 1: /* Open log */ + break; + case 2: /* Read from log */ + error = -EINVAL; + if (!buf || len < 0) + goto out; + error = 0; + if (!len) + goto out; + if (!access_ok(VERIFY_WRITE, buf, len)) { + error = -EFAULT; + goto out; + } + error = wait_event_interruptible(log_wait, (log_start - log_end)); + if (error) + goto out; + i = 0; + spin_lock_irq(&logbuf_lock); + while (!error && (log_start != log_end) && i < len) { + c = LOG_BUF(log_start); + log_start++; + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,buf); + buf++; + i++; + cond_resched(); + spin_lock_irq(&logbuf_lock); + } + spin_unlock_irq(&logbuf_lock); + if (!error) + error = i; + break; + case 4: /* Read/clear last kernel messages */ + do_clear = 1; + /* FALL THRU */ + case 3: /* Read last kernel messages */ + error = -EINVAL; + if (!buf || len < 0) + goto out; + error = 0; + if (!len) + goto out; + if (!access_ok(VERIFY_WRITE, buf, len)) { + error = -EFAULT; + goto out; + } + count = len; + if (count > log_buf_len) + count = log_buf_len; + spin_lock_irq(&logbuf_lock); + if (count > logged_chars) + count = logged_chars; + if (do_clear) + logged_chars = 0; + limit = log_end; + /* + * __put_user() could sleep, and while we sleep + * printk() could overwrite the messages + * we try to copy to user space. Therefore + * the messages are copied in reverse. <manfreds> + */ + for(i = 0; i < count && !error; i++) { + j = limit-1-i; + if (j + log_buf_len < log_end) + break; + c = LOG_BUF(j); + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,&buf[count-1-i]); + cond_resched(); + spin_lock_irq(&logbuf_lock); + } + spin_unlock_irq(&logbuf_lock); + if (error) + break; + error = i; + if(i != count) { + int offset = count-error; + /* buffer overflow during copy, correct user buffer. */ + for(i=0;i<error;i++) { + if (__get_user(c,&buf[i+offset]) || + __put_user(c,&buf[i])) { + error = -EFAULT; + break; + } + cond_resched(); + } + } + break; + case 5: /* Clear ring buffer */ + logged_chars = 0; + break; + case 6: /* Disable logging to console */ + console_loglevel = minimum_console_loglevel; + break; + case 7: /* Enable logging to console */ + console_loglevel = default_console_loglevel; + break; + case 8: /* Set level of messages printed to console */ + error = -EINVAL; + if (len < 1 || len > 8) + goto out; + if (len < minimum_console_loglevel) + len = minimum_console_loglevel; + console_loglevel = len; + error = 0; + break; + case 9: /* Number of chars in the log buffer */ + error = log_end - log_start; + break; + case 10: /* Size of the log buffer */ + error = log_buf_len; + break; + default: + error = -EINVAL; + break; + } +out: + return error; +} + +asmlinkage long sys_syslog(int type, char __user * buf, int len) +{ + return do_syslog(type, buf, len); +} + +/* + * Call the console drivers on a range of log_buf + */ +static void __call_console_drivers(unsigned long start, unsigned long end) +{ + struct console *con; + + for (con = console_drivers; con; con = con->next) { + if ((con->flags & CON_ENABLED) && con->write) + con->write(con, &LOG_BUF(start), end - start); + } +} + +/* + * Write out chars from start to end - 1 inclusive + */ +static void _call_console_drivers(unsigned long start, + unsigned long end, int msg_log_level) +{ + if (msg_log_level < console_loglevel && + console_drivers && start != end) { + if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { + /* wrapped write */ + __call_console_drivers(start & LOG_BUF_MASK, + log_buf_len); + __call_console_drivers(0, end & LOG_BUF_MASK); + } else { + __call_console_drivers(start, end); + } + } +} + +/* + * Call the console drivers, asking them to write out + * log_buf[start] to log_buf[end - 1]. + * The console_sem must be held. + */ +static void call_console_drivers(unsigned long start, unsigned long end) +{ + unsigned long cur_index, start_print; + static int msg_level = -1; + + if (((long)(start - end)) > 0) + BUG(); + + cur_index = start; + start_print = start; + while (cur_index != end) { + if ( msg_level < 0 && + ((end - cur_index) > 2) && + LOG_BUF(cur_index + 0) == '<' && + LOG_BUF(cur_index + 1) >= '0' && + LOG_BUF(cur_index + 1) <= '7' && + LOG_BUF(cur_index + 2) == '>') + { + msg_level = LOG_BUF(cur_index + 1) - '0'; + cur_index += 3; + start_print = cur_index; + } + while (cur_index != end) { + char c = LOG_BUF(cur_index); + cur_index++; + + if (c == '\n') { + if (msg_level < 0) { + /* + * printk() has already given us loglevel tags in + * the buffer. This code is here in case the + * log buffer has wrapped right round and scribbled + * on those tags + */ + msg_level = default_message_loglevel; + } + _call_console_drivers(start_print, cur_index, msg_level); + msg_level = -1; + start_print = cur_index; + break; + } + } + } + _call_console_drivers(start_print, end, msg_level); +} + +static void emit_log_char(char c) +{ + LOG_BUF(log_end) = c; + log_end++; + if (log_end - log_start > log_buf_len) + log_start = log_end - log_buf_len; + if (log_end - con_start > log_buf_len) + con_start = log_end - log_buf_len; + if (logged_chars < log_buf_len) + logged_chars++; +} + +/* + * Zap console related locks when oopsing. Only zap at most once + * every 10 seconds, to leave time for slow consoles to print a + * full oops. + */ +static void zap_locks(void) +{ + static unsigned long oops_timestamp; + + if (time_after_eq(jiffies, oops_timestamp) && + !time_after(jiffies, oops_timestamp + 30*HZ)) + return; + + oops_timestamp = jiffies; + + /* If a crash is occurring, make sure we can't deadlock */ + spin_lock_init(&logbuf_lock); + /* And make sure that we print immediately */ + init_MUTEX(&console_sem); +} + +#if defined(CONFIG_PRINTK_TIME) +static int printk_time = 1; +#else +static int printk_time = 0; +#endif + +static int __init printk_time_setup(char *str) +{ + if (*str) + return 0; + printk_time = 1; + return 1; +} + +__setup("time", printk_time_setup); + +/* + * This is printk. It can be called from any context. We want it to work. + * + * We try to grab the console_sem. If we succeed, it's easy - we log the output and + * call the console drivers. If we fail to get the semaphore we place the output + * into the log buffer and return. The current holder of the console_sem will + * notice the new output in release_console_sem() and will send it to the + * consoles before releasing the semaphore. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + */ +asmlinkage int printk(const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk(fmt, args); + va_end(args); + + return r; +} + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + unsigned long flags; + int printed_len; + char *p; + static char printk_buf[1024]; + static int log_level_unknown = 1; + + if (unlikely(oops_in_progress)) + zap_locks(); + + /* This stops the holder of console_sem just where we want him */ + spin_lock_irqsave(&logbuf_lock, flags); + + /* Emit the output into the temporary buffer */ + printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); + + /* + * Copy the output into log_buf. If the caller didn't provide + * appropriate log level tags, we insert them here + */ + for (p = printk_buf; *p; p++) { + if (log_level_unknown) { + /* log_level_unknown signals the start of a new line */ + if (printk_time) { + int loglev_char; + char tbuf[50], *tp; + unsigned tlen; + unsigned long long t; + unsigned long nanosec_rem; + + /* + * force the log level token to be + * before the time output. + */ + if (p[0] == '<' && p[1] >='0' && + p[1] <= '7' && p[2] == '>') { + loglev_char = p[1]; + p += 3; + printed_len += 3; + } else { + loglev_char = default_message_loglevel + + '0'; + } + t = sched_clock(); + nanosec_rem = do_div(t, 1000000000); + tlen = sprintf(tbuf, + "<%c>[%5lu.%06lu] ", + loglev_char, + (unsigned long)t, + nanosec_rem/1000); + + for (tp = tbuf; tp < tbuf + tlen; tp++) + emit_log_char(*tp); + printed_len += tlen - 3; + } else { + if (p[0] != '<' || p[1] < '0' || + p[1] > '7' || p[2] != '>') { + emit_log_char('<'); + emit_log_char(default_message_loglevel + + '0'); + emit_log_char('>'); + } + printed_len += 3; + } + log_level_unknown = 0; + if (!*p) + break; + } + emit_log_char(*p); + if (*p == '\n') + log_level_unknown = 1; + } + + if (!cpu_online(smp_processor_id()) && + system_state != SYSTEM_RUNNING) { + /* + * Some console drivers may assume that per-cpu resources have + * been allocated. So don't allow them to be called by this + * CPU until it is officially up. We shouldn't be calling into + * random console drivers on a CPU which doesn't exist yet.. + */ + spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + if (!down_trylock(&console_sem)) { + console_locked = 1; + /* + * We own the drivers. We can drop the spinlock and let + * release_console_sem() print the text + */ + spin_unlock_irqrestore(&logbuf_lock, flags); + console_may_schedule = 0; + release_console_sem(); + } else { + /* + * Someone else owns the drivers. We drop the spinlock, which + * allows the semaphore holder to proceed and to call the + * console drivers with the output which we just produced. + */ + spin_unlock_irqrestore(&logbuf_lock, flags); + } +out: + return printed_len; +} +EXPORT_SYMBOL(printk); +EXPORT_SYMBOL(vprintk); + +/** + * acquire_console_sem - lock the console system for exclusive use. + * + * Acquires a semaphore which guarantees that the caller has + * exclusive access to the console system and the console_drivers list. + * + * Can sleep, returns nothing. + */ +void acquire_console_sem(void) +{ + if (in_interrupt()) + BUG(); + down(&console_sem); + console_locked = 1; + console_may_schedule = 1; +} +EXPORT_SYMBOL(acquire_console_sem); + +int try_acquire_console_sem(void) +{ + if (down_trylock(&console_sem)) + return -1; + console_locked = 1; + console_may_schedule = 0; + return 0; +} +EXPORT_SYMBOL(try_acquire_console_sem); + +int is_console_locked(void) +{ + return console_locked; +} +EXPORT_SYMBOL(is_console_locked); + +/** + * release_console_sem - unlock the console system + * + * Releases the semaphore which the caller holds on the console system + * and the console driver list. + * + * While the semaphore was held, console output may have been buffered + * by printk(). If this is the case, release_console_sem() emits + * the output prior to releasing the semaphore. + * + * If there is output waiting for klogd, we wake it up. + * + * release_console_sem() may be called from any context. + */ +void release_console_sem(void) +{ + unsigned long flags; + unsigned long _con_start, _log_end; + unsigned long wake_klogd = 0; + + for ( ; ; ) { + spin_lock_irqsave(&logbuf_lock, flags); + wake_klogd |= log_start - log_end; + if (con_start == log_end) + break; /* Nothing to print */ + _con_start = con_start; + _log_end = log_end; + con_start = log_end; /* Flush */ + spin_unlock(&logbuf_lock); + call_console_drivers(_con_start, _log_end); + local_irq_restore(flags); + } + console_locked = 0; + console_may_schedule = 0; + up(&console_sem); + spin_unlock_irqrestore(&logbuf_lock, flags); + if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) + wake_up_interruptible(&log_wait); +} +EXPORT_SYMBOL(release_console_sem); + +/** console_conditional_schedule - yield the CPU if required + * + * If the console code is currently allowed to sleep, and + * if this CPU should yield the CPU to another task, do + * so here. + * + * Must be called within acquire_console_sem(). + */ +void __sched console_conditional_schedule(void) +{ + if (console_may_schedule) + cond_resched(); +} +EXPORT_SYMBOL(console_conditional_schedule); + +void console_print(const char *s) +{ + printk(KERN_EMERG "%s", s); +} +EXPORT_SYMBOL(console_print); + +void console_unblank(void) +{ + struct console *c; + + /* + * console_unblank can no longer be called in interrupt context unless + * oops_in_progress is set to 1.. + */ + if (oops_in_progress) { + if (down_trylock(&console_sem) != 0) + return; + } else + acquire_console_sem(); + + console_locked = 1; + console_may_schedule = 0; + for (c = console_drivers; c != NULL; c = c->next) + if ((c->flags & CON_ENABLED) && c->unblank) + c->unblank(); + release_console_sem(); +} +EXPORT_SYMBOL(console_unblank); + +/* + * Return the console tty driver structure and its associated index + */ +struct tty_driver *console_device(int *index) +{ + struct console *c; + struct tty_driver *driver = NULL; + + acquire_console_sem(); + for (c = console_drivers; c != NULL; c = c->next) { + if (!c->device) + continue; + driver = c->device(c, index); + if (driver) + break; + } + release_console_sem(); + return driver; +} + +/* + * Prevent further output on the passed console device so that (for example) + * serial drivers can disable console output before suspending a port, and can + * re-enable output afterwards. + */ +void console_stop(struct console *console) +{ + acquire_console_sem(); + console->flags &= ~CON_ENABLED; + release_console_sem(); +} +EXPORT_SYMBOL(console_stop); + +void console_start(struct console *console) +{ + acquire_console_sem(); + console->flags |= CON_ENABLED; + release_console_sem(); +} +EXPORT_SYMBOL(console_start); + +/* + * The console driver calls this routine during kernel initialization + * to register the console printing procedure with printk() and to + * print any messages that were printed by the kernel before the + * console driver was initialized. + */ +void register_console(struct console * console) +{ + int i; + unsigned long flags; + + if (preferred_console < 0) + preferred_console = selected_console; + + /* + * See if we want to use this console driver. If we + * didn't select a console we take the first one + * that registers here. + */ + if (preferred_console < 0) { + if (console->index < 0) + console->index = 0; + if (console->setup == NULL || + console->setup(console, NULL) == 0) { + console->flags |= CON_ENABLED | CON_CONSDEV; + preferred_console = 0; + } + } + + /* + * See if this console matches one we selected on + * the command line. + */ + for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { + if (strcmp(console_cmdline[i].name, console->name) != 0) + continue; + if (console->index >= 0 && + console->index != console_cmdline[i].index) + continue; + if (console->index < 0) + console->index = console_cmdline[i].index; + if (console->setup && + console->setup(console, console_cmdline[i].options) != 0) + break; + console->flags |= CON_ENABLED; + console->index = console_cmdline[i].index; + if (i == preferred_console) + console->flags |= CON_CONSDEV; + break; + } + + if (!(console->flags & CON_ENABLED)) + return; + + if (console_drivers && (console_drivers->flags & CON_BOOT)) { + unregister_console(console_drivers); + console->flags &= ~CON_PRINTBUFFER; + } + + /* + * Put this console in the list - keep the + * preferred driver at the head of the list. + */ + acquire_console_sem(); + if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { + console->next = console_drivers; + console_drivers = console; + } else { + console->next = console_drivers->next; + console_drivers->next = console; + } + if (console->flags & CON_PRINTBUFFER) { + /* + * release_console_sem() will print out the buffered messages + * for us. + */ + spin_lock_irqsave(&logbuf_lock, flags); + con_start = log_start; + spin_unlock_irqrestore(&logbuf_lock, flags); + } + release_console_sem(); +} +EXPORT_SYMBOL(register_console); + +int unregister_console(struct console * console) +{ + struct console *a,*b; + int res = 1; + + acquire_console_sem(); + if (console_drivers == console) { + console_drivers=console->next; + res = 0; + } else { + for (a=console_drivers->next, b=console_drivers ; + a; b=a, a=b->next) { + if (a == console) { + b->next = a->next; + res = 0; + break; + } + } + } + + /* If last console is removed, we re-enable picking the first + * one that gets registered. Without that, pmac early boot console + * would prevent fbcon from taking over. + */ + if (console_drivers == NULL) + preferred_console = selected_console; + + + release_console_sem(); + return res; +} +EXPORT_SYMBOL(unregister_console); + +/** + * tty_write_message - write a message to a certain tty, not just the console. + * + * This is used for messages that need to be redirected to a specific tty. + * We don't put it into the syslog queue right now maybe in the future if + * really needed. + */ +void tty_write_message(struct tty_struct *tty, char *msg) +{ + if (tty && tty->driver->write) + tty->driver->write(tty, msg, strlen(msg)); + return; +} + +/* + * printk rate limiting, lifted from the networking subsystem. + * + * This enforces a rate limit: not more than one kernel message + * every printk_ratelimit_jiffies to make a denial-of-service + * attack impossible. + */ +int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) +{ + static DEFINE_SPINLOCK(ratelimit_lock); + static unsigned long toks = 10*5*HZ; + static unsigned long last_msg; + static int missed; + unsigned long flags; + unsigned long now = jiffies; + + spin_lock_irqsave(&ratelimit_lock, flags); + toks += now - last_msg; + last_msg = now; + if (toks > (ratelimit_burst * ratelimit_jiffies)) + toks = ratelimit_burst * ratelimit_jiffies; + if (toks >= ratelimit_jiffies) { + int lost = missed; + missed = 0; + toks -= ratelimit_jiffies; + spin_unlock_irqrestore(&ratelimit_lock, flags); + if (lost) + printk(KERN_WARNING "printk: %d messages suppressed.\n", lost); + return 1; + } + missed++; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; +} +EXPORT_SYMBOL(__printk_ratelimit); + +/* minimum time in jiffies between messages */ +int printk_ratelimit_jiffies = 5*HZ; + +/* number of messages we send before ratelimiting */ +int printk_ratelimit_burst = 10; + +int printk_ratelimit(void) +{ + return __printk_ratelimit(printk_ratelimit_jiffies, + printk_ratelimit_burst); +} +EXPORT_SYMBOL(printk_ratelimit); diff --git a/kernel/profile.c b/kernel/profile.c new file mode 100644 index 00000000000..a38fa70075f --- /dev/null +++ b/kernel/profile.c @@ -0,0 +1,563 @@ +/* + * linux/kernel/profile.c + * Simple profiling. Manages a direct-mapped profile hit count buffer, + * with configurable resolution, support for restricting the cpus on + * which profiling is done, and switching between cpu time and + * schedule() calls via kernel command line parameters passed at boot. + * + * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, + * Red Hat, July 2004 + * Consolidation of architecture support code for profiling, + * William Irwin, Oracle, July 2004 + * Amortized hit count accounting via per-cpu open-addressed hashtables + * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/profile.h> +#include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/cpu.h> +#include <linux/profile.h> +#include <linux/highmem.h> +#include <asm/sections.h> +#include <asm/semaphore.h> + +struct profile_hit { + u32 pc, hits; +}; +#define PROFILE_GRPSHIFT 3 +#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) +#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) +#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) + +/* Oprofile timer tick hook */ +int (*timer_hook)(struct pt_regs *); + +static atomic_t *prof_buffer; +static unsigned long prof_len, prof_shift; +static int prof_on; +static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); +static DEFINE_PER_CPU(int, cpu_profile_flip); +static DECLARE_MUTEX(profile_flip_mutex); +#endif /* CONFIG_SMP */ + +static int __init profile_setup(char * str) +{ + int par; + + if (!strncmp(str, "schedule", 8)) { + prof_on = SCHED_PROFILING; + printk(KERN_INFO "kernel schedule profiling enabled\n"); + if (str[7] == ',') + str += 8; + } + if (get_option(&str,&par)) { + prof_shift = par; + prof_on = CPU_PROFILING; + printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", + prof_shift); + } + return 1; +} +__setup("profile=", profile_setup); + + +void __init profile_init(void) +{ + if (!prof_on) + return; + + /* only text is profiled */ + prof_len = (_etext - _stext) >> prof_shift; + prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); +} + +/* Profile event notifications */ + +#ifdef CONFIG_PROFILING + +static DECLARE_RWSEM(profile_rwsem); +static DEFINE_RWLOCK(handoff_lock); +static struct notifier_block * task_exit_notifier; +static struct notifier_block * task_free_notifier; +static struct notifier_block * munmap_notifier; + +void profile_task_exit(struct task_struct * task) +{ + down_read(&profile_rwsem); + notifier_call_chain(&task_exit_notifier, 0, task); + up_read(&profile_rwsem); +} + +int profile_handoff_task(struct task_struct * task) +{ + int ret; + read_lock(&handoff_lock); + ret = notifier_call_chain(&task_free_notifier, 0, task); + read_unlock(&handoff_lock); + return (ret == NOTIFY_OK) ? 1 : 0; +} + +void profile_munmap(unsigned long addr) +{ + down_read(&profile_rwsem); + notifier_call_chain(&munmap_notifier, 0, (void *)addr); + up_read(&profile_rwsem); +} + +int task_handoff_register(struct notifier_block * n) +{ + int err = -EINVAL; + + write_lock(&handoff_lock); + err = notifier_chain_register(&task_free_notifier, n); + write_unlock(&handoff_lock); + return err; +} + +int task_handoff_unregister(struct notifier_block * n) +{ + int err = -EINVAL; + + write_lock(&handoff_lock); + err = notifier_chain_unregister(&task_free_notifier, n); + write_unlock(&handoff_lock); + return err; +} + +int profile_event_register(enum profile_type type, struct notifier_block * n) +{ + int err = -EINVAL; + + down_write(&profile_rwsem); + + switch (type) { + case PROFILE_TASK_EXIT: + err = notifier_chain_register(&task_exit_notifier, n); + break; + case PROFILE_MUNMAP: + err = notifier_chain_register(&munmap_notifier, n); + break; + } + + up_write(&profile_rwsem); + + return err; +} + + +int profile_event_unregister(enum profile_type type, struct notifier_block * n) +{ + int err = -EINVAL; + + down_write(&profile_rwsem); + + switch (type) { + case PROFILE_TASK_EXIT: + err = notifier_chain_unregister(&task_exit_notifier, n); + break; + case PROFILE_MUNMAP: + err = notifier_chain_unregister(&munmap_notifier, n); + break; + } + + up_write(&profile_rwsem); + return err; +} + +int register_timer_hook(int (*hook)(struct pt_regs *)) +{ + if (timer_hook) + return -EBUSY; + timer_hook = hook; + return 0; +} + +void unregister_timer_hook(int (*hook)(struct pt_regs *)) +{ + WARN_ON(hook != timer_hook); + timer_hook = NULL; + /* make sure all CPUs see the NULL hook */ + synchronize_kernel(); +} + +EXPORT_SYMBOL_GPL(register_timer_hook); +EXPORT_SYMBOL_GPL(unregister_timer_hook); +EXPORT_SYMBOL_GPL(task_handoff_register); +EXPORT_SYMBOL_GPL(task_handoff_unregister); + +#endif /* CONFIG_PROFILING */ + +EXPORT_SYMBOL_GPL(profile_event_register); +EXPORT_SYMBOL_GPL(profile_event_unregister); + +#ifdef CONFIG_SMP +/* + * Each cpu has a pair of open-addressed hashtables for pending + * profile hits. read_profile() IPI's all cpus to request them + * to flip buffers and flushes their contents to prof_buffer itself. + * Flip requests are serialized by the profile_flip_mutex. The sole + * use of having a second hashtable is for avoiding cacheline + * contention that would otherwise happen during flushes of pending + * profile hits required for the accuracy of reported profile hits + * and so resurrect the interrupt livelock issue. + * + * The open-addressed hashtables are indexed by profile buffer slot + * and hold the number of pending hits to that profile buffer slot on + * a cpu in an entry. When the hashtable overflows, all pending hits + * are accounted to their corresponding profile buffer slots with + * atomic_add() and the hashtable emptied. As numerous pending hits + * may be accounted to a profile buffer slot in a hashtable entry, + * this amortizes a number of atomic profile buffer increments likely + * to be far larger than the number of entries in the hashtable, + * particularly given that the number of distinct profile buffer + * positions to which hits are accounted during short intervals (e.g. + * several seconds) is usually very small. Exclusion from buffer + * flipping is provided by interrupt disablement (note that for + * SCHED_PROFILING profile_hit() may be called from process context). + * The hash function is meant to be lightweight as opposed to strong, + * and was vaguely inspired by ppc64 firmware-supported inverted + * pagetable hash functions, but uses a full hashtable full of finite + * collision chains, not just pairs of them. + * + * -- wli + */ +static void __profile_flip_buffers(void *unused) +{ + int cpu = smp_processor_id(); + + per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); +} + +static void profile_flip_buffers(void) +{ + int i, j, cpu; + + down(&profile_flip_mutex); + j = per_cpu(cpu_profile_flip, get_cpu()); + put_cpu(); + on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + for_each_online_cpu(cpu) { + struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; + for (i = 0; i < NR_PROFILE_HIT; ++i) { + if (!hits[i].hits) { + if (hits[i].pc) + hits[i].pc = 0; + continue; + } + atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); + hits[i].hits = hits[i].pc = 0; + } + } + up(&profile_flip_mutex); +} + +static void profile_discard_flip_buffers(void) +{ + int i, cpu; + + down(&profile_flip_mutex); + i = per_cpu(cpu_profile_flip, get_cpu()); + put_cpu(); + on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + for_each_online_cpu(cpu) { + struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; + memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); + } + up(&profile_flip_mutex); +} + +void profile_hit(int type, void *__pc) +{ + unsigned long primary, secondary, flags, pc = (unsigned long)__pc; + int i, j, cpu; + struct profile_hit *hits; + + if (prof_on != type || !prof_buffer) + return; + pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); + i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; + secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; + cpu = get_cpu(); + hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; + if (!hits) { + put_cpu(); + return; + } + local_irq_save(flags); + do { + for (j = 0; j < PROFILE_GRPSZ; ++j) { + if (hits[i + j].pc == pc) { + hits[i + j].hits++; + goto out; + } else if (!hits[i + j].hits) { + hits[i + j].pc = pc; + hits[i + j].hits = 1; + goto out; + } + } + i = (i + secondary) & (NR_PROFILE_HIT - 1); + } while (i != primary); + atomic_inc(&prof_buffer[pc]); + for (i = 0; i < NR_PROFILE_HIT; ++i) { + atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); + hits[i].pc = hits[i].hits = 0; + } +out: + local_irq_restore(flags); + put_cpu(); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int __devinit profile_cpu_callback(struct notifier_block *info, + unsigned long action, void *__cpu) +{ + int node, cpu = (unsigned long)__cpu; + struct page *page; + + switch (action) { + case CPU_UP_PREPARE: + node = cpu_to_node(cpu); + per_cpu(cpu_profile_flip, cpu) = 0; + if (!per_cpu(cpu_profile_hits, cpu)[1]) { + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NOTIFY_BAD; + per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); + } + if (!per_cpu(cpu_profile_hits, cpu)[0]) { + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + goto out_free; + per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); + } + break; + out_free: + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); + per_cpu(cpu_profile_hits, cpu)[1] = NULL; + __free_page(page); + return NOTIFY_BAD; + case CPU_ONLINE: + cpu_set(cpu, prof_cpu_mask); + break; + case CPU_UP_CANCELED: + case CPU_DEAD: + cpu_clear(cpu, prof_cpu_mask); + if (per_cpu(cpu_profile_hits, cpu)[0]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); + per_cpu(cpu_profile_hits, cpu)[0] = NULL; + __free_page(page); + } + if (per_cpu(cpu_profile_hits, cpu)[1]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); + per_cpu(cpu_profile_hits, cpu)[1] = NULL; + __free_page(page); + } + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ +#else /* !CONFIG_SMP */ +#define profile_flip_buffers() do { } while (0) +#define profile_discard_flip_buffers() do { } while (0) + +void profile_hit(int type, void *__pc) +{ + unsigned long pc; + + if (prof_on != type || !prof_buffer) + return; + pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; + atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); +} +#endif /* !CONFIG_SMP */ + +void profile_tick(int type, struct pt_regs *regs) +{ + if (type == CPU_PROFILING && timer_hook) + timer_hook(regs); + if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + profile_hit(type, (void *)profile_pc(regs)); +} + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +#include <asm/uaccess.h> +#include <asm/ptrace.h> + +static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + cpumask_t *mask = (cpumask_t *)data; + unsigned long full_count = count, err; + cpumask_t new_value; + + err = cpumask_parse(buffer, count, new_value); + if (err) + return err; + + *mask = new_value; + return full_count; +} + +void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) +{ + struct proc_dir_entry *entry; + + /* create /proc/irq/prof_cpu_mask */ + if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) + return; + entry->nlink = 1; + entry->data = (void *)&prof_cpu_mask; + entry->read_proc = prof_cpu_mask_read_proc; + entry->write_proc = prof_cpu_mask_write_proc; +} + +/* + * This function accesses profiling information. The returned data is + * binary: the sampling step and the actual contents of the profile + * buffer. Use of the program readprofile is recommended in order to + * get meaningful info out of these data. + */ +static ssize_t +read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read; + char * pnt; + unsigned int sample_step = 1 << prof_shift; + + profile_flip_buffers(); + if (p >= (prof_len+1)*sizeof(unsigned int)) + return 0; + if (count > (prof_len+1)*sizeof(unsigned int) - p) + count = (prof_len+1)*sizeof(unsigned int) - p; + read = 0; + + while (p < sizeof(unsigned int) && count > 0) { + put_user(*((char *)(&sample_step)+p),buf); + buf++; p++; count--; read++; + } + pnt = (char *)prof_buffer + p - sizeof(atomic_t); + if (copy_to_user(buf,(void *)pnt,count)) + return -EFAULT; + read += count; + *ppos += read; + return read; +} + +/* + * Writing to /proc/profile resets the counters + * + * Writing a 'profiling multiplier' value into it also re-sets the profiling + * interrupt frequency, on architectures that support this. + */ +static ssize_t write_profile(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ +#ifdef CONFIG_SMP + extern int setup_profiling_timer (unsigned int multiplier); + + if (count == sizeof(int)) { + unsigned int multiplier; + + if (copy_from_user(&multiplier, buf, sizeof(int))) + return -EFAULT; + + if (setup_profiling_timer(multiplier)) + return -EINVAL; + } +#endif + profile_discard_flip_buffers(); + memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); + return count; +} + +static struct file_operations proc_profile_operations = { + .read = read_profile, + .write = write_profile, +}; + +#ifdef CONFIG_SMP +static void __init profile_nop(void *unused) +{ +} + +static int __init create_hash_tables(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + int node = cpu_to_node(cpu); + struct page *page; + + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + goto out_cleanup; + per_cpu(cpu_profile_hits, cpu)[1] + = (struct profile_hit *)page_address(page); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + goto out_cleanup; + per_cpu(cpu_profile_hits, cpu)[0] + = (struct profile_hit *)page_address(page); + } + return 0; +out_cleanup: + prof_on = 0; + mb(); + on_each_cpu(profile_nop, NULL, 0, 1); + for_each_online_cpu(cpu) { + struct page *page; + + if (per_cpu(cpu_profile_hits, cpu)[0]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); + per_cpu(cpu_profile_hits, cpu)[0] = NULL; + __free_page(page); + } + if (per_cpu(cpu_profile_hits, cpu)[1]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); + per_cpu(cpu_profile_hits, cpu)[1] = NULL; + __free_page(page); + } + } + return -1; +} +#else +#define create_hash_tables() ({ 0; }) +#endif + +static int __init create_proc_profile(void) +{ + struct proc_dir_entry *entry; + + if (!prof_on) + return 0; + if (create_hash_tables()) + return -1; + if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) + return 0; + entry->proc_fops = &proc_profile_operations; + entry->size = (1+prof_len) * sizeof(atomic_t); + hotcpu_notifier(profile_cpu_callback, 0); + return 0; +} +module_init(create_proc_profile); +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c new file mode 100644 index 00000000000..88b306c4e84 --- /dev/null +++ b/kernel/ptrace.c @@ -0,0 +1,389 @@ +/* + * linux/kernel/ptrace.c + * + * (C) Copyright 1999 Linus Torvalds + * + * Common interfaces for "ptrace()" which we do not want + * to continually duplicate across every architecture. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include <linux/ptrace.h> +#include <linux/security.h> + +#include <asm/pgtable.h> +#include <asm/uaccess.h> + +/* + * ptrace a task: make the debugger its new parent and + * move it to the ptrace list. + * + * Must be called with the tasklist lock write-held. + */ +void __ptrace_link(task_t *child, task_t *new_parent) +{ + if (!list_empty(&child->ptrace_list)) + BUG(); + if (child->parent == new_parent) + return; + list_add(&child->ptrace_list, &child->parent->ptrace_children); + REMOVE_LINKS(child); + child->parent = new_parent; + SET_LINKS(child); +} + +/* + * Turn a tracing stop into a normal stop now, since with no tracer there + * would be no way to wake it up with SIGCONT or SIGKILL. If there was a + * signal sent that would resume the child, but didn't because it was in + * TASK_TRACED, resume it now. + * Requires that irqs be disabled. + */ +void ptrace_untrace(task_t *child) +{ + spin_lock(&child->sighand->siglock); + if (child->state == TASK_TRACED) { + if (child->signal->flags & SIGNAL_STOP_STOPPED) { + child->state = TASK_STOPPED; + } else { + signal_wake_up(child, 1); + } + } + spin_unlock(&child->sighand->siglock); +} + +/* + * unptrace a task: move it back to its original parent and + * remove it from the ptrace list. + * + * Must be called with the tasklist lock write-held. + */ +void __ptrace_unlink(task_t *child) +{ + if (!child->ptrace) + BUG(); + child->ptrace = 0; + if (!list_empty(&child->ptrace_list)) { + list_del_init(&child->ptrace_list); + REMOVE_LINKS(child); + child->parent = child->real_parent; + SET_LINKS(child); + } + + if (child->state == TASK_TRACED) + ptrace_untrace(child); +} + +/* + * Check that we have indeed attached to the thing.. + */ +int ptrace_check_attach(struct task_struct *child, int kill) +{ + int ret = -ESRCH; + + /* + * We take the read lock around doing both checks to close a + * possible race where someone else was tracing our child and + * detached between these two checks. After this locked check, + * we are sure that this is our traced child and that can only + * be changed by us so it's not changing right after this. + */ + read_lock(&tasklist_lock); + if ((child->ptrace & PT_PTRACED) && child->parent == current && + (!(child->ptrace & PT_ATTACHED) || child->real_parent != current) + && child->signal != NULL) { + ret = 0; + spin_lock_irq(&child->sighand->siglock); + if (child->state == TASK_STOPPED) { + child->state = TASK_TRACED; + } else if (child->state != TASK_TRACED && !kill) { + ret = -ESRCH; + } + spin_unlock_irq(&child->sighand->siglock); + } + read_unlock(&tasklist_lock); + + if (!ret && !kill) { + wait_task_inactive(child); + } + + /* All systems go.. */ + return ret; +} + +int ptrace_attach(struct task_struct *task) +{ + int retval; + task_lock(task); + retval = -EPERM; + if (task->pid <= 1) + goto bad; + if (task == current) + goto bad; + if (!task->mm) + goto bad; + if(((current->uid != task->euid) || + (current->uid != task->suid) || + (current->uid != task->uid) || + (current->gid != task->egid) || + (current->gid != task->sgid) || + (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + goto bad; + rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + goto bad; + /* the same process cannot be attached many times */ + if (task->ptrace & PT_PTRACED) + goto bad; + retval = security_ptrace(current, task); + if (retval) + goto bad; + + /* Go */ + task->ptrace |= PT_PTRACED | ((task->real_parent != current) + ? PT_ATTACHED : 0); + if (capable(CAP_SYS_PTRACE)) + task->ptrace |= PT_PTRACE_CAP; + task_unlock(task); + + write_lock_irq(&tasklist_lock); + __ptrace_link(task, current); + write_unlock_irq(&tasklist_lock); + + force_sig_specific(SIGSTOP, task); + return 0; + +bad: + task_unlock(task); + return retval; +} + +int ptrace_detach(struct task_struct *child, unsigned int data) +{ + if ((unsigned long) data > _NSIG) + return -EIO; + + /* Architecture-specific hardware disable .. */ + ptrace_disable(child); + + /* .. re-parent .. */ + child->exit_code = data; + + write_lock_irq(&tasklist_lock); + __ptrace_unlink(child); + /* .. and wake it up. */ + if (child->exit_state != EXIT_ZOMBIE) + wake_up_process(child); + write_unlock_irq(&tasklist_lock); + + return 0; +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ + +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, ret, offset; + void *maddr; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} + +int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) +{ + int copied = 0; + + while (len > 0) { + char buf[128]; + int this_len, retval; + + this_len = (len > sizeof(buf)) ? sizeof(buf) : len; + retval = access_process_vm(tsk, src, buf, this_len, 0); + if (!retval) { + if (copied) + break; + return -EIO; + } + if (copy_to_user(dst, buf, retval)) + return -EFAULT; + copied += retval; + src += retval; + dst += retval; + len -= retval; + } + return copied; +} + +int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len) +{ + int copied = 0; + + while (len > 0) { + char buf[128]; + int this_len, retval; + + this_len = (len > sizeof(buf)) ? sizeof(buf) : len; + if (copy_from_user(buf, src, this_len)) + return -EFAULT; + retval = access_process_vm(tsk, dst, buf, this_len, 1); + if (!retval) { + if (copied) + break; + return -EIO; + } + copied += retval; + src += retval; + dst += retval; + len -= retval; + } + return copied; +} + +static int ptrace_setoptions(struct task_struct *child, long data) +{ + child->ptrace &= ~PT_TRACE_MASK; + + if (data & PTRACE_O_TRACESYSGOOD) + child->ptrace |= PT_TRACESYSGOOD; + + if (data & PTRACE_O_TRACEFORK) + child->ptrace |= PT_TRACE_FORK; + + if (data & PTRACE_O_TRACEVFORK) + child->ptrace |= PT_TRACE_VFORK; + + if (data & PTRACE_O_TRACECLONE) + child->ptrace |= PT_TRACE_CLONE; + + if (data & PTRACE_O_TRACEEXEC) + child->ptrace |= PT_TRACE_EXEC; + + if (data & PTRACE_O_TRACEVFORKDONE) + child->ptrace |= PT_TRACE_VFORK_DONE; + + if (data & PTRACE_O_TRACEEXIT) + child->ptrace |= PT_TRACE_EXIT; + + return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; +} + +static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) +{ + siginfo_t lastinfo; + int error = -ESRCH; + + read_lock(&tasklist_lock); + if (likely(child->sighand != NULL)) { + error = -EINVAL; + spin_lock_irq(&child->sighand->siglock); + if (likely(child->last_siginfo != NULL)) { + lastinfo = *child->last_siginfo; + error = 0; + } + spin_unlock_irq(&child->sighand->siglock); + } + read_unlock(&tasklist_lock); + if (!error) + return copy_siginfo_to_user(data, &lastinfo); + return error; +} + +static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) +{ + siginfo_t newinfo; + int error = -ESRCH; + + if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) + return -EFAULT; + + read_lock(&tasklist_lock); + if (likely(child->sighand != NULL)) { + error = -EINVAL; + spin_lock_irq(&child->sighand->siglock); + if (likely(child->last_siginfo != NULL)) { + *child->last_siginfo = newinfo; + error = 0; + } + spin_unlock_irq(&child->sighand->siglock); + } + read_unlock(&tasklist_lock); + return error; +} + +int ptrace_request(struct task_struct *child, long request, + long addr, long data) +{ + int ret = -EIO; + + switch (request) { +#ifdef PTRACE_OLDSETOPTIONS + case PTRACE_OLDSETOPTIONS: +#endif + case PTRACE_SETOPTIONS: + ret = ptrace_setoptions(child, data); + break; + case PTRACE_GETEVENTMSG: + ret = put_user(child->ptrace_message, (unsigned long __user *) data); + break; + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); + break; + case PTRACE_SETSIGINFO: + ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); + break; + default: + break; + } + + return ret; +} diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c new file mode 100644 index 00000000000..d00eded75d7 --- /dev/null +++ b/kernel/rcupdate.c @@ -0,0 +1,470 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2001 + * + * Authors: Dipankar Sarma <dipankar@in.ibm.com> + * Manfred Spraul <manfred@colorfullife.com> + * + * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <asm/atomic.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> + +/* Definition for rcupdate control block. */ +struct rcu_ctrlblk rcu_ctrlblk = + { .cur = -300, .completed = -300 }; +struct rcu_ctrlblk rcu_bh_ctrlblk = + { .cur = -300, .completed = -300 }; + +/* Bookkeeping of the progress of the grace period */ +struct rcu_state { + spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ + cpumask_t cpumask; /* CPUs that need to switch in order */ + /* for current batch to proceed. */ +}; + +static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = + {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; +static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = + {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; + +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; + +/* Fake initialization required by compiler */ +static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; +static int maxbatch = 10; + +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void fastcall call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + local_irq_restore(flags); +} + +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. + */ +void fastcall call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + local_irq_restore(flags); +} + +/* + * Invoke the completed RCU callbacks. They are expected to be in + * a per-cpu list. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + struct rcu_head *next, *list; + int count = 0; + + list = rdp->donelist; + while (list) { + next = rdp->donelist = list->next; + list->func(list); + list = next; + if (++count >= maxbatch) + break; + } + if (!rdp->donelist) + rdp->donetail = &rdp->donelist; + else + tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); +} + +/* + * Grace period handling: + * The grace period handling consists out of two steps: + * - A new grace period is started. + * This is done by rcu_start_batch. The start is not broadcasted to + * all cpus, they must pick this up by comparing rcp->cur with + * rdp->quiescbatch. All cpus are recorded in the + * rcu_state.cpumask bitmap. + * - All cpus must go through a quiescent state. + * Since the start of the grace period is not broadcasted, at least two + * calls to rcu_check_quiescent_state are required: + * The first call just notices that a new grace period is running. The + * following calls check if there was a quiescent state since the beginning + * of the grace period. If so, it updates rcu_state.cpumask. If + * the bitmap is empty, then the grace period is completed. + * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace + * period (if necessary). + */ +/* + * Register a new batch of callbacks, and start it up if there is currently no + * active batch and the batch to be registered has not already occurred. + * Caller must hold rcu_state.lock. + */ +static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, + int next_pending) +{ + if (next_pending) + rcp->next_pending = 1; + + if (rcp->next_pending && + rcp->completed == rcp->cur) { + /* Can't change, since spin lock held. */ + cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); + + rcp->next_pending = 0; + /* next_pending == 0 must be visible in __rcu_process_callbacks() + * before it can see new value of cur. + */ + smp_wmb(); + rcp->cur++; + } +} + +/* + * cpu went through a quiescent state since the beginning of the grace period. + * Clear it from the cpu mask and complete the grace period if it was the last + * cpu. Start another grace period if someone has further entries pending + */ +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) +{ + cpu_clear(cpu, rsp->cpumask); + if (cpus_empty(rsp->cpumask)) { + /* batch completed ! */ + rcp->completed = rcp->cur; + rcu_start_batch(rcp, rsp, 0); + } +} + +/* + * Check if the cpu has gone through a quiescent state (say context + * switch). If so and if it already hasn't done so in this RCU + * quiescent cycle, then indicate that it has done so. + */ +static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, + struct rcu_state *rsp, struct rcu_data *rdp) +{ + if (rdp->quiescbatch != rcp->cur) { + /* start new grace period: */ + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->quiescbatch = rcp->cur; + return; + } + + /* Grace period already completed for this cpu? + * qs_pending is checked instead of the actual bitmap to avoid + * cacheline trashing. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!rdp->passed_quiesc) + return; + rdp->qs_pending = 0; + + spin_lock(&rsp->lock); + /* + * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state. + */ + if (likely(rdp->quiescbatch == rcp->cur)) + cpu_quiet(rdp->cpu, rcp, rsp); + + spin_unlock(&rsp->lock); +} + + +#ifdef CONFIG_HOTPLUG_CPU + +/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing + * locking requirements, the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts. + */ +static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, + struct rcu_head **tail) +{ + local_irq_disable(); + *this_rdp->nxttail = list; + if (list) + this_rdp->nxttail = tail; + local_irq_enable(); +} + +static void __rcu_offline_cpu(struct rcu_data *this_rdp, + struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_bh(&rsp->lock); + if (rcp->cur != rcp->completed) + cpu_quiet(rdp->cpu, rcp, rsp); + spin_unlock_bh(&rsp->lock); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + +} +static void rcu_offline_cpu(int cpu) +{ + struct rcu_data *this_rdp = &get_cpu_var(rcu_data); + struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); + + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, + &per_cpu(rcu_data, cpu)); + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, + &per_cpu(rcu_bh_data, cpu)); + put_cpu_var(rcu_data); + put_cpu_var(rcu_bh_data); + tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); +} + +#else + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif + +/* + * This does the RCU processing work from tasklet context. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, + struct rcu_state *rsp, struct rcu_data *rdp) +{ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; + } + + local_irq_disable(); + if (rdp->nxtlist && !rdp->curlist) { + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; + local_irq_enable(); + + /* + * start the next batch of callbacks + */ + + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() + */ + smp_rmb(); + + if (!rcp->next_pending) { + /* and start it/schedule start if it's a new batch */ + spin_lock(&rsp->lock); + rcu_start_batch(rcp, rsp, 1); + spin_unlock(&rsp->lock); + } + } else { + local_irq_enable(); + } + rcu_check_quiescent_state(rcp, rsp, rdp); + if (rdp->donelist) + rcu_do_batch(rdp); +} + +static void rcu_process_callbacks(unsigned long unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, + &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, + &__get_cpu_var(rcu_bh_data)); +} + +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); + tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); +} + +static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + memset(rdp, 0, sizeof(*rdp)); + rdp->curtail = &rdp->curlist; + rdp->nxttail = &rdp->nxtlist; + rdp->donetail = &rdp->donelist; + rdp->quiescbatch = rcp->completed; + rdp->qs_pending = 0; + rdp->cpu = cpu; +} + +static void __devinit rcu_online_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); + + rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); + rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); + tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); +} + +static int __devinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + rcu_online_cpu(cpu); + break; + case CPU_DEAD: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + +/* + * Initializes rcu mechanism. Assumed to be called early. + * That is before local timer(SMP) or jiffie timer (uniproc) is setup. + * Note that rcu_qsctr and friends are implicitly + * initialized due to the choice of ``0'' for RCU_CTR_INVALID. + */ +void __init rcu_init(void) +{ + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); +} + +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; + +/* Because of FASTCALL declaration of complete, we use this wrapper */ +static void wakeme_after_rcu(struct rcu_head *head) +{ + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); +} + +/** + * synchronize_kernel - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_kernel(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished */ + call_rcu(&rcu.head, wakeme_after_rcu); + + /* Wait for it */ + wait_for_completion(&rcu.completion); +} + +module_param(maxbatch, int, 0); +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); +EXPORT_SYMBOL_GPL(synchronize_kernel); diff --git a/kernel/resource.c b/kernel/resource.c new file mode 100644 index 00000000000..35c99ac02c7 --- /dev/null +++ b/kernel/resource.c @@ -0,0 +1,551 @@ +/* + * linux/kernel/resource.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 1999 Martin Mares <mj@ucw.cz> + * + * Arbitrary resource management. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <asm/io.h> + + +struct resource ioport_resource = { + .name = "PCI IO", + .start = 0x0000, + .end = IO_SPACE_LIMIT, + .flags = IORESOURCE_IO, +}; + +EXPORT_SYMBOL(ioport_resource); + +struct resource iomem_resource = { + .name = "PCI mem", + .start = 0UL, + .end = ~0UL, + .flags = IORESOURCE_MEM, +}; + +EXPORT_SYMBOL(iomem_resource); + +static DEFINE_RWLOCK(resource_lock); + +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + if (p->child) + return p->child; + while (!p->sibling && p->parent) + p = p->parent; + return p->sibling; +} + +static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(resource_lock) +{ + struct resource *p = m->private; + loff_t l = 0; + read_lock(&resource_lock); + for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) + ; + return p; +} + +static void r_stop(struct seq_file *m, void *v) + __releases(resource_lock) +{ + read_unlock(&resource_lock); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct resource *root = m->private; + struct resource *r = v, *p; + int width = root->end < 0x10000 ? 4 : 8; + int depth; + + for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) + if (p->parent == root) + break; + seq_printf(m, "%*s%0*lx-%0*lx : %s\n", + depth * 2, "", + width, r->start, + width, r->end, + r->name ? r->name : "<BAD>"); + return 0; +} + +static struct seq_operations resource_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int ioports_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &ioport_resource; + } + return res; +} + +static int iomem_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &iomem_resource; + } + return res; +} + +static struct file_operations proc_ioports_operations = { + .open = ioports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations proc_iomem_operations = { + .open = iomem_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ioresources_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("ioports", 0, NULL); + if (entry) + entry->proc_fops = &proc_ioports_operations; + entry = create_proc_entry("iomem", 0, NULL); + if (entry) + entry->proc_fops = &proc_iomem_operations; + return 0; +} +__initcall(ioresources_init); + +#endif /* CONFIG_PROC_FS */ + +/* Return the conflict entry if you can't request it */ +static struct resource * __request_resource(struct resource *root, struct resource *new) +{ + unsigned long start = new->start; + unsigned long end = new->end; + struct resource *tmp, **p; + + if (end < start) + return root; + if (start < root->start) + return root; + if (end > root->end) + return root; + p = &root->child; + for (;;) { + tmp = *p; + if (!tmp || tmp->start > end) { + new->sibling = tmp; + *p = new; + new->parent = root; + return NULL; + } + p = &tmp->sibling; + if (tmp->end < start) + continue; + return tmp; + } +} + +static int __release_resource(struct resource *old) +{ + struct resource *tmp, **p; + + p = &old->parent->child; + for (;;) { + tmp = *p; + if (!tmp) + break; + if (tmp == old) { + *p = tmp->sibling; + old->parent = NULL; + return 0; + } + p = &tmp->sibling; + } + return -EINVAL; +} + +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __request_resource(root, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +EXPORT_SYMBOL(request_resource); + +struct resource *____request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __request_resource(root, new); + write_unlock(&resource_lock); + return conflict; +} + +EXPORT_SYMBOL(____request_resource); + +int release_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old); + write_unlock(&resource_lock); + return retval; +} + +EXPORT_SYMBOL(release_resource); + +/* + * Find empty slot in the resource tree given range and alignment. + */ +static int find_resource(struct resource *root, struct resource *new, + unsigned long size, + unsigned long min, unsigned long max, + unsigned long align, + void (*alignf)(void *, struct resource *, + unsigned long, unsigned long), + void *alignf_data) +{ + struct resource *this = root->child; + + new->start = root->start; + /* + * Skip past an allocated resource that starts at 0, since the assignment + * of this->start - 1 to new->end below would cause an underflow. + */ + if (this && this->start == 0) { + new->start = this->end + 1; + this = this->sibling; + } + for(;;) { + if (this) + new->end = this->start - 1; + else + new->end = root->end; + if (new->start < min) + new->start = min; + if (new->end > max) + new->end = max; + new->start = (new->start + align - 1) & ~(align - 1); + if (alignf) + alignf(alignf_data, new, size, align); + if (new->start < new->end && new->end - new->start + 1 >= size) { + new->end = new->start + size - 1; + return 0; + } + if (!this) + break; + new->start = this->end + 1; + this = this->sibling; + } + return -EBUSY; +} + +/* + * Allocate empty slot in the resource tree given range and alignment. + */ +int allocate_resource(struct resource *root, struct resource *new, + unsigned long size, + unsigned long min, unsigned long max, + unsigned long align, + void (*alignf)(void *, struct resource *, + unsigned long, unsigned long), + void *alignf_data) +{ + int err; + + write_lock(&resource_lock); + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + if (err >= 0 && __request_resource(root, new)) + err = -EBUSY; + write_unlock(&resource_lock); + return err; +} + +EXPORT_SYMBOL(allocate_resource); + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent of request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become childs of + * the new resource. Otherwise the new resource becomes the child of + * the conflicting resource + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + int result; + struct resource *first, *next; + + write_lock(&resource_lock); + begin: + result = 0; + first = __request_resource(parent, new); + if (!first) + goto out; + + result = -EBUSY; + if (first == parent) + goto out; + + /* Resource fully contained by the clashing resource? Recurse into it */ + if (first->start <= new->start && first->end >= new->end) { + parent = first; + goto begin; + } + + for (next = first; ; next = next->sibling) { + /* Partial overlap? Bad, and unfixable */ + if (next->start < new->start || next->end > new->end) + goto out; + if (!next->sibling) + break; + if (next->sibling->start > new->end) + break; + } + + result = 0; + + new->parent = parent; + new->sibling = next->sibling; + new->child = first; + + next->sibling = NULL; + for (next = first; next; next = next->sibling) + next->parent = new; + + if (parent->child == first) { + parent->child = new; + } else { + next = parent->child; + while (next->sibling != first) + next = next->sibling; + next->sibling = new; + } + + out: + write_unlock(&resource_lock); + return result; +} + +EXPORT_SYMBOL(insert_resource); + +/* + * Given an existing resource, change its start and size to match the + * arguments. Returns -EBUSY if it can't fit. Existing children of + * the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, unsigned long start, unsigned long size) +{ + struct resource *tmp, *parent = res->parent; + unsigned long end = start + size - 1; + int result = -EBUSY; + + write_lock(&resource_lock); + + if ((start < parent->start) || (end > parent->end)) + goto out; + + for (tmp = res->child; tmp; tmp = tmp->sibling) { + if ((tmp->start < start) || (tmp->end > end)) + goto out; + } + + if (res->sibling && (res->sibling->start <= end)) + goto out; + + tmp = parent->child; + if (tmp != res) { + while (tmp->sibling != res) + tmp = tmp->sibling; + if (start <= tmp->end) + goto out; + } + + res->start = start; + res->end = end; + result = 0; + + out: + write_unlock(&resource_lock); + return result; +} + +EXPORT_SYMBOL(adjust_resource); + +/* + * This is compatibility stuff for IO resources. + * + * Note how this, unlike the above, knows about + * the IO flag meanings (busy etc). + * + * Request-region creates a new busy region. + * + * Check-region returns non-zero if the area is already busy + * + * Release-region releases a matching busy region. + */ +struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) +{ + struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); + + if (res) { + memset(res, 0, sizeof(*res)); + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; + + write_lock(&resource_lock); + + for (;;) { + struct resource *conflict; + + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + write_unlock(&resource_lock); + } + return res; +} + +EXPORT_SYMBOL(__request_region); + +int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) +{ + struct resource * res; + + res = __request_region(parent, start, n, "check-region"); + if (!res) + return -EBUSY; + + release_resource(res); + kfree(res); + return 0; +} + +EXPORT_SYMBOL(__check_region); + +void __release_region(struct resource *parent, unsigned long start, unsigned long n) +{ + struct resource **p; + unsigned long end; + + p = &parent->child; + end = start + n - 1; + + write_lock(&resource_lock); + + for (;;) { + struct resource *res = *p; + + if (!res) + break; + if (res->start <= start && res->end >= end) { + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + if (res->start != start || res->end != end) + break; + *p = res->sibling; + write_unlock(&resource_lock); + kfree(res); + return; + } + p = &res->sibling; + } + + write_unlock(&resource_lock); + + printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); +} + +EXPORT_SYMBOL(__release_region); + +/* + * Called from init/main.c to reserve IO ports. + */ +#define MAXRESERVE 4 +static int __init reserve_setup(char *str) +{ + static int reserved; + static struct resource reserve[MAXRESERVE]; + + for (;;) { + int io_start, io_num; + int x = reserved; + + if (get_option (&str, &io_start) != 2) + break; + if (get_option (&str, &io_num) == 0) + break; + if (x < MAXRESERVE) { + struct resource *res = reserve + x; + res->name = "reserved"; + res->start = io_start; + res->end = io_start + io_num - 1; + res->flags = IORESOURCE_BUSY; + res->child = NULL; + if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) + reserved = x+1; + } + } + return 1; +} + +__setup("reserve=", reserve_setup); diff --git a/kernel/sched.c b/kernel/sched.c new file mode 100644 index 00000000000..f69c4a5361e --- /dev/null +++ b/kernel/sched.c @@ -0,0 +1,5004 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <asm/uaccess.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/suspend.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/seq_file.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/acct.h> +#include <asm/tlb.h> + +#include <asm/unistd.h> + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. + */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + +static inline unsigned int task_timeslice(task_t *p) +{ + if (p->static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); +} +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) + +/* + * These are the runqueue data structures: + */ + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +typedef struct runqueue runqueue_t; + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long cpu_load; +#endif + unsigned long long nr_switches; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + unsigned long expired_timestamp; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; + prio_array_t *active, *expired, arrays[2]; + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_local; +#endif +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while (0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 11 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + runqueue_t *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + for_each_domain(cpu, sd) { + enum idle_type itype; + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, + sd->sbe_pushed, sd->sbe_attempts, + sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + } +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +#endif + +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static inline runqueue_t *this_rq_lock(void) + __acquires(rq->lock) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHED_SMT +static int cpu_and_siblings_are_idle(int cpu) +{ + int sib; + for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { + if (idle_cpu(sib)) + continue; + return 0; + } + + return 1; +} +#else +#define cpu_and_siblings_are_idle(A) idle_cpu(A) +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!rq) + return; + + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (rq) + rq->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task(struct task_struct *p, prio_array_t *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +/* + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. + */ +static int effective_prio(task_t *p) +{ + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->active); + rq->nr_running++; +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->active); + rq->nr_running++; +} + +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + /* Caller must always ensure 'now >= p->timestamp' */ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && p->activated != -1 && + sleep_time > INTERACTIVE_SLEEP(p)) { + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + DEF_TIMESLICE); + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O + */ + if (p->activated == -1 && p->mm) { + if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + INTERACTIVE_SLEEP(p)) { + p->sleep_avg = INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; + } + } + + p->prio = effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + + recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->activated) { + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->activated = 1; + } + } + p->timestamp = now; + + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ + int need_resched, nrpolling; + + assert_spin_locked(&task_rq(p)->lock); + + /* minimise the chance of sending an interrupt to poll_idle() */ + nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); + nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + + if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) + smp_send_reschedule(task_cpu(p)); +} +#else +static inline void resched_task(task_t *p) +{ + set_tsk_need_resched(p); +} +#endif + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const task_t *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +#ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + +typedef struct { + struct list_head list; + enum request_type type; + + /* For REQ_MOVE_TASK */ + task_t *task; + int dest_cpu; + + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + + struct completion done; +} migration_req_t; + +/* + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. + */ +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) +{ + runqueue_t *rq = task_rq(p); + + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } + + init_completion(&req->done); + req->type = REQ_MOVE_TASK; + req->task = p; + req->dest_cpu = dest_cpu; + list_add(&req->list, &rq->migration_queue); + return 1; +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +void wait_task_inactive(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + int preempted; + +repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. */ + if (unlikely(p->array || task_running(rq, p))) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); + cpu_relax(); + if (preempted) + yield(); + goto repeat; + } + task_rq_unlock(rq, &flags); +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesnt have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Return a low guess at the load of a migration-source cpu. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static inline unsigned long source_load(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return min(rq->cpu_load, load_now); +} + +/* + * Return a high guess at the load of a migration-target cpu + */ +static inline unsigned long target_load(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return max(rq->cpu_load, load_now); +} + +#endif + +/* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, task_t *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; + + if (idle_cpu(cpu)) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, cpu_online_map); + cpus_and(tmp, tmp, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) + return i; + } + } + else break; + } + return cpu; +} +#else +static inline int wake_idle(int cpu, task_t *p) +{ + return cpu; +} +#endif + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int try_to_wake_up(task_t * p, unsigned int state, int sync) +{ + int cpu, this_cpu, success = 0; + unsigned long flags; + long old_state; + runqueue_t *rq; +#ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd; + int new_cpu; +#endif + + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; + +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_cnt); + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + } else { + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif + + new_cpu = cpu; + if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + load = source_load(cpu); + this_load = target_load(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) effect of + * the currently running task from the load of the current CPU: + */ + if (sync) + this_load -= SCHED_LOAD_SCALE; + + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) + goto out_set_cpu; + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + /* + * Scan domains for affine wakeup and passive balancing + * possibilities. + */ + for_each_domain(this_cpu, sd) { + unsigned int imbalance; + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + + if ((sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, sd)) { + /* + * This domain has SD_WAKE_AFFINE and p is cache cold + * in this domain. + */ + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_move_affine); + goto out_set_cpu; + } + } else if ((sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) { + /* + * This domain has SD_WAKE_BALANCE and there is + * an imbalance. + */ + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_move_balance); + goto out_set_cpu; + } + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(rq, &flags); + + return success; +} + +int fastcall wake_up_process(task_t * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} + +EXPORT_SYMBOL(wake_up_process); + +int fastcall wake_up_state(task_t *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +#ifdef CONFIG_SMP +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd); +#endif + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +void fastcall sched_fork(task_t *p) +{ + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->run_list); + p->array = NULL; + spin_lock_init(&p->switch_lock); +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#ifdef CONFIG_PREEMPT + /* + * During context-switch we hold precisely one spinlock, which + * schedule_tail drops. (in the common case it's this_rq()->lock, + * but it also can be p->switch_lock.) So we compensate with a count + * of 1. Also, we want to start with kernel preemption disabled. + */ + p->thread_info->preempt_count = 1; +#endif + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + preempt_disable(); + scheduler_tick(); + local_irq_enable(); + preempt_enable(); + } else + local_irq_enable(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + BUG_ON(p->state != TASK_RUNNING); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void fastcall sched_exit(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); + } + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); +} + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static inline void finish_task_switch(task_t *prev) + __releases(rq->lock) +{ + runqueue_t *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul <manfred@colorfullife.com> + */ + prev_task_flags = prev->flags; + finish_arch_switch(rq, prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(task_t *prev) + __releases(rq->lock) +{ + finish_task_switch(prev); + + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + unsigned long long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +#ifdef CONFIG_SMP + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + if (rq1 == rq2) { + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); + } +} + +/* + * find_idlest_cpu - find the least busy runqueue. + */ +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd) +{ + unsigned long load, min_load, this_load; + int i, min_cpu; + cpumask_t mask; + + min_cpu = UINT_MAX; + min_load = ULONG_MAX; + + cpus_and(mask, sd->span, p->cpus_allowed); + + for_each_cpu_mask(i, mask) { + load = target_load(i); + + if (load < min_load) { + min_cpu = i; + min_load = load; + + /* break out early on an idle CPU: */ + if (!min_load) + break; + } + } + + /* add +1 to account for the new task */ + this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; + + /* + * Would with the addition of the new task to the + * current CPU there be an imbalance between this + * CPU and the idlest CPU? + * + * Use half of the balancing threshold - new-context is + * a good opportunity to balance. + */ + if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) + return min_cpu; + + return this_cpu; +} + +/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +static void sched_migrate_task(task_t *p, int dest_cpu) +{ + migration_req_t req; + runqueue_t *rq; + unsigned long flags; + + rq = task_rq_lock(p, &flags); + if (!cpu_isset(dest_cpu, p->cpus_allowed) + || unlikely(cpu_is_offline(dest_cpu))) + goto out; + + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + get_task_struct(mt); + task_rq_unlock(rq, &flags); + wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + return; + } +out: + task_rq_unlock(rq, &flags); +} + +/* + * sched_exec(): find the highest-level, exec-balance-capable + * domain and try to migrate the task to the least loaded CPU. + * + * execve() is a valuable balancing opportunity, because at this point + * the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ + struct sched_domain *tmp, *sd = NULL; + int new_cpu, this_cpu = get_cpu(); + + /* Prefer the current CPU if there's only this task running */ + if (this_rq()->nr_running <= 1) + goto out; + + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_BALANCE_EXEC) + sd = tmp; + + if (sd) { + schedstat_inc(sd, sbe_attempts); + new_cpu = find_idlest_cpu(current, this_cpu, sd); + if (new_cpu != this_cpu) { + schedstat_inc(sd, sbe_pushed); + put_cpu(); + sched_migrate_task(current, new_cpu); + return; + } + } +out: + put_cpu(); +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static inline +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + + /* + * Aggressive migration if: + * 1) the [whole] cpu is idle, or + * 2) too many balance attempts have failed. + */ + + if (cpu_and_siblings_are_idle(this_cpu) || \ + sd->nr_balance_failed > sd->cache_nice_tries) + return 1; + + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + return 1; +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + /* + * Right now, this is the only place pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + return pulled; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) +{ + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + + do { + unsigned long load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + + avg_load += load; + } + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + /* How much load to actually move to equalise the imbalance */ + *imbalance = min((max_load - avg_load) * busiest->cpu_power, + (avg_load - this_load) * this->cpu_power) + / SCHED_LOAD_SCALE; + + if (*imbalance < SCHED_LOAD_SCALE) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + if (max_load*busiest->cpu_power < + SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) + tmp = max_load*busiest->cpu_power/this->cpu_power; + else + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move <= pwr_now) + goto out_balanced; + + *imbalance = 1; + return busiest; + } + + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = *imbalance / SCHED_LOAD_SCALE; + + return busiest; + +out_balanced: + if (busiest && (idle == NEWLY_IDLE || + (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { + *imbalance = 1; + return busiest; + } + + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i); + + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved; + + spin_lock(&this_rq->lock); + schedstat_inc(sd, lb_cnt[idle]); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle); + spin_unlock(&busiest->lock); + } + spin_unlock(&this_rq->lock); + + if (!nr_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries; + } + + /* + * We were unbalanced, but unsuccessful in move_tasks(), + * so bump the balance_interval to lessen the lock contention. + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval++; + } else { + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } + + return nr_moved; + +out_balanced: + spin_unlock(&this_rq->lock); + + schedstat_inc(sd, lb_balanced[idle]); + + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. + */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out; + } + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); + if (!nr_moved) + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + } +} + +/* + * active_load_balance is run by migration threads. It pushes running tasks + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be + * running on each physical CPU where possible, and avoids physical / + * logical imbalances. + * + * Called with busiest_rq locked. + */ +static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) +{ + struct sched_domain *sd; + struct sched_group *cpu_group; + runqueue_t *target_rq; + cpumask_t visited_cpus; + int cpu; + + /* + * Search for suitable CPUs to push tasks to in successively higher + * domains with SD_LOAD_BALANCE set. + */ + visited_cpus = CPU_MASK_NONE; + for_each_domain(busiest_cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + /* no more domains to search */ + break; + + schedstat_inc(sd, alb_cnt); + + cpu_group = sd->groups; + do { + for_each_cpu_mask(cpu, cpu_group->cpumask) { + if (busiest_rq->nr_running <= 1) + /* no more tasks left to move */ + return; + if (cpu_isset(cpu, visited_cpus)) + continue; + cpu_set(cpu, visited_cpus); + if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) + continue; + + target_rq = cpu_rq(cpu); + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + if (move_tasks(target_rq, cpu, busiest_rq, + 1, sd, SCHED_IDLE)) { + schedstat_inc(sd, alb_pushed); + } else { + schedstat_inc(sd, alb_failed); + } + spin_unlock(&target_rq->lock); + } + cpu_group = cpu_group->next; + } while (cpu_group != sd->groups); + } +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) +{ + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; + + /* Update our load */ + old_load = this_rq->cpu_load; + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (this_load > old_load) + old_load++; + this_rq->cpu_load = (old_load + this_load) / 2; + + for_each_domain(this_cpu, sd) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != SCHED_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static inline int wake_priority_sleeper(runqueue_t *rq) +{ + int ret = 0; +#ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + ret = 1; + } + spin_unlock(&rq->lock); +#endif + return ret; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void update_cpu_clock(task_t *p, runqueue_t *rq, + unsigned long long now) +{ + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - last; +} + +/* + * Return current->sched_time plus any more ns on the sched_clock + * that have not yet been banked. + */ +unsigned long long current_sched_time(const task_t *tsk) +{ + unsigned long long ns; + unsigned long flags; + local_irq_save(flags); + ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); + ns = tsk->sched_time + (sched_clock() - ns); + local_irq_restore(flags); + return ns; +} + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in user space since the last update + */ +void account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (p != rq->idle) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + /* Account for system time used */ + acct_update_integrals(p); + /* Update rss highwater mark */ + update_mem_hiwater(p); +} + +/* + * Account for involuntary wait time. + * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp = cputime_to_cputime64(steal); + runqueue_t *rq = this_rq(); + + if (p == rq->idle) { + p->stime = cputime_add(p->stime, steal); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + } else + cpustat->steal = cputime64_add(cpustat->steal, tmp); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + unsigned long long now = sched_clock(); + + update_cpu_clock(p, rq, now); + + rq->timestamp_last_tick = now; + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->active); + } + goto out_unlock; + } + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + requeue_task(p, rq->active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + int i; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return; + + /* + * Unlock the current runqueue because we have to lock in + * CPU order to avoid deadlocks. Caller knows that we might + * unlock. We keep IRQs disabled. + */ + spin_unlock(&this_rq->lock); + + sibling_map = sd->span; + + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + /* + * We clear this CPU from the mask. This both simplifies the + * inner loop and keps this_rq locked when we exit: + */ + cpu_clear(this_cpu, sibling_map); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. + */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } + + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + /* + * We exit with this_cpu's rq still held and IRQs + * still disabled: + */ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + prio_array_t *array; + int ret = 0, i; + task_t *p; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + /* + * The same locking rules and details apply as for + * wake_sleeping_dependent(): + */ + spin_unlock(&this_rq->lock); + sibling_map = sd->span; + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + cpu_clear(this_cpu, sibling_map); + + /* + * Establish next task to be run - it might have gone away because + * we released the runqueue lock above: + */ + if (!this_rq->nr_running) + goto out_unlock; + array = this_rq->active; + if (!array->nr_active) + array = this_rq->expired; + BUG_ON(!array->nr_active); + + p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + task_t *smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret = 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } +out_unlock: + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + return ret; +} +#else +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + return 0; +} +#endif + +#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + long *switch_count; + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (likely(!current->exit_state)) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); +need_resched_nonpreemptible: + rq = this_rq(); + + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. + */ + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); + if (likely((long long)now - prev->timestamp < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)now - prev->timestamp < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status + */ + run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + + if (unlikely(prev->flags & PF_DEAD)) + prev->state = EXIT_DEAD; + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + + if (!rt_task(next) && next->activated > 0) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)now - next->timestamp < 0)) + delta = 0; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->activated = 0; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0) + prev->sleep_avg = 0; + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_PREEMPT +/* + * this is is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + schedule(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* Catch callers which need to be fixed*/ + BUG_ON(ti->preempt_count || !irqs_disabled()); + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + local_irq_enable(); + schedule(); + local_irq_disable(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +{ + task_t *p = curr->task; + return try_to_wake_up(p, mode, sync); +} + +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, &q->task_list) { + wait_queue_t *curr; + unsigned flags; + curr = list_entry(tmp, wait_queue_t, task_list); + flags = curr->flags; + if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + */ +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up - sync- wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + */ +void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +void fastcall complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +void fastcall complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +void fastcall __sched wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} +EXPORT_SYMBOL(wait_for_completion); + +unsigned long fastcall __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + if (!timeout) { + __remove_wait_queue(&x->wait, &wait); + goto out; + } + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +int fastcall __sched wait_for_completion_interruptible(struct completion *x) +{ + int ret = 0; + + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + __remove_wait_queue(&x->wait, &wait); + goto out; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + + return ret; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + timeout = -ERESTARTSYS; + __remove_wait_queue(&x->wait, &wait); + goto out; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + if (!timeout) { + __remove_wait_queue(&x->wait, &wait); + goto out; + } + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; + + if (array) { + enqueue_task(p, array); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + +EXPORT_SYMBOL(set_user_nice); + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +asmlinkage long sys_nice(int increment) +{ + int retval; + long nice; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < 0) { + if (!capable(CAP_SYS_NICE)) + return -EPERM; + if (increment < -40) + increment = -40; + } + if (increment > 40) + increment = 40; + + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(const task_t *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const task_t *p) +{ + return TASK_NICE(p); +} + +/* + * The only users of task_nice are binfmt_elf and binfmt_elf32. + * binfmt_elf is no longer modular, but binfmt_elf32 still is. + * Therefore, task_nice is needed if there is a compat_mode. + */ +#ifdef CONFIG_COMPAT +EXPORT_SYMBOL_GPL(task_nice); +#endif + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +EXPORT_SYMBOL_GPL(idle_cpu); + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +task_t *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + BUG_ON(p->array); + p->policy = policy; + p->rt_priority = prio; + if (policy != SCHED_NORMAL) + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + p->prio = p->static_prio; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of + * a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +{ + int retval; + int oldprio, oldpolicy = -1; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + if (param->sched_priority < 0 || + param->sched_priority > MAX_USER_RT_PRIO-1) + return -EINVAL; + if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) + return -EINVAL; + + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + return -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + return -EPERM; + + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, &flags); + goto recheck; + } + array = p->array; + if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); + if (array) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); + return 0; +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + int retval; + struct sched_param lparam; + struct task_struct *p; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + read_lock_irq(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) { + read_unlock_irq(&tasklist_lock); + return -ESRCH; + } + retval = sched_setscheduler(p, policy, &lparam); + read_unlock_irq(&tasklist_lock); + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param __user *param) +{ + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +{ + return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +asmlinkage long sys_sched_getscheduler(pid_t pid) +{ + int retval = -EINVAL; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_getscheduler - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +long sched_setaffinity(pid_t pid, cpumask_t new_mask) +{ + task_t *p; + int retval; + cpumask_t cpus_allowed; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + cpus_allowed = cpuset_cpus_allowed(p); + cpus_and(new_mask, new_mask, cpus_allowed); + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + put_task_struct(p); + unlock_cpu_hotplug(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ + +cpumask_t cpu_present_map; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP +cpumask_t cpu_online_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#endif + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + int retval; + task_t *p; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(*mask, p->cpus_allowed, cpu_possible_map); + +out_unlock: + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + if (retval) + return retval; + + return 0; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + + if (len < sizeof(cpumask_t)) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) + return -EFAULT; + + return sizeof(cpumask_t); +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ +asmlinkage long sys_sched_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->array; + prio_array_t *target = rq->expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) + */ + if (rt_task(current)) + target = rq->active; + + if (current->array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static inline void __cond_resched(void) +{ + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched cond_resched(void) +{ + if (need_resched()) { + __cond_resched(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int cond_resched_lock(spinlock_t * lock) +{ + if (need_lockbreak(lock)) { + spin_unlock(lock); + cpu_relax(); + spin_lock(lock); + } + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched()) { + __local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_softirq); + + +/** + * yield - yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} + +EXPORT_SYMBOL(yield); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void __sched io_schedule(void) +{ + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_max(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_min(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +asmlinkage +long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + +static void show_task(task_t * p) +{ + task_t *relative; + unsigned state; + unsigned long free = 0; + static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + + printk("%-13.13s ", p->comm); + state = p->state ? __ffs(p->state) + 1 : 0; + if (state < ARRAY_SIZE(stat_nam)) + printk(stat_nam[state]); + else + printk("?"); +#if (BITS_PER_LONG == 32) + if (state == TASK_RUNNING) + printk(" running "); + else + printk(" %08lX ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(" running task "); + else + printk(" %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long * n = (unsigned long *) (p->thread_info+1); + while (!*n) + n++; + free = (unsigned long) n - (unsigned long)(p->thread_info+1); + } +#endif + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); + else + printk(" "); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); + else + printk(" "); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); + else + printk(" "); + if (!p->mm) + printk(" (L-TLB)\n"); + else + printk(" (NOTLB)\n"); + + if (state != TASK_RUNNING) + show_stack(p, NULL); +} + +void show_state(void) +{ + task_t *g, *p; + +#if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#else + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + show_task(p); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); +} + +void __devinit init_idle(task_t *idle, int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long flags; + + idle->sleep_avg = 0; + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; + set_tsk_need_resched(idle); + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For system + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_MASK_NONE. + */ +cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + unsigned long flags; + int ret = 0; + migration_req_t req; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + if (!cpus_intersects(new_mask, cpu_online_map)) { + ret = -EINVAL; + goto out; + } + + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, &flags); + return ret; +} + +EXPORT_SYMBOL_GPL(set_cpus_allowed); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + runqueue_t *rq_dest, *rq_src; + + if (unlikely(cpu_is_offline(dest_cpu))) + return; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; + + set_task_cpu(p, dest_cpu); + if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); + } + +out: + double_rq_unlock(rq_src, rq_dest); +} + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. + */ +static int migration_thread(void * data) +{ + runqueue_t *rq; + int cpu = (long)data; + + rq = cpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + struct list_head *head; + migration_req_t *req; + + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + + spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + + head = &rq->migration_queue; + + if (list_empty(head)) { + spin_unlock_irq(&rq->lock); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + + if (req->type == REQ_MOVE_TASK) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + spin_unlock_irq(&rq->lock); + } else { + spin_unlock_irq(&rq->lock); + WARN_ON(1); + } + + complete(&req->done); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* Figure out where task on dead CPU should go, use force if neccessary. */ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) +{ + int dest_cpu; + cpumask_t mask; + + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (tsk->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, dead_cpu); + } + __migrate_task(tsk, dead_cpu, dest_cpu); +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not stricly tracking tasks to + * their home CPUs. So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(runqueue_t *rq_src) +{ + runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; + rq_src->nr_uninterruptible = 0; + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *tsk, *t; + + write_lock_irq(&tasklist_lock); + + do_each_thread(t, tsk) { + if (tsk == current) + continue; + + if (task_cpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); + } while_each_thread(t, tsk); + + write_unlock_irq(&tasklist_lock); +} + +/* Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible and adding it to + * the _front_ of runqueue. Used by CPU offline code. + */ +void sched_idle_next(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(cpu)); + + /* Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on current cpu. + */ + spin_lock_irqsave(&rq->lock, flags); + + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(p, rq); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +/* Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +{ + struct runqueue *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(tsk->flags & PF_DEAD); + + get_task_struct(tsk); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, tsk); + spin_lock_irq(&rq->lock); + + put_task_struct(tsk); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int migration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct runqueue *rq; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it. */ + rq = task_rq_lock(p, &flags); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + break; + case CPU_DEAD: + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); + + /* No need to migrate the tasks: it was best-effort if + * they didn't do lock_cpu_hotplug(). Just wake up + * the requestors. */ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + migration_req_t *req; + req = list_entry(rq->migration_queue.next, + migration_req_t, list); + BUG_ON(req->type != REQ_MOVE_TASK); + list_del_init(&req->list); + complete(&req->done); + } + spin_unlock_irq(&rq->lock); + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = 10 +}; + +int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + return 0; +} +#endif + +#ifdef CONFIG_SMP +#define SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + do { + int i; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (i = 0; i < level + 1; i++) + printk(" "); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + break; + } + + printk("span %s\n", str); + + if (!cpu_isset(cpu, sd->span)) + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + if (!cpu_isset(cpu, group->cpumask)) + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + + printk(KERN_DEBUG); + for (i = 0; i < level + 2; i++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->cpu_power) { + printk("\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + } + + if (!cpus_weight(group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: empty group\n"); + } + + if (cpus_intersects(groupmask, group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + } + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); + } + + } while (sd); +} +#else +#define sched_domain_debug(sd, cpu) {} +#endif + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + sched_domain_debug(sd, cpu); + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } +} + +/* cpus with isolated domains */ +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ints[NR_CPUS], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + cpus_clear(cpu_isolated_map); + for (i = 1; i <= ints[0]; i++) + if (ints[i] < NR_CPUS) + cpu_set(ints[i], cpu_isolated_map); + return 1; +} + +__setup ("isolcpus=", isolated_cpu_setup); + +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + */ +void __devinit init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; + + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; + int j; + + if (cpu_isset(i, covered)) + continue; + + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; + + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; + + cpu_set(j, covered); + cpu_set(j, sg->cpumask); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} + + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); +#else +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static int __devinit cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) +/* + * The domains setup code relies on siblings not spanning + * multiple nodes. Make sure the architecture has a proper + * siblings map: + */ +static void check_sibling_maps(void) +{ + int i, j; + + for_each_online_cpu(i) { + for_each_cpu_mask(j, cpu_sibling_map[i]) { + if (cpu_to_node(i) != cpu_to_node(j)) { + printk(KERN_INFO "warning: CPU %d siblings map " + "to different node - isolating " + "them.\n", i); + cpu_sibling_map[i] = cpumask_of_cpu(i); + break; + } + } + } +} +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void __devinit arch_init_sched_domains(void) +{ + int i; + cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); + + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, cpu_default_map); + +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + group = cpu_to_node_group(i); + *sd = SD_NODE_INIT; + sd->span = cpu_default_map; + sd->groups = &sched_group_nodes[group]; +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, cpu_default_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_online_cpu(i) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } + +#ifdef CONFIG_NUMA + /* Set up node groups */ + init_sched_build_groups(sched_group_nodes, cpu_default_map, + &cpu_to_node_group); +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, cpu_default_map) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + if (i == first_cpu(sd->groups->cpumask)) { + /* Only add "power" once for each physical package. */ + sd = &per_cpu(node_domains, i); + sd->groups->cpu_power += power; + } +#endif + } + + /* Attach the domains */ + for_each_online_cpu(i) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit arch_destroy_sched_domains(void) +{ + /* Do nothing: everything is statically allocated. */ +} +#endif + +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +/* + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. + */ +static struct sched_domain sched_domain_dummy; + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to a "dummy" domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int i; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. + */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(); + + return NOTIFY_OK; +} +#endif + +void __init sched_init_smp(void) +{ + lock_cpu_hotplug(); + arch_init_sched_domains(); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +void __init sched_init(void) +{ + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; + +#ifdef CONFIG_SMP + rq->sd = &sched_domain_dummy; + rq->cpu_load = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); +#endif + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". + */ + init_idle(current, smp_processor_id()); +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +void normalize_rt_tasks(void) +{ + struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p)) + continue; + + rq = task_rq_lock(p, &flags); + + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); + } + read_unlock_irq(&tasklist_lock); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c new file mode 100644 index 00000000000..c3391b6020e --- /dev/null +++ b/kernel/seccomp.c @@ -0,0 +1,56 @@ +/* + * linux/kernel/seccomp.c + * + * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> + * + * This defines a simple but solid secure-computing mode. + */ + +#include <linux/seccomp.h> +#include <linux/sched.h> + +/* #define SECCOMP_DEBUG 1 */ + +/* + * Secure computing mode 1 allows only read/write/exit/sigreturn. + * To be fully secure this must be combined with rlimit + * to limit the stack allocations too. + */ +static int mode1_syscalls[] = { + __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, + 0, /* null terminated */ +}; + +#ifdef TIF_32BIT +static int mode1_syscalls_32[] = { + __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, + 0, /* null terminated */ +}; +#endif + +void __secure_computing(int this_syscall) +{ + int mode = current->seccomp.mode; + int * syscall; + + switch (mode) { + case 1: + syscall = mode1_syscalls; +#ifdef TIF_32BIT + if (test_thread_flag(TIF_32BIT)) + syscall = mode1_syscalls_32; +#endif + do { + if (*syscall == this_syscall) + return; + } while (*++syscall); + break; + default: + BUG(); + } + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + do_exit(SIGKILL); +} diff --git a/kernel/signal.c b/kernel/signal.c new file mode 100644 index 00000000000..f00a1d610f0 --- /dev/null +++ b/kernel/signal.c @@ -0,0 +1,2662 @@ +/* + * linux/kernel/signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson + * + * 2003-06-02 Jim Houston - Concurrent Computer Corp. + * Changes to use preallocated sigqueue structures + * to allow signals to be sent reliably. + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/tty.h> +#include <linux/binfmts.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/ptrace.h> +#include <linux/posix-timers.h> +#include <asm/param.h> +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/siginfo.h> + +/* + * SLAB caches for signal bits. + */ + +static kmem_cache_t *sigqueue_cachep; + +/* + * In POSIX a signal is sent either to a specific thread (Linux task) + * or to the process as a whole (Linux thread group). How the signal + * is sent determines whether it's to one thread or the whole group, + * which determines which signal mask(s) are involved in blocking it + * from being delivered until later. When the signal is delivered, + * either it's caught or ignored by a user handler or it has a default + * effect that applies to the whole thread group (POSIX process). + * + * The possible effects an unblocked signal set to SIG_DFL can have are: + * ignore - Nothing Happens + * terminate - kill the process, i.e. all threads in the group, + * similar to exit_group. The group leader (only) reports + * WIFSIGNALED status to its parent. + * coredump - write a core dump file describing all threads using + * the same mm and then kill all those threads + * stop - stop all the threads in the group, i.e. TASK_STOPPED state + * + * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. + * Other signals when not blocked and set to SIG_DFL behaves as follows. + * The job control signals also have other special effects. + * + * +--------------------+------------------+ + * | POSIX signal | default action | + * +--------------------+------------------+ + * | SIGHUP | terminate | + * | SIGINT | terminate | + * | SIGQUIT | coredump | + * | SIGILL | coredump | + * | SIGTRAP | coredump | + * | SIGABRT/SIGIOT | coredump | + * | SIGBUS | coredump | + * | SIGFPE | coredump | + * | SIGKILL | terminate(+) | + * | SIGUSR1 | terminate | + * | SIGSEGV | coredump | + * | SIGUSR2 | terminate | + * | SIGPIPE | terminate | + * | SIGALRM | terminate | + * | SIGTERM | terminate | + * | SIGCHLD | ignore | + * | SIGCONT | ignore(*) | + * | SIGSTOP | stop(*)(+) | + * | SIGTSTP | stop(*) | + * | SIGTTIN | stop(*) | + * | SIGTTOU | stop(*) | + * | SIGURG | ignore | + * | SIGXCPU | coredump | + * | SIGXFSZ | coredump | + * | SIGVTALRM | terminate | + * | SIGPROF | terminate | + * | SIGPOLL/SIGIO | terminate | + * | SIGSYS/SIGUNUSED | coredump | + * | SIGSTKFLT | terminate | + * | SIGWINCH | ignore | + * | SIGPWR | terminate | + * | SIGRTMIN-SIGRTMAX | terminate | + * +--------------------+------------------+ + * | non-POSIX signal | default action | + * +--------------------+------------------+ + * | SIGEMT | coredump | + * +--------------------+------------------+ + * + * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". + * (*) Special job control effects: + * When SIGCONT is sent, it resumes the process (all threads in the group) + * from TASK_STOPPED state and also clears any pending/queued stop signals + * (any of those marked with "stop(*)"). This happens regardless of blocking, + * catching, or ignoring SIGCONT. When any stop signal is sent, it clears + * any pending/queued SIGCONT signals; this happens regardless of blocking, + * catching, or ignored the stop signal, though (except for SIGSTOP) the + * default action of stopping the process may happen later or never. + */ + +#ifdef SIGEMT +#define M_SIGEMT M(SIGEMT) +#else +#define M_SIGEMT 0 +#endif + +#if SIGRTMIN > BITS_PER_LONG +#define M(sig) (1ULL << ((sig)-1)) +#else +#define M(sig) (1UL << ((sig)-1)) +#endif +#define T(sig, mask) (M(sig) & (mask)) + +#define SIG_KERNEL_ONLY_MASK (\ + M(SIGKILL) | M(SIGSTOP) ) + +#define SIG_KERNEL_STOP_MASK (\ + M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) ) + +#define SIG_KERNEL_COREDUMP_MASK (\ + M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \ + M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \ + M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT ) + +#define SIG_KERNEL_IGNORE_MASK (\ + M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) ) + +#define sig_kernel_only(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK)) +#define sig_kernel_coredump(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK)) +#define sig_kernel_ignore(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK)) +#define sig_kernel_stop(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) + +#define sig_user_defined(t, signr) \ + (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ + ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) + +#define sig_fatal(t, signr) \ + (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ + (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) + +static int sig_ignored(struct task_struct *t, int sig) +{ + void __user * handler; + + /* + * Tracers always want to know about signals.. + */ + if (t->ptrace & PT_PTRACED) + return 0; + + /* + * Blocked signals are never ignored, since the + * signal handler may change by the time it is + * unblocked. + */ + if (sigismember(&t->blocked, sig)) + return 0; + + /* Is it explicitly or implicitly ignored? */ + handler = t->sighand->action[sig-1].sa.sa_handler; + return handler == SIG_IGN || + (handler == SIG_DFL && sig_kernel_ignore(sig)); +} + +/* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. + */ +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +{ + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; +} + +#define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) + +fastcall void recalc_sigpending_tsk(struct task_struct *t) +{ + if (t->signal->group_stop_count > 0 || + PENDING(&t->pending, &t->blocked) || + PENDING(&t->signal->shared_pending, &t->blocked)) + set_tsk_thread_flag(t, TIF_SIGPENDING); + else + clear_tsk_thread_flag(t, TIF_SIGPENDING); +} + +void recalc_sigpending(void) +{ + recalc_sigpending_tsk(current); +} + +/* Given the mask, find the first available signal that should be serviced. */ + +static int +next_signal(struct sigpending *pending, sigset_t *mask) +{ + unsigned long i, *s, *m, x; + int sig = 0; + + s = pending->signal.sig; + m = mask->sig; + switch (_NSIG_WORDS) { + default: + for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) + if ((x = *s &~ *m) != 0) { + sig = ffz(~x) + i*_NSIG_BPW + 1; + break; + } + break; + + case 2: if ((x = s[0] &~ m[0]) != 0) + sig = 1; + else if ((x = s[1] &~ m[1]) != 0) + sig = _NSIG_BPW + 1; + else + break; + sig += ffz(~x); + break; + + case 1: if ((x = *s &~ *m) != 0) + sig = ffz(~x) + 1; + break; + } + + return sig; +} + +static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, + int override_rlimit) +{ + struct sigqueue *q = NULL; + + atomic_inc(&t->user->sigpending); + if (override_rlimit || + atomic_read(&t->user->sigpending) <= + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + q = kmem_cache_alloc(sigqueue_cachep, flags); + if (unlikely(q == NULL)) { + atomic_dec(&t->user->sigpending); + } else { + INIT_LIST_HEAD(&q->list); + q->flags = 0; + q->lock = NULL; + q->user = get_uid(t->user); + } + return(q); +} + +static inline void __sigqueue_free(struct sigqueue *q) +{ + if (q->flags & SIGQUEUE_PREALLOC) + return; + atomic_dec(&q->user->sigpending); + free_uid(q->user); + kmem_cache_free(sigqueue_cachep, q); +} + +static void flush_sigqueue(struct sigpending *queue) +{ + struct sigqueue *q; + + sigemptyset(&queue->signal); + while (!list_empty(&queue->list)) { + q = list_entry(queue->list.next, struct sigqueue , list); + list_del_init(&q->list); + __sigqueue_free(q); + } +} + +/* + * Flush all pending signals for a task. + */ + +void +flush_signals(struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&t->sighand->siglock, flags); + clear_tsk_thread_flag(t,TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); + spin_unlock_irqrestore(&t->sighand->siglock, flags); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +void __exit_sighand(struct task_struct *tsk) +{ + struct sighand_struct * sighand = tsk->sighand; + + /* Ok, we're done with the signal handlers */ + tsk->sighand = NULL; + if (atomic_dec_and_test(&sighand->count)) + kmem_cache_free(sighand_cachep, sighand); +} + +void exit_sighand(struct task_struct *tsk) +{ + write_lock_irq(&tasklist_lock); + __exit_sighand(tsk); + write_unlock_irq(&tasklist_lock); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct * sig = tsk->signal; + struct sighand_struct * sighand = tsk->sighand; + + if (!sig) + BUG(); + if (!atomic_read(&sig->count)) + BUG(); + spin_lock(&sighand->siglock); + posix_cpu_timers_exit(tsk); + if (atomic_dec_and_test(&sig->count)) { + posix_cpu_timers_exit_group(tsk); + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + tsk->signal = NULL; + spin_unlock(&sighand->siglock); + flush_sigqueue(&sig->shared_pending); + } else { + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { + wake_up_process(sig->group_exit_task); + sig->group_exit_task = NULL; + } + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + tsk->signal = NULL; + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->sched_time += tsk->sched_time; + spin_unlock(&sighand->siglock); + sig = NULL; /* Marker for below. */ + } + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + flush_sigqueue(&tsk->pending); + if (sig) { + /* + * We are cleaning up the signal_struct here. We delayed + * calling exit_itimers until after flush_sigqueue, just in + * case our thread-local pending queue contained a queued + * timer signal that would have been cleared in + * exit_itimers. When that called sigqueue_free, it would + * attempt to re-take the tasklist_lock and deadlock. This + * can never happen if we ensure that all queues the + * timer's signal might be queued on have been flushed + * first. The shared_pending queue, and our own pending + * queue are the only queues the timer could be on, since + * there are no other threads left in the group and timer + * signals are constrained to threads inside the group. + */ + exit_itimers(sig); + exit_thread_group_keys(sig); + kmem_cache_free(signal_cachep, sig); + } +} + +void exit_signal(struct task_struct *tsk) +{ + write_lock_irq(&tasklist_lock); + __exit_signal(tsk); + write_unlock_irq(&tasklist_lock); +} + +/* + * Flush all handlers for a task. + */ + +void +flush_signal_handlers(struct task_struct *t, int force_default) +{ + int i; + struct k_sigaction *ka = &t->sighand->action[0]; + for (i = _NSIG ; i != 0 ; i--) { + if (force_default || ka->sa.sa_handler != SIG_IGN) + ka->sa.sa_handler = SIG_DFL; + ka->sa.sa_flags = 0; + sigemptyset(&ka->sa.sa_mask); + ka++; + } +} + + +/* Notify the system that a driver wants to block all signals for this + * process, and wants to be notified if any signals at all were to be + * sent/acted upon. If the notifier routine returns non-zero, then the + * signal will be acted upon after all. If the notifier routine returns 0, + * then then signal will be blocked. Only one block per process is + * allowed. priv is a pointer to private data that the notifier routine + * can use to determine if the signal should be blocked or not. */ + +void +block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->notifier_mask = mask; + current->notifier_data = priv; + current->notifier = notifier; + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} + +/* Notify the system that blocking has ended. */ + +void +unblock_all_signals(void) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->notifier = NULL; + current->notifier_data = NULL; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} + +static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info) +{ + struct sigqueue *q, *first = NULL; + int still_pending = 0; + + if (unlikely(!sigismember(&list->signal, sig))) + return 0; + + /* + * Collect the siginfo appropriate to this signal. Check if + * there is another siginfo for the same signal. + */ + list_for_each_entry(q, &list->list, list) { + if (q->info.si_signo == sig) { + if (first) { + still_pending = 1; + break; + } + first = q; + } + } + if (first) { + list_del_init(&first->list); + copy_siginfo(info, &first->info); + __sigqueue_free(first); + if (!still_pending) + sigdelset(&list->signal, sig); + } else { + + /* Ok, it wasn't in the queue. This must be + a fast-pathed signal or we must have been + out of queue space. So zero out the info. + */ + sigdelset(&list->signal, sig); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = 0; + info->si_pid = 0; + info->si_uid = 0; + } + return 1; +} + +static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, + siginfo_t *info) +{ + int sig = 0; + + sig = next_signal(pending, mask); + if (sig) { + if (current->notifier) { + if (sigismember(current->notifier_mask, sig)) { + if (!(current->notifier)(current->notifier_data)) { + clear_thread_flag(TIF_SIGPENDING); + return 0; + } + } + } + + if (!collect_signal(sig, pending, info)) + sig = 0; + + } + recalc_sigpending(); + + return sig; +} + +/* + * Dequeue a signal and return the element to the caller, which is + * expected to free it. + * + * All callers have to hold the siglock. + */ +int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +{ + int signr = __dequeue_signal(&tsk->pending, mask, info); + if (!signr) + signr = __dequeue_signal(&tsk->signal->shared_pending, + mask, info); + if (signr && unlikely(sig_kernel_stop(signr))) { + /* + * Set a marker that we have dequeued a stop signal. Our + * caller might release the siglock and then the pending + * stop signal it is about to process is no longer in the + * pending bitmasks, but must still be cleared by a SIGCONT + * (and overruled by a SIGKILL). So those cases clear this + * shared flag after we've set it. Note that this flag may + * remain set after the signal we return is ignored or + * handled. That doesn't matter because its only purpose + * is to alert stop-signal processing code when another + * processor has come along and cleared the flag. + */ + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + } + if ( signr && + ((info->si_code & __SI_MASK) == __SI_TIMER) && + info->si_sys_private){ + /* + * Release the siglock to ensure proper locking order + * of timer locks outside of siglocks. Note, we leave + * irqs disabled here, since the posix-timers code is + * about to disable them again anyway. + */ + spin_unlock(&tsk->sighand->siglock); + do_schedule_next_timer(info); + spin_lock(&tsk->sighand->siglock); + } + return signr; +} + +/* + * Tell a process that it has a new active signal.. + * + * NOTE! we rely on the previous spin_lock to + * lock interrupts for us! We can only be called with + * "siglock" held, and the local interrupt must + * have been disabled when that got acquired! + * + * No need to set need_resched since signal event passing + * goes through ->blocked + */ +void signal_wake_up(struct task_struct *t, int resume) +{ + unsigned int mask; + + set_tsk_thread_flag(t, TIF_SIGPENDING); + + /* + * For SIGKILL, we want to wake it up in the stopped/traced case. + * We don't check t->state here because there is a race with it + * executing another processor and just now entering stopped state. + * By using wake_up_state, we ensure the process will wake up and + * handle its death signal. + */ + mask = TASK_INTERRUPTIBLE; + if (resume) + mask |= TASK_STOPPED | TASK_TRACED; + if (!wake_up_state(t, mask)) + kick_process(t); +} + +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. + */ +static int rm_from_queue(unsigned long mask, struct sigpending *s) +{ + struct sigqueue *q, *n; + + if (!sigtestsetmask(&s->signal, mask)) + return 0; + + sigdelsetmask(&s->signal, mask); + list_for_each_entry_safe(q, n, &s->list, list) { + if (q->info.si_signo < SIGRTMIN && + (mask & sigmask(q->info.si_signo))) { + list_del_init(&q->list); + __sigqueue_free(q); + } + } + return 1; +} + +/* + * Bad permissions for sending the signal + */ +static int check_kill_permission(int sig, struct siginfo *info, + struct task_struct *t) +{ + int error = -EINVAL; + if (sig < 0 || sig > _NSIG) + return error; + error = -EPERM; + if ((!info || ((unsigned long)info != 1 && + (unsigned long)info != 2 && SI_FROMUSER(info))) + && ((sig != SIGCONT) || + (current->signal->session != t->signal->session)) + && (current->euid ^ t->suid) && (current->euid ^ t->uid) + && (current->uid ^ t->suid) && (current->uid ^ t->uid) + && !capable(CAP_KILL)) + return error; + return security_task_kill(t, info, sig); +} + +/* forward decl */ +static void do_notify_parent_cldstop(struct task_struct *tsk, + struct task_struct *parent, + int why); + +/* + * Handle magic process-wide effects of stop/continue signals. + * Unlike the signal actions, these happen immediately at signal-generation + * time regardless of blocking, ignoring, or handling. This does the + * actual continuing for SIGCONT, but not the actual stopping for stop + * signals. The process stop is done as a signal action for SIG_DFL. + */ +static void handle_stop_signal(int sig, struct task_struct *p) +{ + struct task_struct *t; + + if (p->flags & SIGNAL_GROUP_EXIT) + /* + * The process is in the middle of dying already. + */ + return; + + if (sig_kernel_stop(sig)) { + /* + * This is a stop signal. Remove SIGCONT from all queues. + */ + rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); + t = p; + do { + rm_from_queue(sigmask(SIGCONT), &t->pending); + t = next_thread(t); + } while (t != p); + } else if (sig == SIGCONT) { + /* + * Remove all stop signals from all queues, + * and wake all threads. + */ + if (unlikely(p->signal->group_stop_count > 0)) { + /* + * There was a group stop in progress. We'll + * pretend it finished before we got here. We are + * obliged to report it to the parent: if the + * SIGSTOP happened "after" this SIGCONT, then it + * would have cleared this pending SIGCONT. If it + * happened "before" this SIGCONT, then the parent + * got the SIGCHLD about the stop finishing before + * the continue happened. We do the notification + * now, and it's as if the stop had finished and + * the SIGCHLD was pending on entry to this kill. + */ + p->signal->group_stop_count = 0; + p->signal->flags = SIGNAL_STOP_CONTINUED; + spin_unlock(&p->sighand->siglock); + if (p->ptrace & PT_PTRACED) + do_notify_parent_cldstop(p, p->parent, + CLD_STOPPED); + else + do_notify_parent_cldstop( + p->group_leader, + p->group_leader->real_parent, + CLD_STOPPED); + spin_lock(&p->sighand->siglock); + } + rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); + t = p; + do { + unsigned int state; + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + + /* + * If there is a handler for SIGCONT, we must make + * sure that no thread returns to user mode before + * we post the signal, in case it was the only + * thread eligible to run the signal handler--then + * it must not do anything between resuming and + * running the handler. With the TIF_SIGPENDING + * flag set, the thread will pause and acquire the + * siglock that we hold now and until we've queued + * the pending signal. + * + * Wake up the stopped thread _after_ setting + * TIF_SIGPENDING + */ + state = TASK_STOPPED; + if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { + set_tsk_thread_flag(t, TIF_SIGPENDING); + state |= TASK_INTERRUPTIBLE; + } + wake_up_state(t, state); + + t = next_thread(t); + } while (t != p); + + if (p->signal->flags & SIGNAL_STOP_STOPPED) { + /* + * We were in fact stopped, and are now continued. + * Notify the parent with CLD_CONTINUED. + */ + p->signal->flags = SIGNAL_STOP_CONTINUED; + p->signal->group_exit_code = 0; + spin_unlock(&p->sighand->siglock); + if (p->ptrace & PT_PTRACED) + do_notify_parent_cldstop(p, p->parent, + CLD_CONTINUED); + else + do_notify_parent_cldstop( + p->group_leader, + p->group_leader->real_parent, + CLD_CONTINUED); + spin_lock(&p->sighand->siglock); + } else { + /* + * We are not stopped, but there could be a stop + * signal in the middle of being processed after + * being removed from the queue. Clear that too. + */ + p->signal->flags = 0; + } + } else if (sig == SIGKILL) { + /* + * Make sure that any pending stop signal already dequeued + * is undone by the wakeup for SIGKILL. + */ + p->signal->flags = 0; + } +} + +static int send_signal(int sig, struct siginfo *info, struct task_struct *t, + struct sigpending *signals) +{ + struct sigqueue * q = NULL; + int ret = 0; + + /* + * fast-pathed signals for kernel-internal things like SIGSTOP + * or SIGKILL. + */ + if ((unsigned long)info == 2) + goto out_set; + + /* Real-time signals must be queued if sent by sigqueue, or + some other real-time mechanism. It is implementation + defined whether kill() does so. We attempt to do so, on + the principle of least surprise, but since kill is not + allowed to fail with EAGAIN when low on memory we just + make sure at least one signal gets delivered and don't + pass on the info struct. */ + + q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && + ((unsigned long) info < 2 || + info->si_code >= 0))); + if (q) { + list_add_tail(&q->list, &signals->list); + switch ((unsigned long) info) { + case 0: + q->info.si_signo = sig; + q->info.si_errno = 0; + q->info.si_code = SI_USER; + q->info.si_pid = current->pid; + q->info.si_uid = current->uid; + break; + case 1: + q->info.si_signo = sig; + q->info.si_errno = 0; + q->info.si_code = SI_KERNEL; + q->info.si_pid = 0; + q->info.si_uid = 0; + break; + default: + copy_siginfo(&q->info, info); + break; + } + } else { + if (sig >= SIGRTMIN && info && (unsigned long)info != 1 + && info->si_code != SI_USER) + /* + * Queue overflow, abort. We may abort if the signal was rt + * and sent by user using something other than kill(). + */ + return -EAGAIN; + if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) + /* + * Set up a return to indicate that we dropped + * the signal. + */ + ret = info->si_sys_private; + } + +out_set: + sigaddset(&signals->signal, sig); + return ret; +} + +#define LEGACY_QUEUE(sigptr, sig) \ + (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) + + +static int +specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) +{ + int ret = 0; + + if (!irqs_disabled()) + BUG(); + assert_spin_locked(&t->sighand->siglock); + + if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + /* + * Set up a return to indicate that we dropped the signal. + */ + ret = info->si_sys_private; + + /* Short-circuit ignored signals. */ + if (sig_ignored(t, sig)) + goto out; + + /* Support queueing exactly one non-rt signal, so that we + can get more detailed information about the cause of + the signal. */ + if (LEGACY_QUEUE(&t->pending, sig)) + goto out; + + ret = send_signal(sig, info, t, &t->pending); + if (!ret && !sigismember(&t->blocked, sig)) + signal_wake_up(t, sig == SIGKILL); +out: + return ret; +} + +/* + * Force a signal that the process can't ignore: if necessary + * we unblock the signal and change any SIG_IGN to SIG_DFL. + */ + +int +force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +{ + unsigned long int flags; + int ret; + + spin_lock_irqsave(&t->sighand->siglock, flags); + if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { + t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + } + ret = specific_send_sig_info(sig, info, t); + spin_unlock_irqrestore(&t->sighand->siglock, flags); + + return ret; +} + +void +force_sig_specific(int sig, struct task_struct *t) +{ + unsigned long int flags; + + spin_lock_irqsave(&t->sighand->siglock, flags); + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) + t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + specific_send_sig_info(sig, (void *)2, t); + spin_unlock_irqrestore(&t->sighand->siglock, flags); +} + +/* + * Test if P wants to take SIG. After we've checked all threads with this, + * it's equivalent to finding no threads not blocking SIG. Any threads not + * blocking SIG were ruled out because they are not running and already + * have pending signals. Such threads will dequeue from the shared queue + * as soon as they're available, so putting the signal on the shared queue + * will be equivalent to sending it to one such thread. + */ +#define wants_signal(sig, p, mask) \ + (!sigismember(&(p)->blocked, sig) \ + && !((p)->state & mask) \ + && !((p)->flags & PF_EXITING) \ + && (task_curr(p) || !signal_pending(p))) + + +static void +__group_complete_signal(int sig, struct task_struct *p) +{ + unsigned int mask; + struct task_struct *t; + + /* + * Don't bother traced and stopped tasks (but + * SIGKILL will punch through that). + */ + mask = TASK_STOPPED | TASK_TRACED; + if (sig == SIGKILL) + mask = 0; + + /* + * Now find a thread we can wake up to take the signal off the queue. + * + * If the main thread wants the signal, it gets first crack. + * Probably the least surprising to the average bear. + */ + if (wants_signal(sig, p, mask)) + t = p; + else if (thread_group_empty(p)) + /* + * There is just one thread and it does not need to be woken. + * It will dequeue unblocked signals before it runs again. + */ + return; + else { + /* + * Otherwise try to find a suitable thread. + */ + t = p->signal->curr_target; + if (t == NULL) + /* restart balancing at this thread */ + t = p->signal->curr_target = p; + BUG_ON(t->tgid != p->tgid); + + while (!wants_signal(sig, t, mask)) { + t = next_thread(t); + if (t == p->signal->curr_target) + /* + * No thread needs to be woken. + * Any eligible threads will see + * the signal in the queue soon. + */ + return; + } + p->signal->curr_target = t; + } + + /* + * Found a killable thread. If the signal will be fatal, + * then start taking the whole group down immediately. + */ + if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) && + !sigismember(&t->real_blocked, sig) && + (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { + /* + * This signal will be fatal to the whole group. + */ + if (!sig_kernel_coredump(sig)) { + /* + * Start a group exit and wake everybody up. + * This way we don't have other threads + * running and doing things after a slower + * thread has the fatal signal pending. + */ + p->signal->flags = SIGNAL_GROUP_EXIT; + p->signal->group_exit_code = sig; + p->signal->group_stop_count = 0; + t = p; + do { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + t = next_thread(t); + } while (t != p); + return; + } + + /* + * There will be a core dump. We make all threads other + * than the chosen one go into a group stop so that nothing + * happens until it gets scheduled, takes the signal off + * the shared queue, and does the core dump. This is a + * little more complicated than strictly necessary, but it + * keeps the signal state that winds up in the core dump + * unchanged from the death state, e.g. which thread had + * the core-dump signal unblocked. + */ + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); + p->signal->group_stop_count = 0; + p->signal->group_exit_task = t; + t = p; + do { + p->signal->group_stop_count++; + signal_wake_up(t, 0); + t = next_thread(t); + } while (t != p); + wake_up_process(p->signal->group_exit_task); + return; + } + + /* + * The signal is already in the shared-pending queue. + * Tell the chosen thread to wake up and dequeue it. + */ + signal_wake_up(t, sig == SIGKILL); + return; +} + +int +__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + int ret = 0; + + assert_spin_locked(&p->sighand->siglock); + handle_stop_signal(sig, p); + + if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + /* + * Set up a return to indicate that we dropped the signal. + */ + ret = info->si_sys_private; + + /* Short-circuit ignored signals. */ + if (sig_ignored(p, sig)) + return ret; + + if (LEGACY_QUEUE(&p->signal->shared_pending, sig)) + /* This is a non-RT signal and we already have one queued. */ + return ret; + + /* + * Put this signal on the shared-pending queue, or fail with EAGAIN. + * We always use the shared queue for process-wide signals, + * to avoid several races. + */ + ret = send_signal(sig, info, p, &p->signal->shared_pending); + if (unlikely(ret)) + return ret; + + __group_complete_signal(sig, p); + return 0; +} + +/* + * Nuke all other threads in the group. + */ +void zap_other_threads(struct task_struct *p) +{ + struct task_struct *t; + + p->signal->flags = SIGNAL_GROUP_EXIT; + p->signal->group_stop_count = 0; + + if (thread_group_empty(p)) + return; + + for (t = next_thread(p); t != p; t = next_thread(t)) { + /* + * Don't bother with already dead threads + */ + if (t->exit_state) + continue; + + /* + * We don't want to notify the parent, since we are + * killed as part of a thread group due to another + * thread doing an execve() or similar. So set the + * exit signal to -1 to allow immediate reaping of + * the process. But don't detach the thread group + * leader. + */ + if (t != p->group_leader) + t->exit_signal = -1; + + sigaddset(&t->pending.signal, SIGKILL); + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + signal_wake_up(t, 1); + } +} + +/* + * Must be called with the tasklist_lock held for reading! + */ +int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + unsigned long flags; + int ret; + + ret = check_kill_permission(sig, info, p); + if (!ret && sig && p->sighand) { + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = __group_send_sig_info(sig, info, p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } + + return ret; +} + +/* + * kill_pg_info() sends a signal to a process group: this is what the tty + * control characters do (^C, ^Z etc) + */ + +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + struct task_struct *p = NULL; + int retval, success; + + if (pgrp <= 0) + return -EINVAL; + + success = 0; + retval = -ESRCH; + do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + int err = group_send_sig_info(sig, info, p); + success |= !err; + retval = err; + } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + return success ? 0 : retval; +} + +int +kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pg_info(sig, info, pgrp); + read_unlock(&tasklist_lock); + + return retval; +} + +int +kill_proc_info(int sig, struct siginfo *info, pid_t pid) +{ + int error; + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + error = -ESRCH; + if (p) + error = group_send_sig_info(sig, info, p); + read_unlock(&tasklist_lock); + return error; +} + + +/* + * kill_something_info() interprets pid in interesting ways just like kill(2). + * + * POSIX specifies that kill(-1,sig) is unspecified, but what we have + * is probably wrong. Should make it like BSD or SYSV. + */ + +static int kill_something_info(int sig, struct siginfo *info, int pid) +{ + if (!pid) { + return kill_pg_info(sig, info, process_group(current)); + } else if (pid == -1) { + int retval = 0, count = 0; + struct task_struct * p; + + read_lock(&tasklist_lock); + for_each_process(p) { + if (p->pid > 1 && p->tgid != current->tgid) { + int err = group_send_sig_info(sig, info, p); + ++count; + if (err != -EPERM) + retval = err; + } + } + read_unlock(&tasklist_lock); + return count ? retval : -ESRCH; + } else if (pid < 0) { + return kill_pg_info(sig, info, -pid); + } else { + return kill_proc_info(sig, info, pid); + } +} + +/* + * These are for backward compatibility with the rest of the kernel source. + */ + +/* + * These two are the most common entry points. They send a signal + * just to the specific thread. + */ +int +send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + int ret; + unsigned long flags; + + /* + * Make sure legacy kernel users don't send in bad values + * (normal paths check this in check_kill_permission). + */ + if (sig < 0 || sig > _NSIG) + return -EINVAL; + + /* + * We need the tasklist lock even for the specific + * thread case (when we don't need to follow the group + * lists) in order to avoid races with "p->sighand" + * going away or changing from under us. + */ + read_lock(&tasklist_lock); + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = specific_send_sig_info(sig, info, p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + read_unlock(&tasklist_lock); + return ret; +} + +int +send_sig(int sig, struct task_struct *p, int priv) +{ + return send_sig_info(sig, (void*)(long)(priv != 0), p); +} + +/* + * This is the entry point for "process-wide" signals. + * They will go to an appropriate thread in the thread group. + */ +int +send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + int ret; + read_lock(&tasklist_lock); + ret = group_send_sig_info(sig, info, p); + read_unlock(&tasklist_lock); + return ret; +} + +void +force_sig(int sig, struct task_struct *p) +{ + force_sig_info(sig, (void*)1L, p); +} + +/* + * When things go south during signal handling, we + * will force a SIGSEGV. And if the signal that caused + * the problem was already a SIGSEGV, we'll want to + * make sure we don't even try to deliver the signal.. + */ +int +force_sigsegv(int sig, struct task_struct *p) +{ + if (sig == SIGSEGV) { + unsigned long flags; + spin_lock_irqsave(&p->sighand->siglock, flags); + p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } + force_sig(SIGSEGV, p); + return 0; +} + +int +kill_pg(pid_t pgrp, int sig, int priv) +{ + return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); +} + +int +kill_proc(pid_t pid, int sig, int priv) +{ + return kill_proc_info(sig, (void *)(long)(priv != 0), pid); +} + +/* + * These functions support sending signals using preallocated sigqueue + * structures. This is needed "because realtime applications cannot + * afford to lose notifications of asynchronous events, like timer + * expirations or I/O completions". In the case of Posix Timers + * we allocate the sigqueue structure from the timer_create. If this + * allocation fails we are able to report the failure to the application + * with an EAGAIN error. + */ + +struct sigqueue *sigqueue_alloc(void) +{ + struct sigqueue *q; + + if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) + q->flags |= SIGQUEUE_PREALLOC; + return(q); +} + +void sigqueue_free(struct sigqueue *q) +{ + unsigned long flags; + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + /* + * If the signal is still pending remove it from the + * pending queue. + */ + if (unlikely(!list_empty(&q->list))) { + read_lock(&tasklist_lock); + spin_lock_irqsave(q->lock, flags); + if (!list_empty(&q->list)) + list_del_init(&q->list); + spin_unlock_irqrestore(q->lock, flags); + read_unlock(&tasklist_lock); + } + q->flags &= ~SIGQUEUE_PREALLOC; + __sigqueue_free(q); +} + +int +send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) +{ + unsigned long flags; + int ret = 0; + + /* + * We need the tasklist lock even for the specific + * thread case (when we don't need to follow the group + * lists) in order to avoid races with "p->sighand" + * going away or changing from under us. + */ + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + read_lock(&tasklist_lock); + spin_lock_irqsave(&p->sighand->siglock, flags); + + if (unlikely(!list_empty(&q->list))) { + /* + * If an SI_TIMER entry is already queue just increment + * the overrun count. + */ + if (q->info.si_code != SI_TIMER) + BUG(); + q->info.si_overrun++; + goto out; + } + /* Short-circuit ignored signals. */ + if (sig_ignored(p, sig)) { + ret = 1; + goto out; + } + + q->lock = &p->sighand->siglock; + list_add_tail(&q->list, &p->pending.list); + sigaddset(&p->pending.signal, sig); + if (!sigismember(&p->blocked, sig)) + signal_wake_up(p, sig == SIGKILL); + +out: + spin_unlock_irqrestore(&p->sighand->siglock, flags); + read_unlock(&tasklist_lock); + return(ret); +} + +int +send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) +{ + unsigned long flags; + int ret = 0; + + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + read_lock(&tasklist_lock); + spin_lock_irqsave(&p->sighand->siglock, flags); + handle_stop_signal(sig, p); + + /* Short-circuit ignored signals. */ + if (sig_ignored(p, sig)) { + ret = 1; + goto out; + } + + if (unlikely(!list_empty(&q->list))) { + /* + * If an SI_TIMER entry is already queue just increment + * the overrun count. Other uses should not try to + * send the signal multiple times. + */ + if (q->info.si_code != SI_TIMER) + BUG(); + q->info.si_overrun++; + goto out; + } + + /* + * Put this signal on the shared-pending queue. + * We always use the shared queue for process-wide signals, + * to avoid several races. + */ + q->lock = &p->sighand->siglock; + list_add_tail(&q->list, &p->signal->shared_pending.list); + sigaddset(&p->signal->shared_pending.signal, sig); + + __group_complete_signal(sig, p); +out: + spin_unlock_irqrestore(&p->sighand->siglock, flags); + read_unlock(&tasklist_lock); + return(ret); +} + +/* + * Wake up any threads in the parent blocked in wait* syscalls. + */ +static inline void __wake_up_parent(struct task_struct *p, + struct task_struct *parent) +{ + wake_up_interruptible_sync(&parent->signal->wait_chldexit); +} + +/* + * Let a parent know about the death of a child. + * For a stopped/continued status change, use do_notify_parent_cldstop instead. + */ + +void do_notify_parent(struct task_struct *tsk, int sig) +{ + struct siginfo info; + unsigned long flags; + struct sighand_struct *psig; + + BUG_ON(sig == -1); + + /* do_notify_parent_cldstop should have been called instead. */ + BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED)); + + BUG_ON(!tsk->ptrace && + (tsk->group_leader != tsk || !thread_group_empty(tsk))); + + info.si_signo = sig; + info.si_errno = 0; + info.si_pid = tsk->pid; + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. */ + info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, + tsk->signal->stime)); + + info.si_status = tsk->exit_code & 0x7f; + if (tsk->exit_code & 0x80) + info.si_code = CLD_DUMPED; + else if (tsk->exit_code & 0x7f) + info.si_code = CLD_KILLED; + else { + info.si_code = CLD_EXITED; + info.si_status = tsk->exit_code >> 8; + } + + psig = tsk->parent->sighand; + spin_lock_irqsave(&psig->siglock, flags); + if (sig == SIGCHLD && + (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { + /* + * We are exiting and our parent doesn't care. POSIX.1 + * defines special semantics for setting SIGCHLD to SIG_IGN + * or setting the SA_NOCLDWAIT flag: we should be reaped + * automatically and not left for our parent's wait4 call. + * Rather than having the parent do it as a magic kind of + * signal handler, we just set this to tell do_exit that we + * can be cleaned up without becoming a zombie. Note that + * we still call __wake_up_parent in this case, because a + * blocked sys_wait4 might now return -ECHILD. + * + * Whether we send SIGCHLD or not for SA_NOCLDWAIT + * is implementation-defined: we do (if you don't want + * it, just use SIG_IGN instead). + */ + tsk->exit_signal = -1; + if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) + sig = 0; + } + if (sig > 0 && sig <= _NSIG) + __group_send_sig_info(sig, &info, tsk->parent); + __wake_up_parent(tsk, tsk->parent); + spin_unlock_irqrestore(&psig->siglock, flags); +} + +static void +do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, + int why) +{ + struct siginfo info; + unsigned long flags; + struct sighand_struct *sighand; + + info.si_signo = SIGCHLD; + info.si_errno = 0; + info.si_pid = tsk->pid; + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. */ + info.si_utime = cputime_to_jiffies(tsk->utime); + info.si_stime = cputime_to_jiffies(tsk->stime); + + info.si_code = why; + switch (why) { + case CLD_CONTINUED: + info.si_status = SIGCONT; + break; + case CLD_STOPPED: + info.si_status = tsk->signal->group_exit_code & 0x7f; + break; + case CLD_TRAPPED: + info.si_status = tsk->exit_code & 0x7f; + break; + default: + BUG(); + } + + sighand = parent->sighand; + spin_lock_irqsave(&sighand->siglock, flags); + if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && + !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) + __group_send_sig_info(SIGCHLD, &info, parent); + /* + * Even if SIGCHLD is not generated, we must wake up wait4 calls. + */ + __wake_up_parent(tsk, parent); + spin_unlock_irqrestore(&sighand->siglock, flags); +} + +/* + * This must be called with current->sighand->siglock held. + * + * This should be the path for all ptrace stops. + * We always set current->last_siginfo while stopped here. + * That makes it a way to test a stopped process for + * being ptrace-stopped vs being job-control-stopped. + * + * If we actually decide not to stop at all because the tracer is gone, + * we leave nostop_code in current->exit_code. + */ +static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) +{ + /* + * If there is a group stop in progress, + * we must participate in the bookkeeping. + */ + if (current->signal->group_stop_count > 0) + --current->signal->group_stop_count; + + current->last_siginfo = info; + current->exit_code = exit_code; + + /* Let the debugger run. */ + set_current_state(TASK_TRACED); + spin_unlock_irq(¤t->sighand->siglock); + read_lock(&tasklist_lock); + if (likely(current->ptrace & PT_PTRACED) && + likely(current->parent != current->real_parent || + !(current->ptrace & PT_ATTACHED)) && + (likely(current->parent->signal != current->signal) || + !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { + do_notify_parent_cldstop(current, current->parent, + CLD_TRAPPED); + read_unlock(&tasklist_lock); + schedule(); + } else { + /* + * By the time we got the lock, our tracer went away. + * Don't stop here. + */ + read_unlock(&tasklist_lock); + set_current_state(TASK_RUNNING); + current->exit_code = nostop_code; + } + + /* + * We are back. Now reacquire the siglock before touching + * last_siginfo, so that we are sure to have synchronized with + * any signal-sending on another CPU that wants to examine it. + */ + spin_lock_irq(¤t->sighand->siglock); + current->last_siginfo = NULL; + + /* + * Queued signals ignored us while we were stopped for tracing. + * So check for any that we should take before resuming user mode. + */ + recalc_sigpending(); +} + +void ptrace_notify(int exit_code) +{ + siginfo_t info; + + BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + + memset(&info, 0, sizeof info); + info.si_signo = SIGTRAP; + info.si_code = exit_code; + info.si_pid = current->pid; + info.si_uid = current->uid; + + /* Let the debugger run. */ + spin_lock_irq(¤t->sighand->siglock); + ptrace_stop(exit_code, 0, &info); + spin_unlock_irq(¤t->sighand->siglock); +} + +#ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER + +static void +finish_stop(int stop_count) +{ + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, + * report to the parent. When ptraced, every thread reports itself. + */ + if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, current->parent, + CLD_STOPPED); + read_unlock(&tasklist_lock); + } + else if (stop_count == 0) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, + current->group_leader->real_parent, + CLD_STOPPED); + read_unlock(&tasklist_lock); + } + + schedule(); + /* + * Now we don't run again until continued. + */ + current->exit_code = 0; +} + +/* + * This performs the stopping for SIGSTOP and other stop signals. + * We have to stop all threads in the thread group. + * Returns nonzero if we've actually stopped and released the siglock. + * Returns zero if we didn't stop and still hold the siglock. + */ +static int +do_signal_stop(int signr) +{ + struct signal_struct *sig = current->signal; + struct sighand_struct *sighand = current->sighand; + int stop_count = -1; + + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) + return 0; + + if (sig->group_stop_count > 0) { + /* + * There is a group stop in progress. We don't need to + * start another one. + */ + signr = sig->group_exit_code; + stop_count = --sig->group_stop_count; + current->exit_code = signr; + set_current_state(TASK_STOPPED); + if (stop_count == 0) + sig->flags = SIGNAL_STOP_STOPPED; + spin_unlock_irq(&sighand->siglock); + } + else if (thread_group_empty(current)) { + /* + * Lock must be held through transition to stopped state. + */ + current->exit_code = current->signal->group_exit_code = signr; + set_current_state(TASK_STOPPED); + sig->flags = SIGNAL_STOP_STOPPED; + spin_unlock_irq(&sighand->siglock); + } + else { + /* + * There is no group stop already in progress. + * We must initiate one now, but that requires + * dropping siglock to get both the tasklist lock + * and siglock again in the proper order. Note that + * this allows an intervening SIGCONT to be posted. + * We need to check for that and bail out if necessary. + */ + struct task_struct *t; + + spin_unlock_irq(&sighand->siglock); + + /* signals can be posted during this window */ + + read_lock(&tasklist_lock); + spin_lock_irq(&sighand->siglock); + + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { + /* + * Another stop or continue happened while we + * didn't have the lock. We can just swallow this + * signal now. If we raced with a SIGCONT, that + * should have just cleared it now. If we raced + * with another processor delivering a stop signal, + * then the SIGCONT that wakes us up should clear it. + */ + read_unlock(&tasklist_lock); + return 0; + } + + if (sig->group_stop_count == 0) { + sig->group_exit_code = signr; + stop_count = 0; + for (t = next_thread(current); t != current; + t = next_thread(t)) + /* + * Setting state to TASK_STOPPED for a group + * stop is always done with the siglock held, + * so this check has no races. + */ + if (t->state < TASK_STOPPED) { + stop_count++; + signal_wake_up(t, 0); + } + sig->group_stop_count = stop_count; + } + else { + /* A race with another thread while unlocked. */ + signr = sig->group_exit_code; + stop_count = --sig->group_stop_count; + } + + current->exit_code = signr; + set_current_state(TASK_STOPPED); + if (stop_count == 0) + sig->flags = SIGNAL_STOP_STOPPED; + + spin_unlock_irq(&sighand->siglock); + read_unlock(&tasklist_lock); + } + + finish_stop(stop_count); + return 1; +} + +/* + * Do appropriate magic when group_stop_count > 0. + * We return nonzero if we stopped, after releasing the siglock. + * We return zero if we still hold the siglock and should look + * for another signal without checking group_stop_count again. + */ +static inline int handle_group_stop(void) +{ + int stop_count; + + if (current->signal->group_exit_task == current) { + /* + * Group stop is so we can do a core dump, + * We are the initiating thread, so get on with it. + */ + current->signal->group_exit_task = NULL; + return 0; + } + + if (current->signal->flags & SIGNAL_GROUP_EXIT) + /* + * Group stop is so another thread can do a core dump, + * or else we are racing against a death signal. + * Just punt the stop so we can get the next signal. + */ + return 0; + + /* + * There is a group stop in progress. We stop + * without any associated signal being in our queue. + */ + stop_count = --current->signal->group_stop_count; + if (stop_count == 0) + current->signal->flags = SIGNAL_STOP_STOPPED; + current->exit_code = current->signal->group_exit_code; + set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + finish_stop(stop_count); + return 1; +} + +int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, + struct pt_regs *regs, void *cookie) +{ + sigset_t *mask = ¤t->blocked; + int signr = 0; + +relock: + spin_lock_irq(¤t->sighand->siglock); + for (;;) { + struct k_sigaction *ka; + + if (unlikely(current->signal->group_stop_count > 0) && + handle_group_stop()) + goto relock; + + signr = dequeue_signal(current, mask, info); + + if (!signr) + break; /* will return 0 */ + + if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { + ptrace_signal_deliver(regs, cookie); + + /* Let the debugger run. */ + ptrace_stop(signr, signr, info); + + /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; + if (signr == 0) + continue; + + current->exit_code = 0; + + /* Update the siginfo structure if the signal has + changed. If the debugger wanted something + specific in the siginfo structure then it should + have updated *info via PTRACE_SETSIGINFO. */ + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = current->parent->pid; + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + specific_send_sig_info(signr, info, current); + continue; + } + } + + ka = ¤t->sighand->action[signr-1]; + if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ + continue; + if (ka->sa.sa_handler != SIG_DFL) { + /* Run the handler. */ + *return_ka = *ka; + + if (ka->sa.sa_flags & SA_ONESHOT) + ka->sa.sa_handler = SIG_DFL; + + break; /* will return non-zero "signr" value */ + } + + /* + * Now we are doing the default action for this signal. + */ + if (sig_kernel_ignore(signr)) /* Default is nothing. */ + continue; + + /* Init gets no signals it doesn't want. */ + if (current->pid == 1) + continue; + + if (sig_kernel_stop(signr)) { + /* + * The default action is to stop all threads in + * the thread group. The job control signals + * do nothing in an orphaned pgrp, but SIGSTOP + * always works. Note that siglock needs to be + * dropped during the call to is_orphaned_pgrp() + * because of lock ordering with tasklist_lock. + * This allows an intervening SIGCONT to be posted. + * We need to check for that and bail out if necessary. + */ + if (signr != SIGSTOP) { + spin_unlock_irq(¤t->sighand->siglock); + + /* signals can be posted during this window */ + + if (is_orphaned_pgrp(process_group(current))) + goto relock; + + spin_lock_irq(¤t->sighand->siglock); + } + + if (likely(do_signal_stop(signr))) { + /* It released the siglock. */ + goto relock; + } + + /* + * We didn't actually stop, due to a race + * with SIGCONT or something like that. + */ + continue; + } + + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Anything else is fatal, maybe with a core dump. + */ + current->flags |= PF_SIGNALED; + if (sig_kernel_coredump(signr)) { + /* + * If it was able to dump core, this kills all + * other threads in the group and synchronizes with + * their demise. If we lost the race with another + * thread getting here, it set group_exit_code + * first and our do_group_exit call below will use + * that value and ignore the one we pass it. + */ + do_coredump((long)signr, signr, regs); + } + + /* + * Death signals, no core dump. + */ + do_group_exit(signr); + /* NOTREACHED */ + } + spin_unlock_irq(¤t->sighand->siglock); + return signr; +} + +#endif + +EXPORT_SYMBOL(recalc_sigpending); +EXPORT_SYMBOL_GPL(dequeue_signal); +EXPORT_SYMBOL(flush_signals); +EXPORT_SYMBOL(force_sig); +EXPORT_SYMBOL(kill_pg); +EXPORT_SYMBOL(kill_proc); +EXPORT_SYMBOL(ptrace_notify); +EXPORT_SYMBOL(send_sig); +EXPORT_SYMBOL(send_sig_info); +EXPORT_SYMBOL(sigprocmask); +EXPORT_SYMBOL(block_all_signals); +EXPORT_SYMBOL(unblock_all_signals); + + +/* + * System call entry points. + */ + +asmlinkage long sys_restart_syscall(void) +{ + struct restart_block *restart = ¤t_thread_info()->restart_block; + return restart->fn(restart); +} + +long do_no_restart_syscall(struct restart_block *param) +{ + return -EINTR; +} + +/* + * We don't need to get the kernel lock - this is all local to this + * particular thread.. (and that's good, because this is _heavily_ + * used by various programs) + */ + +/* + * This is also useful for kernel threads that want to temporarily + * (or permanently) block certain signals. + * + * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel + * interface happily blocks "unblockable" signals like SIGKILL + * and friends. + */ +int sigprocmask(int how, sigset_t *set, sigset_t *oldset) +{ + int error; + sigset_t old_block; + + spin_lock_irq(¤t->sighand->siglock); + old_block = current->blocked; + error = 0; + switch (how) { + case SIG_BLOCK: + sigorsets(¤t->blocked, ¤t->blocked, set); + break; + case SIG_UNBLOCK: + signandsets(¤t->blocked, ¤t->blocked, set); + break; + case SIG_SETMASK: + current->blocked = *set; + break; + default: + error = -EINVAL; + } + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + if (oldset) + *oldset = old_block; + return error; +} + +asmlinkage long +sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) +{ + int error = -EINVAL; + sigset_t old_set, new_set; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + goto out; + + if (set) { + error = -EFAULT; + if (copy_from_user(&new_set, set, sizeof(*set))) + goto out; + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + error = sigprocmask(how, &new_set, &old_set); + if (error) + goto out; + if (oset) + goto set_old; + } else if (oset) { + spin_lock_irq(¤t->sighand->siglock); + old_set = current->blocked; + spin_unlock_irq(¤t->sighand->siglock); + + set_old: + error = -EFAULT; + if (copy_to_user(oset, &old_set, sizeof(*oset))) + goto out; + } + error = 0; +out: + return error; +} + +long do_sigpending(void __user *set, unsigned long sigsetsize) +{ + long error = -EINVAL; + sigset_t pending; + + if (sigsetsize > sizeof(sigset_t)) + goto out; + + spin_lock_irq(¤t->sighand->siglock); + sigorsets(&pending, ¤t->pending.signal, + ¤t->signal->shared_pending.signal); + spin_unlock_irq(¤t->sighand->siglock); + + /* Outside the lock because only this thread touches it. */ + sigandsets(&pending, ¤t->blocked, &pending); + + error = -EFAULT; + if (!copy_to_user(set, &pending, sigsetsize)) + error = 0; + +out: + return error; +} + +asmlinkage long +sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize) +{ + return do_sigpending(set, sigsetsize); +} + +#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER + +int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) +{ + int err; + + if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) + return -EFAULT; + if (from->si_code < 0) + return __copy_to_user(to, from, sizeof(siginfo_t)) + ? -EFAULT : 0; + /* + * If you change siginfo_t structure, please be sure + * this code is fixed accordingly. + * It should never copy any pad contained in the structure + * to avoid security leaks, but must copy the generic + * 3 ints plus the relevant union member. + */ + err = __put_user(from->si_signo, &to->si_signo); + err |= __put_user(from->si_errno, &to->si_errno); + err |= __put_user((short)from->si_code, &to->si_code); + switch (from->si_code & __SI_MASK) { + case __SI_KILL: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + break; + case __SI_TIMER: + err |= __put_user(from->si_tid, &to->si_tid); + err |= __put_user(from->si_overrun, &to->si_overrun); + err |= __put_user(from->si_ptr, &to->si_ptr); + break; + case __SI_POLL: + err |= __put_user(from->si_band, &to->si_band); + err |= __put_user(from->si_fd, &to->si_fd); + break; + case __SI_FAULT: + err |= __put_user(from->si_addr, &to->si_addr); +#ifdef __ARCH_SI_TRAPNO + err |= __put_user(from->si_trapno, &to->si_trapno); +#endif + break; + case __SI_CHLD: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_status, &to->si_status); + err |= __put_user(from->si_utime, &to->si_utime); + err |= __put_user(from->si_stime, &to->si_stime); + break; + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: /* But this is */ + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_ptr, &to->si_ptr); + break; + default: /* this is just in case for now ... */ + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + break; + } + return err; +} + +#endif + +asmlinkage long +sys_rt_sigtimedwait(const sigset_t __user *uthese, + siginfo_t __user *uinfo, + const struct timespec __user *uts, + size_t sigsetsize) +{ + int ret, sig; + sigset_t these; + struct timespec ts; + siginfo_t info; + long timeout = 0; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&these, uthese, sizeof(these))) + return -EFAULT; + + /* + * Invert the set of allowed signals to get those we + * want to block. + */ + sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); + signotset(&these); + + if (uts) { + if (copy_from_user(&ts, uts, sizeof(ts))) + return -EFAULT; + if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 + || ts.tv_sec < 0) + return -EINVAL; + } + + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &these, &info); + if (!sig) { + timeout = MAX_SCHEDULE_TIMEOUT; + if (uts) + timeout = (timespec_to_jiffies(&ts) + + (ts.tv_sec || ts.tv_nsec)); + + if (timeout) { + /* None ready -- temporarily unblock those we're + * interested while we are sleeping in so that we'll + * be awakened when they arrive. */ + current->real_blocked = current->blocked; + sigandsets(¤t->blocked, ¤t->blocked, &these); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + timeout = schedule_timeout(timeout); + + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &these, &info); + current->blocked = current->real_blocked; + siginitset(¤t->real_blocked, 0); + recalc_sigpending(); + } + } + spin_unlock_irq(¤t->sighand->siglock); + + if (sig) { + ret = sig; + if (uinfo) { + if (copy_siginfo_to_user(uinfo, &info)) + ret = -EFAULT; + } + } else { + ret = -EAGAIN; + if (timeout) + ret = -EINTR; + } + + return ret; +} + +asmlinkage long +sys_kill(int pid, int sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; + info.si_pid = current->tgid; + info.si_uid = current->uid; + + return kill_something_info(sig, &info, pid); +} + +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the tgid and returns -ESRCH even if the PID + * exists but it's not belonging to the target process anymore. This + * method solves the problem of threads exiting and PIDs getting reused. + */ +asmlinkage long sys_tgkill(int tgid, int pid, int sig) +{ + struct siginfo info; + int error; + struct task_struct *p; + + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = current->tgid; + info.si_uid = current->uid; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + error = -ESRCH; + if (p && (p->tgid == tgid)) { + error = check_kill_permission(sig, &info, p); + /* + * The null signal is a permissions and process existence + * probe. No signal is actually delivered. + */ + if (!error && sig && p->sighand) { + spin_lock_irq(&p->sighand->siglock); + handle_stop_signal(sig, p); + error = specific_send_sig_info(sig, &info, p); + spin_unlock_irq(&p->sighand->siglock); + } + } + read_unlock(&tasklist_lock); + return error; +} + +/* + * Send a signal to only one task, even if it's a CLONE_THREAD task. + */ +asmlinkage long +sys_tkill(int pid, int sig) +{ + struct siginfo info; + int error; + struct task_struct *p; + + /* This is only valid for single tasks */ + if (pid <= 0) + return -EINVAL; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = current->tgid; + info.si_uid = current->uid; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + error = -ESRCH; + if (p) { + error = check_kill_permission(sig, &info, p); + /* + * The null signal is a permissions and process existence + * probe. No signal is actually delivered. + */ + if (!error && sig && p->sighand) { + spin_lock_irq(&p->sighand->siglock); + handle_stop_signal(sig, p); + error = specific_send_sig_info(sig, &info, p); + spin_unlock_irq(&p->sighand->siglock); + } + } + read_unlock(&tasklist_lock); + return error; +} + +asmlinkage long +sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + /* Not even root can pretend to send signals from the kernel. + Nor can they impersonate a kill(), which adds source info. */ + if (info.si_code >= 0) + return -EPERM; + info.si_signo = sig; + + /* POSIX.1b doesn't mention process groups. */ + return kill_proc_info(sig, &info, pid); +} + +int +do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) +{ + struct k_sigaction *k; + + if (sig < 1 || sig > _NSIG || (act && sig_kernel_only(sig))) + return -EINVAL; + + k = ¤t->sighand->action[sig-1]; + + spin_lock_irq(¤t->sighand->siglock); + if (signal_pending(current)) { + /* + * If there might be a fatal signal pending on multiple + * threads, make sure we take it before changing the action. + */ + spin_unlock_irq(¤t->sighand->siglock); + return -ERESTARTNOINTR; + } + + if (oact) + *oact = *k; + + if (act) { + /* + * POSIX 3.3.1.3: + * "Setting a signal action to SIG_IGN for a signal that is + * pending shall cause the pending signal to be discarded, + * whether or not it is blocked." + * + * "Setting a signal action to SIG_DFL for a signal that is + * pending and whose default action is to ignore the signal + * (for example, SIGCHLD), shall cause the pending signal to + * be discarded, whether or not it is blocked" + */ + if (act->sa.sa_handler == SIG_IGN || + (act->sa.sa_handler == SIG_DFL && + sig_kernel_ignore(sig))) { + /* + * This is a fairly rare case, so we only take the + * tasklist_lock once we're sure we'll need it. + * Now we must do this little unlock and relock + * dance to maintain the lock hierarchy. + */ + struct task_struct *t = current; + spin_unlock_irq(&t->sighand->siglock); + read_lock(&tasklist_lock); + spin_lock_irq(&t->sighand->siglock); + *k = *act; + sigdelsetmask(&k->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); + rm_from_queue(sigmask(sig), &t->signal->shared_pending); + do { + rm_from_queue(sigmask(sig), &t->pending); + recalc_sigpending_tsk(t); + t = next_thread(t); + } while (t != current); + spin_unlock_irq(¤t->sighand->siglock); + read_unlock(&tasklist_lock); + return 0; + } + + *k = *act; + sigdelsetmask(&k->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); + } + + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +int +do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) +{ + stack_t oss; + int error; + + if (uoss) { + oss.ss_sp = (void __user *) current->sas_ss_sp; + oss.ss_size = current->sas_ss_size; + oss.ss_flags = sas_ss_flags(sp); + } + + if (uss) { + void __user *ss_sp; + size_t ss_size; + int ss_flags; + + error = -EFAULT; + if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) + || __get_user(ss_sp, &uss->ss_sp) + || __get_user(ss_flags, &uss->ss_flags) + || __get_user(ss_size, &uss->ss_size)) + goto out; + + error = -EPERM; + if (on_sig_stack(sp)) + goto out; + + error = -EINVAL; + /* + * + * Note - this code used to test ss_flags incorrectly + * old code may have been written using ss_flags==0 + * to mean ss_flags==SS_ONSTACK (as this was the only + * way that worked) - this fix preserves that older + * mechanism + */ + if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + goto out; + + if (ss_flags == SS_DISABLE) { + ss_size = 0; + ss_sp = NULL; + } else { + error = -ENOMEM; + if (ss_size < MINSIGSTKSZ) + goto out; + } + + current->sas_ss_sp = (unsigned long) ss_sp; + current->sas_ss_size = ss_size; + } + + if (uoss) { + error = -EFAULT; + if (copy_to_user(uoss, &oss, sizeof(oss))) + goto out; + } + + error = 0; +out: + return error; +} + +#ifdef __ARCH_WANT_SYS_SIGPENDING + +asmlinkage long +sys_sigpending(old_sigset_t __user *set) +{ + return do_sigpending(set, sizeof(*set)); +} + +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK +/* Some platforms have their own version with special arguments others + support only sys_rt_sigprocmask. */ + +asmlinkage long +sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset) +{ + int error; + old_sigset_t old_set, new_set; + + if (set) { + error = -EFAULT; + if (copy_from_user(&new_set, set, sizeof(*set))) + goto out; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + + spin_lock_irq(¤t->sighand->siglock); + old_set = current->blocked.sig[0]; + + error = 0; + switch (how) { + default: + error = -EINVAL; + break; + case SIG_BLOCK: + sigaddsetmask(¤t->blocked, new_set); + break; + case SIG_UNBLOCK: + sigdelsetmask(¤t->blocked, new_set); + break; + case SIG_SETMASK: + current->blocked.sig[0] = new_set; + break; + } + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + if (error) + goto out; + if (oset) + goto set_old; + } else if (oset) { + old_set = current->blocked.sig[0]; + set_old: + error = -EFAULT; + if (copy_to_user(oset, &old_set, sizeof(*oset))) + goto out; + } + error = 0; +out: + return error; +} +#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ + +#ifdef __ARCH_WANT_SYS_RT_SIGACTION +asmlinkage long +sys_rt_sigaction(int sig, + const struct sigaction __user *act, + struct sigaction __user *oact, + size_t sigsetsize) +{ + struct k_sigaction new_sa, old_sa; + int ret = -EINVAL; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + goto out; + + if (act) { + if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) + return -EFAULT; + } + + ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); + + if (!ret && oact) { + if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) + return -EFAULT; + } +out: + return ret; +} +#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ + +#ifdef __ARCH_WANT_SYS_SGETMASK + +/* + * For backwards compatibility. Functionality superseded by sigprocmask. + */ +asmlinkage long +sys_sgetmask(void) +{ + /* SMP safe */ + return current->blocked.sig[0]; +} + +asmlinkage long +sys_ssetmask(int newmask) +{ + int old; + + spin_lock_irq(¤t->sighand->siglock); + old = current->blocked.sig[0]; + + siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| + sigmask(SIGSTOP))); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + return old; +} +#endif /* __ARCH_WANT_SGETMASK */ + +#ifdef __ARCH_WANT_SYS_SIGNAL +/* + * For backwards compatibility. Functionality superseded by sigaction. + */ +asmlinkage unsigned long +sys_signal(int sig, __sighandler_t handler) +{ + struct k_sigaction new_sa, old_sa; + int ret; + + new_sa.sa.sa_handler = handler; + new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; + + ret = do_sigaction(sig, &new_sa, &old_sa); + + return ret ? ret : (unsigned long)old_sa.sa.sa_handler; +} +#endif /* __ARCH_WANT_SYS_SIGNAL */ + +#ifdef __ARCH_WANT_SYS_PAUSE + +asmlinkage long +sys_pause(void) +{ + current->state = TASK_INTERRUPTIBLE; + schedule(); + return -ERESTARTNOHAND; +} + +#endif + +void __init signals_init(void) +{ + sigqueue_cachep = + kmem_cache_create("sigqueue", + sizeof(struct sigqueue), + __alignof__(struct sigqueue), + SLAB_PANIC, NULL, NULL); +} diff --git a/kernel/softirq.c b/kernel/softirq.c new file mode 100644 index 00000000000..b4ab6af1dea --- /dev/null +++ b/kernel/softirq.c @@ -0,0 +1,496 @@ +/* + * linux/kernel/softirq.c + * + * Copyright (C) 1992 Linus Torvalds + * + * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + */ + +#include <linux/module.h> +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/cpu.h> +#include <linux/kthread.h> +#include <linux/rcupdate.h> + +#include <asm/irq.h> +/* + - No shared variables, all the data are CPU local. + - If a softirq needs serialization, let it serialize itself + by its own spinlocks. + - Even if softirq is serialized, only local cpu is marked for + execution. Hence, we get something sort of weak cpu binding. + Though it is still not clear, will it result in better locality + or will not. + + Examples: + - NET RX softirq. It is multithreaded and does not require + any global serialization. + - NET TX softirq. It kicks software netdevice queues, hence + it is logically serialized per device, but this serialization + is invisible to common code. + - Tasklets: serialized wrt itself. + */ + +#ifndef __ARCH_IRQ_STAT +irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; +EXPORT_SYMBOL(irq_stat); +#endif + +static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; + +static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +/* + * we cannot loop indefinitely here to avoid userspace starvation, + * but we also don't want to introduce a worst case 1/HZ latency + * to the pending events, so lets the scheduler to balance + * the softirq load for us. + */ +static inline void wakeup_softirqd(void) +{ + /* Interrupts are disabled: no need to stop preemption */ + struct task_struct *tsk = __get_cpu_var(ksoftirqd); + + if (tsk && tsk->state != TASK_RUNNING) + wake_up_process(tsk); +} + +/* + * We restart softirq processing MAX_SOFTIRQ_RESTART times, + * and we fall back to softirqd after that. + * + * This number has been established via experimentation. + * The two things to balance is latency against fairness - + * we want to handle softirqs as soon as possible, but they + * should not be able to lock up the box. + */ +#define MAX_SOFTIRQ_RESTART 10 + +asmlinkage void __do_softirq(void) +{ + struct softirq_action *h; + __u32 pending; + int max_restart = MAX_SOFTIRQ_RESTART; + int cpu; + + pending = local_softirq_pending(); + + local_bh_disable(); + cpu = smp_processor_id(); +restart: + /* Reset the pending bitmask before enabling irqs */ + local_softirq_pending() = 0; + + local_irq_enable(); + + h = softirq_vec; + + do { + if (pending & 1) { + h->action(h); + rcu_bh_qsctr_inc(cpu); + } + h++; + pending >>= 1; + } while (pending); + + local_irq_disable(); + + pending = local_softirq_pending(); + if (pending && --max_restart) + goto restart; + + if (pending) + wakeup_softirqd(); + + __local_bh_enable(); +} + +#ifndef __ARCH_HAS_DO_SOFTIRQ + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + pending = local_softirq_pending(); + + if (pending) + __do_softirq(); + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); + +#endif + +void local_bh_enable(void) +{ + WARN_ON(irqs_disabled()); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); + preempt_check_resched(); +} +EXPORT_SYMBOL(local_bh_enable); + +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED +# define invoke_softirq() __do_softirq() +#else +# define invoke_softirq() do_softirq() +#endif + +/* + * Exit an interrupt context. Process softirqs if needed and possible: + */ +void irq_exit(void) +{ + account_system_vtime(current); + sub_preempt_count(IRQ_EXIT_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); + preempt_enable_no_resched(); +} + +/* + * This function must run with irqs disabled! + */ +inline fastcall void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. + */ + if (!in_interrupt()) + wakeup_softirqd(); +} + +EXPORT_SYMBOL(raise_softirq_irqoff); + +void fastcall raise_softirq(unsigned int nr) +{ + unsigned long flags; + + local_irq_save(flags); + raise_softirq_irqoff(nr); + local_irq_restore(flags); +} + +void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) +{ + softirq_vec[nr].data = data; + softirq_vec[nr].action = action; +} + +EXPORT_SYMBOL(open_softirq); + +/* Tasklets */ +struct tasklet_head +{ + struct tasklet_struct *list; +}; + +/* Some compilers disobey section attribute on statics when not + initialized -- RR */ +static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; +static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; + +void fastcall __tasklet_schedule(struct tasklet_struct *t) +{ + unsigned long flags; + + local_irq_save(flags); + t->next = __get_cpu_var(tasklet_vec).list; + __get_cpu_var(tasklet_vec).list = t; + raise_softirq_irqoff(TASKLET_SOFTIRQ); + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__tasklet_schedule); + +void fastcall __tasklet_hi_schedule(struct tasklet_struct *t) +{ + unsigned long flags; + + local_irq_save(flags); + t->next = __get_cpu_var(tasklet_hi_vec).list; + __get_cpu_var(tasklet_hi_vec).list = t; + raise_softirq_irqoff(HI_SOFTIRQ); + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__tasklet_hi_schedule); + +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).list; + __get_cpu_var(tasklet_vec).list = NULL; + local_irq_enable(); + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + local_irq_disable(); + t->next = __get_cpu_var(tasklet_vec).list; + __get_cpu_var(tasklet_vec).list = t; + __raise_softirq_irqoff(TASKLET_SOFTIRQ); + local_irq_enable(); + } +} + +static void tasklet_hi_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_hi_vec).list; + __get_cpu_var(tasklet_hi_vec).list = NULL; + local_irq_enable(); + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + local_irq_disable(); + t->next = __get_cpu_var(tasklet_hi_vec).list; + __get_cpu_var(tasklet_hi_vec).list = t; + __raise_softirq_irqoff(HI_SOFTIRQ); + local_irq_enable(); + } +} + + +void tasklet_init(struct tasklet_struct *t, + void (*func)(unsigned long), unsigned long data) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->func = func; + t->data = data; +} + +EXPORT_SYMBOL(tasklet_init); + +void tasklet_kill(struct tasklet_struct *t) +{ + if (in_interrupt()) + printk("Attempt to kill tasklet from interrupt\n"); + + while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { + do + yield(); + while (test_bit(TASKLET_STATE_SCHED, &t->state)); + } + tasklet_unlock_wait(t); + clear_bit(TASKLET_STATE_SCHED, &t->state); +} + +EXPORT_SYMBOL(tasklet_kill); + +void __init softirq_init(void) +{ + open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); + open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); +} + +static int ksoftirqd(void * __bind_cpu) +{ + set_user_nice(current, 19); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + preempt_disable(); + if (!local_softirq_pending()) { + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } + + __set_current_state(TASK_RUNNING); + + while (local_softirq_pending()) { + /* Preempt disable stops cpu going offline. + If already offline, we'll be on wrong CPU: + don't process */ + if (cpu_is_offline((long)__bind_cpu)) + goto wait_to_die; + do_softirq(); + preempt_enable_no_resched(); + cond_resched(); + preempt_disable(); + } + preempt_enable(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + preempt_enable(); + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * tasklet_kill_immediate is called to remove a tasklet which can already be + * scheduled for execution on @cpu. + * + * Unlike tasklet_kill, this function removes the tasklet + * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. + * + * When this function is called, @cpu must be in the CPU_DEAD state. + */ +void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) +{ + struct tasklet_struct **i; + + BUG_ON(cpu_online(cpu)); + BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); + + if (!test_bit(TASKLET_STATE_SCHED, &t->state)) + return; + + /* CPU is dead, so no lock needed. */ + for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { + if (*i == t) { + *i = t->next; + return; + } + } + BUG(); +} + +static void takeover_tasklets(unsigned int cpu) +{ + struct tasklet_struct **i; + + /* CPU is dead, so no lock needed. */ + local_irq_disable(); + + /* Find end, append list for that CPU. */ + for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); + *i = per_cpu(tasklet_vec, cpu).list; + per_cpu(tasklet_vec, cpu).list = NULL; + raise_softirq_irqoff(TASKLET_SOFTIRQ); + + for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); + *i = per_cpu(tasklet_hi_vec, cpu).list; + per_cpu(tasklet_hi_vec, cpu).list = NULL; + raise_softirq_irqoff(HI_SOFTIRQ); + + local_irq_enable(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(tasklet_vec, hotcpu).list); + BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); + p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu) = p; + break; + case CPU_ONLINE: + wake_up_process(per_cpu(ksoftirqd, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(ksoftirqd, hotcpu); + per_cpu(ksoftirqd, hotcpu) = NULL; + kthread_stop(p); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_ksoftirqd(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} diff --git a/kernel/spinlock.c b/kernel/spinlock.c new file mode 100644 index 00000000000..e15ed17863f --- /dev/null +++ b/kernel/spinlock.c @@ -0,0 +1,371 @@ +/* + * Copyright (2004) Linus Torvalds + * + * Author: Zwane Mwaikambo <zwane@fsmlabs.com> + * + * Copyright (2004) Ingo Molnar + */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/module.h> + +/* + * Generic declaration of the raw read_trylock() function, + * architectures are supposed to optimize this: + */ +int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +{ + _raw_read_lock(lock); + return 1; +} +EXPORT_SYMBOL(generic_raw_read_trylock); + +int __lockfunc _spin_trylock(spinlock_t *lock) +{ + preempt_disable(); + if (_raw_spin_trylock(lock)) + return 1; + + preempt_enable(); + return 0; +} +EXPORT_SYMBOL(_spin_trylock); + +int __lockfunc _read_trylock(rwlock_t *lock) +{ + preempt_disable(); + if (_raw_read_trylock(lock)) + return 1; + + preempt_enable(); + return 0; +} +EXPORT_SYMBOL(_read_trylock); + +int __lockfunc _write_trylock(rwlock_t *lock) +{ + preempt_disable(); + if (_raw_write_trylock(lock)) + return 1; + + preempt_enable(); + return 0; +} +EXPORT_SYMBOL(_write_trylock); + +#ifndef CONFIG_PREEMPT + +void __lockfunc _read_lock(rwlock_t *lock) +{ + preempt_disable(); + _raw_read_lock(lock); +} +EXPORT_SYMBOL(_read_lock); + +unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + _raw_spin_lock_flags(lock, flags); + return flags; +} +EXPORT_SYMBOL(_spin_lock_irqsave); + +void __lockfunc _spin_lock_irq(spinlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + _raw_spin_lock(lock); +} +EXPORT_SYMBOL(_spin_lock_irq); + +void __lockfunc _spin_lock_bh(spinlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + _raw_spin_lock(lock); +} +EXPORT_SYMBOL(_spin_lock_bh); + +unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + _raw_read_lock(lock); + return flags; +} +EXPORT_SYMBOL(_read_lock_irqsave); + +void __lockfunc _read_lock_irq(rwlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + _raw_read_lock(lock); +} +EXPORT_SYMBOL(_read_lock_irq); + +void __lockfunc _read_lock_bh(rwlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + _raw_read_lock(lock); +} +EXPORT_SYMBOL(_read_lock_bh); + +unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + _raw_write_lock(lock); + return flags; +} +EXPORT_SYMBOL(_write_lock_irqsave); + +void __lockfunc _write_lock_irq(rwlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + _raw_write_lock(lock); +} +EXPORT_SYMBOL(_write_lock_irq); + +void __lockfunc _write_lock_bh(rwlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + _raw_write_lock(lock); +} +EXPORT_SYMBOL(_write_lock_bh); + +void __lockfunc _spin_lock(spinlock_t *lock) +{ + preempt_disable(); + _raw_spin_lock(lock); +} + +EXPORT_SYMBOL(_spin_lock); + +void __lockfunc _write_lock(rwlock_t *lock) +{ + preempt_disable(); + _raw_write_lock(lock); +} + +EXPORT_SYMBOL(_write_lock); + +#else /* CONFIG_PREEMPT: */ + +/* + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + * + * (We do this in a function because inlining it would be excessive.) + */ + +#define BUILD_LOCK_OPS(op, locktype) \ +void __lockfunc _##op##_lock(locktype##_t *lock) \ +{ \ + preempt_disable(); \ + for (;;) { \ + if (likely(_raw_##op##_trylock(lock))) \ + break; \ + preempt_enable(); \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!op##_can_lock(lock) && (lock)->break_lock) \ + cpu_relax(); \ + preempt_disable(); \ + } \ + (lock)->break_lock = 0; \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock); \ + \ +unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + preempt_disable(); \ + for (;;) { \ + local_irq_save(flags); \ + if (likely(_raw_##op##_trylock(lock))) \ + break; \ + local_irq_restore(flags); \ + \ + preempt_enable(); \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!op##_can_lock(lock) && (lock)->break_lock) \ + cpu_relax(); \ + preempt_disable(); \ + } \ + (lock)->break_lock = 0; \ + return flags; \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock_irqsave); \ + \ +void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +{ \ + _##op##_lock_irqsave(lock); \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock_irq); \ + \ +void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + /* */ \ + /* Careful: we must exclude softirqs too, hence the */ \ + /* irq-disabling. We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ + flags = _##op##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock_bh) + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + * _[spin|read|write]_lock() + * _[spin|read|write]_lock_irq() + * _[spin|read|write]_lock_irqsave() + * _[spin|read|write]_lock_bh() + */ +BUILD_LOCK_OPS(spin, spinlock); +BUILD_LOCK_OPS(read, rwlock); +BUILD_LOCK_OPS(write, rwlock); + +#endif /* CONFIG_PREEMPT */ + +void __lockfunc _spin_unlock(spinlock_t *lock) +{ + _raw_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_spin_unlock); + +void __lockfunc _write_unlock(rwlock_t *lock) +{ + _raw_write_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock); + +void __lockfunc _read_unlock(rwlock_t *lock) +{ + _raw_read_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock); + +void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +{ + _raw_spin_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} +EXPORT_SYMBOL(_spin_unlock_irqrestore); + +void __lockfunc _spin_unlock_irq(spinlock_t *lock) +{ + _raw_spin_unlock(lock); + local_irq_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(_spin_unlock_irq); + +void __lockfunc _spin_unlock_bh(spinlock_t *lock) +{ + _raw_spin_unlock(lock); + preempt_enable(); + local_bh_enable(); +} +EXPORT_SYMBOL(_spin_unlock_bh); + +void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + _raw_read_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock_irqrestore); + +void __lockfunc _read_unlock_irq(rwlock_t *lock) +{ + _raw_read_unlock(lock); + local_irq_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock_irq); + +void __lockfunc _read_unlock_bh(rwlock_t *lock) +{ + _raw_read_unlock(lock); + preempt_enable(); + local_bh_enable(); +} +EXPORT_SYMBOL(_read_unlock_bh); + +void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + _raw_write_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock_irqrestore); + +void __lockfunc _write_unlock_irq(rwlock_t *lock) +{ + _raw_write_unlock(lock); + local_irq_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock_irq); + +void __lockfunc _write_unlock_bh(rwlock_t *lock) +{ + _raw_write_unlock(lock); + preempt_enable(); + local_bh_enable(); +} +EXPORT_SYMBOL(_write_unlock_bh); + +int __lockfunc _spin_trylock_bh(spinlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + if (_raw_spin_trylock(lock)) + return 1; + + preempt_enable(); + local_bh_enable(); + return 0; +} +EXPORT_SYMBOL(_spin_trylock_bh); + +int in_lock_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __lockfunc functions */ + extern char __lock_text_start[], __lock_text_end[]; + + return addr >= (unsigned long)__lock_text_start + && addr < (unsigned long)__lock_text_end; +} +EXPORT_SYMBOL(in_lock_functions); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c new file mode 100644 index 00000000000..c39ed70af17 --- /dev/null +++ b/kernel/stop_machine.c @@ -0,0 +1,212 @@ +#include <linux/stop_machine.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/syscalls.h> +#include <asm/atomic.h> +#include <asm/semaphore.h> +#include <asm/uaccess.h> + +/* Since we effect priority and affinity (both of which are visible + * to, and settable by outside processes) we do indirection via a + * kthread. */ + +/* Thread to stop each CPU in user context. */ +enum stopmachine_state { + STOPMACHINE_WAIT, + STOPMACHINE_PREPARE, + STOPMACHINE_DISABLE_IRQ, + STOPMACHINE_EXIT, +}; + +static enum stopmachine_state stopmachine_state; +static unsigned int stopmachine_num_threads; +static atomic_t stopmachine_thread_ack; +static DECLARE_MUTEX(stopmachine_mutex); + +static int stopmachine(void *cpu) +{ + int irqs_disabled = 0; + int prepared = 0; + + set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); + + /* Ack: we are alive */ + mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ + atomic_inc(&stopmachine_thread_ack); + + /* Simple state machine */ + while (stopmachine_state != STOPMACHINE_EXIT) { + if (stopmachine_state == STOPMACHINE_DISABLE_IRQ + && !irqs_disabled) { + local_irq_disable(); + irqs_disabled = 1; + /* Ack: irqs disabled. */ + mb(); /* Must read state first. */ + atomic_inc(&stopmachine_thread_ack); + } else if (stopmachine_state == STOPMACHINE_PREPARE + && !prepared) { + /* Everyone is in place, hold CPU. */ + preempt_disable(); + prepared = 1; + mb(); /* Must read state first. */ + atomic_inc(&stopmachine_thread_ack); + } + /* Yield in first stage: migration threads need to + * help our sisters onto their CPUs. */ + if (!prepared && !irqs_disabled) + yield(); + else + cpu_relax(); + } + + /* Ack: we are exiting. */ + mb(); /* Must read state first. */ + atomic_inc(&stopmachine_thread_ack); + + if (irqs_disabled) + local_irq_enable(); + if (prepared) + preempt_enable(); + + return 0; +} + +/* Change the thread state */ +static void stopmachine_set_state(enum stopmachine_state state) +{ + atomic_set(&stopmachine_thread_ack, 0); + wmb(); + stopmachine_state = state; + while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) + cpu_relax(); +} + +static int stop_machine(void) +{ + int i, ret = 0; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + mm_segment_t old_fs = get_fs(); + + /* One high-prio thread per cpu. We'll do this one. */ + set_fs(KERNEL_DS); + sys_sched_setscheduler(current->pid, SCHED_FIFO, + (struct sched_param __user *)¶m); + set_fs(old_fs); + + atomic_set(&stopmachine_thread_ack, 0); + stopmachine_num_threads = 0; + stopmachine_state = STOPMACHINE_WAIT; + + for_each_online_cpu(i) { + if (i == _smp_processor_id()) + continue; + ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); + if (ret < 0) + break; + stopmachine_num_threads++; + } + + /* Wait for them all to come to life. */ + while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) + yield(); + + /* If some failed, kill them all. */ + if (ret < 0) { + stopmachine_set_state(STOPMACHINE_EXIT); + up(&stopmachine_mutex); + return ret; + } + + /* Don't schedule us away at this point, please. */ + local_irq_disable(); + + /* Now they are all started, make them hold the CPUs, ready. */ + stopmachine_set_state(STOPMACHINE_PREPARE); + + /* Make them disable irqs. */ + stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); + + return 0; +} + +static void restart_machine(void) +{ + stopmachine_set_state(STOPMACHINE_EXIT); + local_irq_enable(); +} + +struct stop_machine_data +{ + int (*fn)(void *); + void *data; + struct completion done; +}; + +static int do_stop(void *_smdata) +{ + struct stop_machine_data *smdata = _smdata; + int ret; + + ret = stop_machine(); + if (ret == 0) { + ret = smdata->fn(smdata->data); + restart_machine(); + } + + /* We're done: you can kthread_stop us now */ + complete(&smdata->done); + + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return ret; +} + +struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, + unsigned int cpu) +{ + struct stop_machine_data smdata; + struct task_struct *p; + + smdata.fn = fn; + smdata.data = data; + init_completion(&smdata.done); + + down(&stopmachine_mutex); + + /* If they don't care which CPU fn runs on, bind to any online one. */ + if (cpu == NR_CPUS) + cpu = _smp_processor_id(); + + p = kthread_create(do_stop, &smdata, "kstopmachine"); + if (!IS_ERR(p)) { + kthread_bind(p, cpu); + wake_up_process(p); + wait_for_completion(&smdata.done); + } + up(&stopmachine_mutex); + return p; +} + +int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +{ + struct task_struct *p; + int ret; + + /* No CPUs can come up or down during this. */ + lock_cpu_hotplug(); + p = __stop_machine_run(fn, data, cpu); + if (!IS_ERR(p)) + ret = kthread_stop(p); + else + ret = PTR_ERR(p); + unlock_cpu_hotplug(); + + return ret; +} diff --git a/kernel/sys.c b/kernel/sys.c new file mode 100644 index 00000000000..462d78d5589 --- /dev/null +++ b/kernel/sys.c @@ -0,0 +1,1725 @@ +/* + * linux/kernel/sys.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/utsname.h> +#include <linux/mman.h> +#include <linux/smp_lock.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/prctl.h> +#include <linux/init.h> +#include <linux/highuid.h> +#include <linux/fs.h> +#include <linux/workqueue.h> +#include <linux/device.h> +#include <linux/key.h> +#include <linux/times.h> +#include <linux/posix-timers.h> +#include <linux/security.h> +#include <linux/dcookies.h> +#include <linux/suspend.h> +#include <linux/tty.h> + +#include <linux/compat.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/unistd.h> + +#ifndef SET_UNALIGN_CTL +# define SET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_UNALIGN_CTL +# define GET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEMU_CTL +# define SET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEMU_CTL +# define GET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEXC_CTL +# define SET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEXC_CTL +# define GET_FPEXC_CTL(a,b) (-EINVAL) +#endif + +/* + * this is where the system-wide overflow UID and GID are defined, for + * architectures that now have 32-bit UID/GID but didn't in the past + */ + +int overflowuid = DEFAULT_OVERFLOWUID; +int overflowgid = DEFAULT_OVERFLOWGID; + +#ifdef CONFIG_UID16 +EXPORT_SYMBOL(overflowuid); +EXPORT_SYMBOL(overflowgid); +#endif + +/* + * the same as above, but for filesystems which can only store a 16-bit + * UID and GID. as such, this is needed on all architectures + */ + +int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; +int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; + +EXPORT_SYMBOL(fs_overflowuid); +EXPORT_SYMBOL(fs_overflowgid); + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +int cad_pid = 1; + +/* + * Notifier list for kernel code which wants to be called + * at shutdown. This is used to stop any idling DMA operations + * and the like. + */ + +static struct notifier_block *reboot_notifier_list; +static DEFINE_RWLOCK(notifier_lock); + +/** + * notifier_chain_register - Add notifier to a notifier chain + * @list: Pointer to root list pointer + * @n: New entry in notifier chain + * + * Adds a notifier to a notifier chain. + * + * Currently always returns zero. + */ + +int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) +{ + write_lock(¬ifier_lock); + while(*list) + { + if(n->priority > (*list)->priority) + break; + list= &((*list)->next); + } + n->next = *list; + *list=n; + write_unlock(¬ifier_lock); + return 0; +} + +EXPORT_SYMBOL(notifier_chain_register); + +/** + * notifier_chain_unregister - Remove notifier from a notifier chain + * @nl: Pointer to root list pointer + * @n: New entry in notifier chain + * + * Removes a notifier from a notifier chain. + * + * Returns zero on success, or %-ENOENT on failure. + */ + +int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) +{ + write_lock(¬ifier_lock); + while((*nl)!=NULL) + { + if((*nl)==n) + { + *nl=n->next; + write_unlock(¬ifier_lock); + return 0; + } + nl=&((*nl)->next); + } + write_unlock(¬ifier_lock); + return -ENOENT; +} + +EXPORT_SYMBOL(notifier_chain_unregister); + +/** + * notifier_call_chain - Call functions in a notifier chain + * @n: Pointer to root pointer of notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. + * + * If the return value of the notifier can be and'd + * with %NOTIFY_STOP_MASK, then notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise, the return value is the return value + * of the last notifier function called. + */ + +int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +{ + int ret=NOTIFY_DONE; + struct notifier_block *nb = *n; + + while(nb) + { + ret=nb->notifier_call(nb,val,v); + if(ret&NOTIFY_STOP_MASK) + { + return ret; + } + nb=nb->next; + } + return ret; +} + +EXPORT_SYMBOL(notifier_call_chain); + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as notifier_chain_register + * always returns zero. + */ + +int register_reboot_notifier(struct notifier_block * nb) +{ + return notifier_chain_register(&reboot_notifier_list, nb); +} + +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ + +int unregister_reboot_notifier(struct notifier_block * nb) +{ + return notifier_chain_unregister(&reboot_notifier_list, nb); +} + +EXPORT_SYMBOL(unregister_reboot_notifier); + +static int set_one_prio(struct task_struct *p, int niceval, int error) +{ + int no_nice; + + if (p->uid != current->euid && + p->euid != current->euid && !capable(CAP_SYS_NICE)) { + error = -EPERM; + goto out; + } + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) { + error = -EACCES; + goto out; + } + no_nice = security_task_setnice(p, niceval); + if (no_nice) { + error = no_nice; + goto out; + } + if (error == -ESRCH) + error = 0; + set_user_nice(p, niceval); +out: + return error; +} + +asmlinkage long sys_setpriority(int which, int who, int niceval) +{ + struct task_struct *g, *p; + struct user_struct *user; + int error = -EINVAL; + + if (which > 2 || which < 0) + goto out; + + /* normalize: avoid signed division (rounding problems) */ + error = -ESRCH; + if (niceval < -20) + niceval = -20; + if (niceval > 19) + niceval = 19; + + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (!who) + who = current->pid; + p = find_task_by_pid(who); + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (!who) + who = process_group(current); + do_each_task_pid(who, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_task_pid(who, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = current->user; + if (!who) + who = current->uid; + else + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) + if (p->uid == who) + error = set_one_prio(p, niceval, error); + while_each_thread(g, p); + if (who != current->uid) + free_uid(user); /* For find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); +out: + return error; +} + +/* + * Ugh. To avoid negative return values, "getpriority()" will + * not return the normal nice-value, but a negated value that + * has been offset by 20 (ie it returns 40..1 instead of -20..19) + * to stay compatible. + */ +asmlinkage long sys_getpriority(int which, int who) +{ + struct task_struct *g, *p; + struct user_struct *user; + long niceval, retval = -ESRCH; + + if (which > 2 || which < 0) + return -EINVAL; + + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (!who) + who = current->pid; + p = find_task_by_pid(who); + if (p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (!who) + who = process_group(current); + do_each_task_pid(who, PIDTYPE_PGID, p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } while_each_task_pid(who, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = current->user; + if (!who) + who = current->uid; + else + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) + if (p->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + while_each_thread(g, p); + if (who != current->uid) + free_uid(user); /* for find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); + + return retval; +} + + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) +{ + char buffer[256]; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + lock_kernel(); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + printk(KERN_EMERG "Restarting system.\n"); + machine_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); + system_state = SYSTEM_HALT; + device_shutdown(); + printk(KERN_EMERG "System halted.\n"); + machine_halt(); + unlock_kernel(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); + system_state = SYSTEM_POWER_OFF; + device_shutdown(); + printk(KERN_EMERG "Power down.\n"); + machine_power_off(); + unlock_kernel(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + unlock_kernel(); + return -EFAULT; + } + buffer[sizeof(buffer) - 1] = '\0'; + + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); + system_state = SYSTEM_RESTART; + device_shutdown(); + printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); + machine_restart(buffer); + break; + +#ifdef CONFIG_SOFTWARE_SUSPEND + case LINUX_REBOOT_CMD_SW_SUSPEND: + { + int ret = software_suspend(); + unlock_kernel(); + return ret; + } +#endif + + default: + unlock_kernel(); + return -EINVAL; + } + unlock_kernel(); + return 0; +} + +static void deferred_cad(void *dummy) +{ + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + machine_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad, NULL); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_proc(cad_pid, SIGINT, 1); +} + + +/* + * Unprivileged users may change the real gid to the effective gid + * or vice versa. (BSD-style) + * + * If you set the real gid at all, or set the effective gid to a value not + * equal to the real gid, then the saved gid is set to the new effective gid. + * + * This makes it possible for a setgid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setregid() will be + * 100% compatible with BSD. A program which uses just setgid() will be + * 100% compatible with POSIX with saved IDs. + * + * SMP: There are not races, the GIDs are checked only by filesystem + * operations (as far as semantic preservation is concerned). + */ +asmlinkage long sys_setregid(gid_t rgid, gid_t egid) +{ + int old_rgid = current->gid; + int old_egid = current->egid; + int new_rgid = old_rgid; + int new_egid = old_egid; + int retval; + + retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); + if (retval) + return retval; + + if (rgid != (gid_t) -1) { + if ((old_rgid == rgid) || + (current->egid==rgid) || + capable(CAP_SETGID)) + new_rgid = rgid; + else + return -EPERM; + } + if (egid != (gid_t) -1) { + if ((old_rgid == egid) || + (current->egid == egid) || + (current->sgid == egid) || + capable(CAP_SETGID)) + new_egid = egid; + else { + return -EPERM; + } + } + if (new_egid != old_egid) + { + current->mm->dumpable = 0; + wmb(); + } + if (rgid != (gid_t) -1 || + (egid != (gid_t) -1 && egid != old_rgid)) + current->sgid = new_egid; + current->fsgid = new_egid; + current->egid = new_egid; + current->gid = new_rgid; + key_fsgid_changed(current); + return 0; +} + +/* + * setgid() is implemented like SysV w/ SAVED_IDS + * + * SMP: Same implicit races as above. + */ +asmlinkage long sys_setgid(gid_t gid) +{ + int old_egid = current->egid; + int retval; + + retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); + if (retval) + return retval; + + if (capable(CAP_SETGID)) + { + if(old_egid != gid) + { + current->mm->dumpable=0; + wmb(); + } + current->gid = current->egid = current->sgid = current->fsgid = gid; + } + else if ((gid == current->gid) || (gid == current->sgid)) + { + if(old_egid != gid) + { + current->mm->dumpable=0; + wmb(); + } + current->egid = current->fsgid = gid; + } + else + return -EPERM; + + key_fsgid_changed(current); + return 0; +} + +static int set_user(uid_t new_ruid, int dumpclear) +{ + struct user_struct *new_user; + + new_user = alloc_uid(new_ruid); + if (!new_user) + return -EAGAIN; + + if (atomic_read(&new_user->processes) >= + current->signal->rlim[RLIMIT_NPROC].rlim_cur && + new_user != &root_user) { + free_uid(new_user); + return -EAGAIN; + } + + switch_uid(new_user); + + if(dumpclear) + { + current->mm->dumpable = 0; + wmb(); + } + current->uid = new_ruid; + return 0; +} + +/* + * Unprivileged users may change the real uid to the effective uid + * or vice versa. (BSD-style) + * + * If you set the real uid at all, or set the effective uid to a value not + * equal to the real uid, then the saved uid is set to the new effective uid. + * + * This makes it possible for a setuid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setreuid() will be + * 100% compatible with BSD. A program which uses just setuid() will be + * 100% compatible with POSIX with saved IDs. + */ +asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) +{ + int old_ruid, old_euid, old_suid, new_ruid, new_euid; + int retval; + + retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); + if (retval) + return retval; + + new_ruid = old_ruid = current->uid; + new_euid = old_euid = current->euid; + old_suid = current->suid; + + if (ruid != (uid_t) -1) { + new_ruid = ruid; + if ((old_ruid != ruid) && + (current->euid != ruid) && + !capable(CAP_SETUID)) + return -EPERM; + } + + if (euid != (uid_t) -1) { + new_euid = euid; + if ((old_ruid != euid) && + (current->euid != euid) && + (current->suid != euid) && + !capable(CAP_SETUID)) + return -EPERM; + } + + if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) + return -EAGAIN; + + if (new_euid != old_euid) + { + current->mm->dumpable=0; + wmb(); + } + current->fsuid = current->euid = new_euid; + if (ruid != (uid_t) -1 || + (euid != (uid_t) -1 && euid != old_ruid)) + current->suid = current->euid; + current->fsuid = current->euid; + + key_fsuid_changed(current); + + return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); +} + + + +/* + * setuid() is implemented like SysV with SAVED_IDS + * + * Note that SAVED_ID's is deficient in that a setuid root program + * like sendmail, for example, cannot set its uid to be a normal + * user and then switch back, because if you're root, setuid() sets + * the saved uid too. If you don't like this, blame the bright people + * in the POSIX committee and/or USG. Note that the BSD-style setreuid() + * will allow a root program to temporarily drop privileges and be able to + * regain them by swapping the real and effective uid. + */ +asmlinkage long sys_setuid(uid_t uid) +{ + int old_euid = current->euid; + int old_ruid, old_suid, new_ruid, new_suid; + int retval; + + retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); + if (retval) + return retval; + + old_ruid = new_ruid = current->uid; + old_suid = current->suid; + new_suid = old_suid; + + if (capable(CAP_SETUID)) { + if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) + return -EAGAIN; + new_suid = uid; + } else if ((uid != current->uid) && (uid != new_suid)) + return -EPERM; + + if (old_euid != uid) + { + current->mm->dumpable = 0; + wmb(); + } + current->fsuid = current->euid = uid; + current->suid = new_suid; + + key_fsuid_changed(current); + + return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); +} + + +/* + * This function implements a generic ability to update ruid, euid, + * and suid. This allows you to implement the 4.4 compatible seteuid(). + */ +asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) +{ + int old_ruid = current->uid; + int old_euid = current->euid; + int old_suid = current->suid; + int retval; + + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); + if (retval) + return retval; + + if (!capable(CAP_SETUID)) { + if ((ruid != (uid_t) -1) && (ruid != current->uid) && + (ruid != current->euid) && (ruid != current->suid)) + return -EPERM; + if ((euid != (uid_t) -1) && (euid != current->uid) && + (euid != current->euid) && (euid != current->suid)) + return -EPERM; + if ((suid != (uid_t) -1) && (suid != current->uid) && + (suid != current->euid) && (suid != current->suid)) + return -EPERM; + } + if (ruid != (uid_t) -1) { + if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) + return -EAGAIN; + } + if (euid != (uid_t) -1) { + if (euid != current->euid) + { + current->mm->dumpable = 0; + wmb(); + } + current->euid = euid; + } + current->fsuid = current->euid; + if (suid != (uid_t) -1) + current->suid = suid; + + key_fsuid_changed(current); + + return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); +} + +asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) +{ + int retval; + + if (!(retval = put_user(current->uid, ruid)) && + !(retval = put_user(current->euid, euid))) + retval = put_user(current->suid, suid); + + return retval; +} + +/* + * Same as above, but for rgid, egid, sgid. + */ +asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) +{ + int retval; + + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); + if (retval) + return retval; + + if (!capable(CAP_SETGID)) { + if ((rgid != (gid_t) -1) && (rgid != current->gid) && + (rgid != current->egid) && (rgid != current->sgid)) + return -EPERM; + if ((egid != (gid_t) -1) && (egid != current->gid) && + (egid != current->egid) && (egid != current->sgid)) + return -EPERM; + if ((sgid != (gid_t) -1) && (sgid != current->gid) && + (sgid != current->egid) && (sgid != current->sgid)) + return -EPERM; + } + if (egid != (gid_t) -1) { + if (egid != current->egid) + { + current->mm->dumpable = 0; + wmb(); + } + current->egid = egid; + } + current->fsgid = current->egid; + if (rgid != (gid_t) -1) + current->gid = rgid; + if (sgid != (gid_t) -1) + current->sgid = sgid; + + key_fsgid_changed(current); + return 0; +} + +asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) +{ + int retval; + + if (!(retval = put_user(current->gid, rgid)) && + !(retval = put_user(current->egid, egid))) + retval = put_user(current->sgid, sgid); + + return retval; +} + + +/* + * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This + * is used for "access()" and for the NFS daemon (letting nfsd stay at + * whatever uid it wants to). It normally shadows "euid", except when + * explicitly set by setfsuid() or for access.. + */ +asmlinkage long sys_setfsuid(uid_t uid) +{ + int old_fsuid; + + old_fsuid = current->fsuid; + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) + return old_fsuid; + + if (uid == current->uid || uid == current->euid || + uid == current->suid || uid == current->fsuid || + capable(CAP_SETUID)) + { + if (uid != old_fsuid) + { + current->mm->dumpable = 0; + wmb(); + } + current->fsuid = uid; + } + + key_fsuid_changed(current); + + security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); + + return old_fsuid; +} + +/* + * Samma p� svenska.. + */ +asmlinkage long sys_setfsgid(gid_t gid) +{ + int old_fsgid; + + old_fsgid = current->fsgid; + if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) + return old_fsgid; + + if (gid == current->gid || gid == current->egid || + gid == current->sgid || gid == current->fsgid || + capable(CAP_SETGID)) + { + if (gid != old_fsgid) + { + current->mm->dumpable = 0; + wmb(); + } + current->fsgid = gid; + key_fsgid_changed(current); + } + return old_fsgid; +} + +asmlinkage long sys_times(struct tms __user * tbuf) +{ + /* + * In the SMP world we might just be unlucky and have one of + * the times increment as we use it. Since the value is an + * atomically safe type this is just fine. Conceptually its + * as if the syscall took an instant longer to occur. + */ + if (tbuf) { + struct tms tmp; + struct task_struct *tsk = current; + struct task_struct *t; + cputime_t utime, stime, cutime, cstime; + + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + + tmp.tms_utime = cputime_to_clock_t(utime); + tmp.tms_stime = cputime_to_clock_t(stime); + tmp.tms_cutime = cputime_to_clock_t(cutime); + tmp.tms_cstime = cputime_to_clock_t(cstime); + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } + return (long) jiffies_64_to_clock_t(get_jiffies_64()); +} + +/* + * This needs some heavy checking ... + * I just haven't the stomach for it. I also don't fully + * understand sessions/pgrp etc. Let somebody who does explain it. + * + * OK, I think I have the protection semantics right.... this is really + * only important on a multi-user system anyway, to make sure one user + * can't send a signal to a process owned by another. -TYT, 12/12/91 + * + * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. + * LBT 04.03.94 + */ + +asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) +{ + struct task_struct *p; + int err = -EINVAL; + + if (!pid) + pid = current->pid; + if (!pgid) + pgid = pid; + if (pgid < 0) + return -EINVAL; + + /* From this point forward we keep holding onto the tasklist lock + * so that our parent does not change from under us. -DaveM + */ + write_lock_irq(&tasklist_lock); + + err = -ESRCH; + p = find_task_by_pid(pid); + if (!p) + goto out; + + err = -EINVAL; + if (!thread_group_leader(p)) + goto out; + + if (p->parent == current || p->real_parent == current) { + err = -EPERM; + if (p->signal->session != current->signal->session) + goto out; + err = -EACCES; + if (p->did_exec) + goto out; + } else { + err = -ESRCH; + if (p != current) + goto out; + } + + err = -EPERM; + if (p->signal->leader) + goto out; + + if (pgid != pid) { + struct task_struct *p; + + do_each_task_pid(pgid, PIDTYPE_PGID, p) { + if (p->signal->session == current->signal->session) + goto ok_pgid; + } while_each_task_pid(pgid, PIDTYPE_PGID, p); + goto out; + } + +ok_pgid: + err = security_task_setpgid(p, pgid); + if (err) + goto out; + + if (process_group(p) != pgid) { + detach_pid(p, PIDTYPE_PGID); + p->signal->pgrp = pgid; + attach_pid(p, PIDTYPE_PGID, pgid); + } + + err = 0; +out: + /* All paths lead to here, thus we are safe. -DaveM */ + write_unlock_irq(&tasklist_lock); + return err; +} + +asmlinkage long sys_getpgid(pid_t pid) +{ + if (!pid) { + return process_group(current); + } else { + int retval; + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + + retval = -ESRCH; + if (p) { + retval = security_task_getpgid(p); + if (!retval) + retval = process_group(p); + } + read_unlock(&tasklist_lock); + return retval; + } +} + +#ifdef __ARCH_WANT_SYS_GETPGRP + +asmlinkage long sys_getpgrp(void) +{ + /* SMP - assuming writes are word atomic this is fine */ + return process_group(current); +} + +#endif + +asmlinkage long sys_getsid(pid_t pid) +{ + if (!pid) { + return current->signal->session; + } else { + int retval; + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + + retval = -ESRCH; + if(p) { + retval = security_task_getsid(p); + if (!retval) + retval = p->signal->session; + } + read_unlock(&tasklist_lock); + return retval; + } +} + +asmlinkage long sys_setsid(void) +{ + struct pid *pid; + int err = -EPERM; + + if (!thread_group_leader(current)) + return -EINVAL; + + down(&tty_sem); + write_lock_irq(&tasklist_lock); + + pid = find_pid(PIDTYPE_PGID, current->pid); + if (pid) + goto out; + + current->signal->leader = 1; + __set_special_pids(current->pid, current->pid); + current->signal->tty = NULL; + current->signal->tty_old_pgrp = 0; + err = process_group(current); +out: + write_unlock_irq(&tasklist_lock); + up(&tty_sem); + return err; +} + +/* + * Supplementary group IDs + */ + +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + +struct group_info *groups_alloc(int gidsetsize) +{ + struct group_info *group_info; + int nblocks; + int i; + + nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; + /* Make sure we always allocate at least one indirect block pointer */ + nblocks = nblocks ? : 1; + group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); + if (!group_info) + return NULL; + group_info->ngroups = gidsetsize; + group_info->nblocks = nblocks; + atomic_set(&group_info->usage, 1); + + if (gidsetsize <= NGROUPS_SMALL) { + group_info->blocks[0] = group_info->small_block; + } else { + for (i = 0; i < nblocks; i++) { + gid_t *b; + b = (void *)__get_free_page(GFP_USER); + if (!b) + goto out_undo_partial_alloc; + group_info->blocks[i] = b; + } + } + return group_info; + +out_undo_partial_alloc: + while (--i >= 0) { + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); + return NULL; +} + +EXPORT_SYMBOL(groups_alloc); + +void groups_free(struct group_info *group_info) +{ + if (group_info->blocks[0] != group_info->small_block) { + int i; + for (i = 0; i < group_info->nblocks; i++) + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); +} + +EXPORT_SYMBOL(groups_free); + +/* export the group_info to a user-space array */ +static int groups_to_user(gid_t __user *grouplist, + struct group_info *group_info) +{ + int i; + int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + int cp_count = min(NGROUPS_PER_BLOCK, count); + int off = i * NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*grouplist); + + if (copy_to_user(grouplist+off, group_info->blocks[i], len)) + return -EFAULT; + + count -= cp_count; + } + return 0; +} + +/* fill a group_info from a user-space array - it must be allocated already */ +static int groups_from_user(struct group_info *group_info, + gid_t __user *grouplist) + { + int i; + int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + int cp_count = min(NGROUPS_PER_BLOCK, count); + int off = i * NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*grouplist); + + if (copy_from_user(group_info->blocks[i], grouplist+off, len)) + return -EFAULT; + + count -= cp_count; + } + return 0; +} + +/* a simple shell-metzner sort */ +static void groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = GROUP_AT(group_info, right); + + while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + GROUP_AT(group_info, right) = + GROUP_AT(group_info, left); + right = left; + left -= stride; + } + GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} + +/* a simple bsearch */ +static int groups_search(struct group_info *group_info, gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left+right)/2; + int cmp = grp - GROUP_AT(group_info, mid); + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +/* validate and set current->group_info */ +int set_current_groups(struct group_info *group_info) +{ + int retval; + struct group_info *old_info; + + retval = security_task_setgroups(group_info); + if (retval) + return retval; + + groups_sort(group_info); + get_group_info(group_info); + + task_lock(current); + old_info = current->group_info; + current->group_info = group_info; + task_unlock(current); + + put_group_info(old_info); + + return 0; +} + +EXPORT_SYMBOL(set_current_groups); + +asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) +{ + int i = 0; + + /* + * SMP: Nobody else can change our grouplist. Thus we are + * safe. + */ + + if (gidsetsize < 0) + return -EINVAL; + + /* no need to grab task_lock here; it cannot change */ + get_group_info(current->group_info); + i = current->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups_to_user(grouplist, current->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + put_group_info(current->group_info); + return i; +} + +/* + * SMP: Our groups are copy-on-write. We can set them safely + * without another task interfering. + */ + +asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) +{ + struct group_info *group_info; + int retval; + + if (!capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +/* + * Check whether we're fsgid/egid or in the supplemental group.. + */ +int in_group_p(gid_t grp) +{ + int retval = 1; + if (grp != current->fsgid) { + get_group_info(current->group_info); + retval = groups_search(current->group_info, grp); + put_group_info(current->group_info); + } + return retval; +} + +EXPORT_SYMBOL(in_group_p); + +int in_egroup_p(gid_t grp) +{ + int retval = 1; + if (grp != current->egid) { + get_group_info(current->group_info); + retval = groups_search(current->group_info, grp); + put_group_info(current->group_info); + } + return retval; +} + +EXPORT_SYMBOL(in_egroup_p); + +DECLARE_RWSEM(uts_sem); + +EXPORT_SYMBOL(uts_sem); + +asmlinkage long sys_newuname(struct new_utsname __user * name) +{ + int errno = 0; + + down_read(&uts_sem); + if (copy_to_user(name,&system_utsname,sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +asmlinkage long sys_sethostname(char __user *name, int len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + memcpy(system_utsname.nodename, tmp, len); + system_utsname.nodename[len] = 0; + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +#ifdef __ARCH_WANT_SYS_GETHOSTNAME + +asmlinkage long sys_gethostname(char __user *name, int len) +{ + int i, errno; + + if (len < 0) + return -EINVAL; + down_read(&uts_sem); + i = 1 + strlen(system_utsname.nodename); + if (i > len) + i = len; + errno = 0; + if (copy_to_user(name, system_utsname.nodename, i)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +#endif + +/* + * Only setdomainname; getdomainname can be implemented by calling + * uname() + */ +asmlinkage long sys_setdomainname(char __user *name, int len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + memcpy(system_utsname.domainname, tmp, len); + system_utsname.domainname[len] = 0; + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) +{ + if (resource >= RLIM_NLIMITS) + return -EINVAL; + else { + struct rlimit value; + task_lock(current->group_leader); + value = current->signal->rlim[resource]; + task_unlock(current->group_leader); + return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + } +} + +#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT + +/* + * Back compatibility for getrlimit. Needed for some apps. + */ + +asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) +{ + struct rlimit x; + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if(x.rlim_cur > 0x7FFFFFFF) + x.rlim_cur = 0x7FFFFFFF; + if(x.rlim_max > 0x7FFFFFFF) + x.rlim_max = 0x7FFFFFFF; + return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; +} + +#endif + +asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) +{ + struct rlimit new_rlim, *old_rlim; + int retval; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; + old_rlim = current->signal->rlim + resource; + if ((new_rlim.rlim_max > old_rlim->rlim_max) && + !capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) + return -EPERM; + + retval = security_task_setrlimit(resource, &new_rlim); + if (retval) + return retval; + + task_lock(current->group_leader); + *old_rlim = new_rlim; + task_unlock(current->group_leader); + + if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && + (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + new_rlim.rlim_cur <= cputime_to_secs( + current->signal->it_prof_expires))) { + cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); + read_lock(&tasklist_lock); + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, + &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); + read_unlock(&tasklist_lock); + } + + return 0; +} + +/* + * It would make sense to put struct rusage in the task_struct, + * except that would make the task_struct be *really big*. After + * task_struct gets moved into malloc'ed memory, it would + * make sense to do this. It will make moving the rest of the information + * a lot simpler! (Which we're not doing right now because we're not + * measuring them yet). + * + * This expects to be called with tasklist_lock read-locked or better, + * and the siglock not locked. It may momentarily take the siglock. + * + * When sampling multiple threads for RUSAGE_SELF, under SMP we might have + * races with threads incrementing their own counters. But since word + * reads are atomic, we either get new values or old values and we don't + * care which for the sums. We always take the siglock to protect reading + * the c* fields from p->signal from races with exit.c updating those + * fields when reaping, so a sample either gets all the additions of a + * given child after it's reaped, or none so this sample is before reaping. + */ + +static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +{ + struct task_struct *t; + unsigned long flags; + cputime_t utime, stime; + + memset((char *) r, 0, sizeof *r); + + if (unlikely(!p->signal)) + return; + + switch (who) { + case RUSAGE_CHILDREN: + spin_lock_irqsave(&p->sighand->siglock, flags); + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + spin_unlock_irqrestore(&p->sighand->siglock, flags); + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); + break; + case RUSAGE_SELF: + spin_lock_irqsave(&p->sighand->siglock, flags); + utime = stime = cputime_zero; + goto sum_group; + case RUSAGE_BOTH: + spin_lock_irqsave(&p->sighand->siglock, flags); + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + sum_group: + utime = cputime_add(utime, p->signal->utime); + stime = cputime_add(stime, p->signal->stime); + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + t = p; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + t = next_thread(t); + } while (t != p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); + break; + default: + BUG(); + } +} + +int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +{ + struct rusage r; + read_lock(&tasklist_lock); + k_getrusage(p, who, &r); + read_unlock(&tasklist_lock); + return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; +} + +asmlinkage long sys_getrusage(int who, struct rusage __user *ru) +{ + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) + return -EINVAL; + return getrusage(current, who, ru); +} + +asmlinkage long sys_umask(int mask) +{ + mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); + return mask; +} + +asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + long error; + int sig; + + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error) + return error; + + switch (option) { + case PR_SET_PDEATHSIG: + sig = arg2; + if (sig < 0 || sig > _NSIG) { + error = -EINVAL; + break; + } + current->pdeath_signal = sig; + break; + case PR_GET_PDEATHSIG: + error = put_user(current->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + if (current->mm->dumpable) + error = 1; + break; + case PR_SET_DUMPABLE: + if (arg2 != 0 && arg2 != 1) { + error = -EINVAL; + break; + } + current->mm->dumpable = arg2; + break; + + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(current, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(current, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(current, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(current, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(current, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(current, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 == PR_TIMING_STATISTICAL) + error = 0; + else + error = -EINVAL; + break; + + case PR_GET_KEEPCAPS: + if (current->keep_capabilities) + error = 1; + break; + case PR_SET_KEEPCAPS: + if (arg2 != 0 && arg2 != 1) { + error = -EINVAL; + break; + } + current->keep_capabilities = arg2; + break; + case PR_SET_NAME: { + struct task_struct *me = current; + unsigned char ncomm[sizeof(me->comm)]; + + ncomm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(ncomm, (char __user *)arg2, + sizeof(me->comm)-1) < 0) + return -EFAULT; + set_task_comm(me, ncomm); + return 0; + } + case PR_GET_NAME: { + struct task_struct *me = current; + unsigned char tcomm[sizeof(me->comm)]; + + get_task_comm(tcomm, me); + if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) + return -EFAULT; + return 0; + } + default: + error = -EINVAL; + break; + } + return error; +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c new file mode 100644 index 00000000000..1802a311dd3 --- /dev/null +++ b/kernel/sys_ni.c @@ -0,0 +1,86 @@ + +#include <linux/linkage.h> +#include <linux/errno.h> + +#include <asm/unistd.h> + +/* + * Non-implemented system calls get redirected here. + */ +asmlinkage long sys_ni_syscall(void) +{ + return -ENOSYS; +} + +cond_syscall(sys_nfsservctl); +cond_syscall(sys_quotactl); +cond_syscall(sys_acct); +cond_syscall(sys_lookup_dcookie); +cond_syscall(sys_swapon); +cond_syscall(sys_swapoff); +cond_syscall(sys_init_module); +cond_syscall(sys_delete_module); +cond_syscall(sys_socketpair); +cond_syscall(sys_bind); +cond_syscall(sys_listen); +cond_syscall(sys_accept); +cond_syscall(sys_connect); +cond_syscall(sys_getsockname); +cond_syscall(sys_getpeername); +cond_syscall(sys_sendto); +cond_syscall(sys_send); +cond_syscall(sys_recvfrom); +cond_syscall(sys_recv); +cond_syscall(sys_socket); +cond_syscall(sys_setsockopt); +cond_syscall(sys_getsockopt); +cond_syscall(sys_shutdown); +cond_syscall(sys_sendmsg); +cond_syscall(sys_recvmsg); +cond_syscall(sys_socketcall); +cond_syscall(sys_futex); +cond_syscall(compat_sys_futex); +cond_syscall(sys_epoll_create); +cond_syscall(sys_epoll_ctl); +cond_syscall(sys_epoll_wait); +cond_syscall(sys_semget); +cond_syscall(sys_semop); +cond_syscall(sys_semtimedop); +cond_syscall(sys_semctl); +cond_syscall(sys_msgget); +cond_syscall(sys_msgsnd); +cond_syscall(sys_msgrcv); +cond_syscall(sys_msgctl); +cond_syscall(sys_shmget); +cond_syscall(sys_shmdt); +cond_syscall(sys_shmctl); +cond_syscall(sys_mq_open); +cond_syscall(sys_mq_unlink); +cond_syscall(sys_mq_timedsend); +cond_syscall(sys_mq_timedreceive); +cond_syscall(sys_mq_notify); +cond_syscall(sys_mq_getsetattr); +cond_syscall(compat_sys_mq_open); +cond_syscall(compat_sys_mq_timedsend); +cond_syscall(compat_sys_mq_timedreceive); +cond_syscall(compat_sys_mq_notify); +cond_syscall(compat_sys_mq_getsetattr); +cond_syscall(sys_mbind); +cond_syscall(sys_get_mempolicy); +cond_syscall(sys_set_mempolicy); +cond_syscall(compat_sys_mbind); +cond_syscall(compat_sys_get_mempolicy); +cond_syscall(compat_sys_set_mempolicy); +cond_syscall(sys_add_key); +cond_syscall(sys_request_key); +cond_syscall(sys_keyctl); +cond_syscall(compat_sys_keyctl); +cond_syscall(compat_sys_socketcall); + +/* arch-specific weak syscall entries */ +cond_syscall(sys_pciconfig_read); +cond_syscall(sys_pciconfig_write); +cond_syscall(sys_pciconfig_iobase); +cond_syscall(sys32_ipc); +cond_syscall(sys32_sysctl); +cond_syscall(ppc_rtas); diff --git a/kernel/sysctl.c b/kernel/sysctl.c new file mode 100644 index 00000000000..79dbd93bd69 --- /dev/null +++ b/kernel/sysctl.c @@ -0,0 +1,2337 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * Added /proc support, Dec 1995 + * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. + * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. + * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. + * Dynamic registration fixes, Stephen Tweedie. + * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. + * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris + * Horn. + * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. + * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. + * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill + * Wendling. + * The list_for_each() macro wasn't appropriate for the sysctl loop. + * Removed it and replaced it with older style, 03/23/00, Bill Wendling + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> +#include <linux/utsname.h> +#include <linux/capability.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sysrq.h> +#include <linux/highuid.h> +#include <linux/writeback.h> +#include <linux/hugetlb.h> +#include <linux/security.h> +#include <linux/initrd.h> +#include <linux/times.h> +#include <linux/limits.h> +#include <linux/dcache.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> +#include <asm/processor.h> + +#ifdef CONFIG_ROOT_NFS +#include <linux/nfs_fs.h> +#endif + +#if defined(CONFIG_SYSCTL) + +/* External variables not in a header file. */ +extern int C_A_D; +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern int max_threads; +extern int sysrq_enabled; +extern int core_uses_pid; +extern char core_pattern[]; +extern int cad_pid; +extern int pid_max; +extern int min_free_kbytes; +extern int printk_ratelimit_jiffies; +extern int printk_ratelimit_burst; +extern int pid_max_min, pid_max_max; + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +int unknown_nmi_panic; +extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +#endif + +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +static int maxolduid = 65535; +static int minolduid; + +static int ngroups_max = NGROUPS_MAX; + +#ifdef CONFIG_KMOD +extern char modprobe_path[]; +#endif +#ifdef CONFIG_HOTPLUG +extern char hotplug_path[]; +#endif +#ifdef CONFIG_CHR_DEV_SG +extern int sg_big_buff; +#endif +#ifdef CONFIG_SYSVIPC +extern size_t shm_ctlmax; +extern size_t shm_ctlall; +extern int shm_ctlmni; +extern int msg_ctlmax; +extern int msg_ctlmnb; +extern int msg_ctlmni; +extern int sem_ctls[]; +#endif + +#ifdef __sparc__ +extern char reboot_command []; +extern int stop_a_enabled; +extern int scons_pwroff; +#endif + +#ifdef __hppa__ +extern int pwrsw_enabled; +extern int unaligned_enabled; +#endif + +#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_MATHEMU +extern int sysctl_ieee_emulation_warnings; +#endif +extern int sysctl_userprocess_debug; +#endif + +extern int sysctl_hz_timer; + +#ifdef CONFIG_BSD_PROCESS_ACCT +extern int acct_parm[]; +#endif + +int randomize_va_space = 1; + +static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, + ctl_table *, void **); +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static ctl_table root_table[]; +static struct ctl_table_header root_table_header = + { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; + +static ctl_table kern_table[]; +static ctl_table vm_table[]; +#ifdef CONFIG_NET +extern ctl_table net_table[]; +#endif +static ctl_table proc_table[]; +static ctl_table fs_table[]; +static ctl_table debug_table[]; +static ctl_table dev_table[]; +extern ctl_table random_table[]; +#ifdef CONFIG_UNIX98_PTYS +extern ctl_table pty_table[]; +#endif + +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +int sysctl_legacy_va_layout; +#endif + +/* /proc declarations: */ + +#ifdef CONFIG_PROC_FS + +static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); +static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); +static int proc_opensys(struct inode *, struct file *); + +struct file_operations proc_sys_file_operations = { + .open = proc_opensys, + .read = proc_readsys, + .write = proc_writesys, +}; + +extern struct proc_dir_entry *proc_sys_root; + +static void register_proc_table(ctl_table *, struct proc_dir_entry *); +static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); +#endif + +/* The default sysctl tables: */ + +static ctl_table root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .ctl_name = CTL_VM, + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, +#ifdef CONFIG_NET + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = net_table, + }, +#endif + { + .ctl_name = CTL_PROC, + .procname = "proc", + .mode = 0555, + .child = proc_table, + }, + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .ctl_name = CTL_DEV, + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table kern_table[] = { + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = system_utsname.sysname, + .maxlen = sizeof(system_utsname.sysname), + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = system_utsname.release, + .maxlen = sizeof(system_utsname.release), + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = system_utsname.version, + .maxlen = sizeof(system_utsname.version), + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = system_utsname.nodename, + .maxlen = sizeof(system_utsname.nodename), + .mode = 0644, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = system_utsname.domainname, + .maxlen = sizeof(system_utsname.domainname), + .mode = 0644, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_PANIC, + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CORE_PATTERN, + .procname = "core_pattern", + .data = core_pattern, + .maxlen = 64, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_TAINTED, + .procname = "tainted", + .data = &tainted, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CAP_BSET, + .procname = "cap-bound", + .data = &cap_bset, + .maxlen = sizeof(kernel_cap_t), + .mode = 0600, + .proc_handler = &proc_dointvec_bset, + }, +#ifdef CONFIG_BLK_DEV_INITRD + { + .ctl_name = KERN_REALROOTDEV, + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef __sparc__ + { + .ctl_name = KERN_SPARC_REBOOT, + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_SPARC_STOP_A, + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_SPARC_SCONS_PWROFF, + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef __hppa__ + { + .ctl_name = KERN_HPPA_PWRSW, + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPPA_UNALIGNED, + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_CTLALTDEL, + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PRINTK, + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_KMOD + { + .ctl_name = KERN_MODPROBE, + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +#endif +#ifdef CONFIG_HOTPLUG + { + .ctl_name = KERN_HOTPLUG, + .procname = "hotplug", + .data = &hotplug_path, + .maxlen = HOTPLUG_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .ctl_name = KERN_SG_BIG_BUFF, + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .ctl_name = KERN_ACCT, + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_SYSVIPC + { + .ctl_name = KERN_SHMMAX, + .procname = "shmmax", + .data = &shm_ctlmax, + .maxlen = sizeof (size_t), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = KERN_SHMALL, + .procname = "shmall", + .data = &shm_ctlall, + .maxlen = sizeof (size_t), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = KERN_SHMMNI, + .procname = "shmmni", + .data = &shm_ctlmni, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMAX, + .procname = "msgmax", + .data = &msg_ctlmax, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMNI, + .procname = "msgmni", + .data = &msg_ctlmni, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMNB, + .procname = "msgmnb", + .data = &msg_ctlmnb, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_SEM, + .procname = "sem", + .data = &sem_ctls, + .maxlen = 4*sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .ctl_name = KERN_SYSRQ, + .procname = "sysrq", + .data = &sysrq_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_CADPID, + .procname = "cad_pid", + .data = &cad_pid, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MAX_THREADS, + .procname = "threads-max", + .data = &max_threads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_RANDOM, + .procname = "random", + .mode = 0555, + .child = random_table, + }, +#ifdef CONFIG_UNIX98_PTYS + { + .ctl_name = KERN_PTY, + .procname = "pty", + .mode = 0555, + .child = pty_table, + }, +#endif + { + .ctl_name = KERN_OVERFLOWUID, + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = KERN_OVERFLOWGID, + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_MATHEMU + { + .ctl_name = KERN_IEEE_EMULATION_WARNINGS, + .procname = "ieee_emulation_warnings", + .data = &sysctl_ieee_emulation_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_NO_IDLE_HZ + { + .ctl_name = KERN_HZ_TIMER, + .procname = "hz_timer", + .data = &sysctl_hz_timer, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_S390_USER_DEBUG_LOGGING, + .procname = "userprocess_debug", + .data = &sysctl_userprocess_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_PIDMAX, + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + .ctl_name = KERN_PANIC_ON_OOPS, + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PRINTK_RATELIMIT, + .procname = "printk_ratelimit", + .data = &printk_ratelimit_jiffies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = KERN_PRINTK_RATELIMIT_BURST, + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_NGROUPS_MAX, + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_unknown_nmi_panic, + }, +#endif +#if defined(CONFIG_X86) + { + .ctl_name = KERN_BOOTLOADER_TYPE, + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_RANDOMIZE, + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + + { .ctl_name = 0 } +}; + +/* Constants for minimum and maximum testing in vm_table. + We use these as one-element integer vectors. */ +static int zero; +static int one_hundred = 100; + + +static ctl_table vm_table[] = { + { + .ctl_name = VM_OVERCOMMIT_MEMORY, + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_PAGE_CLUSTER, + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_DIRTY_BACKGROUND, + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_DIRTY_RATIO, + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_DIRTY_WB_CS, + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_centisecs, + .maxlen = sizeof(dirty_writeback_centisecs), + .mode = 0644, + .proc_handler = &dirty_writeback_centisecs_handler, + }, + { + .ctl_name = VM_DIRTY_EXPIRE_CS, + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_centisecs, + .maxlen = sizeof(dirty_expire_centisecs), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_NR_PDFLUSH_THREADS, + .procname = "nr_pdflush_threads", + .data = &nr_pdflush_threads, + .maxlen = sizeof nr_pdflush_threads, + .mode = 0444 /* read-only*/, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_SWAPPINESS, + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, + .procname = "nr_hugepages", + .data = &max_huge_pages, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, + { + .ctl_name = VM_HUGETLB_GROUP, + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = VM_LOWMEM_RESERVE_RATIO, + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = &lowmem_reserve_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_MIN_FREE_KBYTES, + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = &min_free_kbytes_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#ifdef CONFIG_MMU + { + .ctl_name = VM_MAX_MAP_COUNT, + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif + { + .ctl_name = VM_LAPTOP_MODE, + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_BLOCK_DUMP, + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_VFS_CACHE_PRESSURE, + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT + { + .ctl_name = VM_LEGACY_VA_LAYOUT, + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_SWAP + { + .ctl_name = VM_SWAP_TOKEN_TIMEOUT, + .procname = "swap_token_timeout", + .data = &swap_token_default_timeout, + .maxlen = sizeof(swap_token_default_timeout), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif + { .ctl_name = 0 } +}; + +static ctl_table proc_table[] = { + { .ctl_name = 0 } +}; + +static ctl_table fs_table[] = { + { + .ctl_name = FS_NRINODE, + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_STATINODE, + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_NRFILE, + .procname = "file-nr", + .data = &files_stat, + .maxlen = 3*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_MAXFILE, + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_DENTRY, + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_OVERFLOWUID, + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = FS_OVERFLOWGID, + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = FS_LEASES, + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_DNOTIFY + { + .ctl_name = FS_DIR_NOTIFY, + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU + { + .ctl_name = FS_LEASE_TIME, + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_AIO_NR, + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_AIO_MAX_NR, + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } +}; + +static ctl_table debug_table[] = { + { .ctl_name = 0 } +}; + +static ctl_table dev_table[] = { + { .ctl_name = 0 } +}; + +extern void init_irq_proc (void); + +void __init sysctl_init(void) +{ +#ifdef CONFIG_PROC_FS + register_proc_table(root_table, proc_sys_root); + init_irq_proc(); +#endif +} + +int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct list_head *tmp; + + if (nlen <= 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; + if (oldval) { + int old_len; + if (!oldlenp || get_user(old_len, oldlenp)) + return -EFAULT; + } + tmp = &root_table_header.ctl_entry; + do { + struct ctl_table_header *head = + list_entry(tmp, struct ctl_table_header, ctl_entry); + void *context = NULL; + int error = parse_table(name, nlen, oldval, oldlenp, + newval, newlen, head->ctl_table, + &context); + if (context) + kfree(context); + if (error != -ENOTDIR) + return error; + tmp = tmp->next; + } while (tmp != &root_table_header.ctl_entry); + return -ENOTDIR; +} + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +{ + struct __sysctl_args tmp; + int error; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + lock_kernel(); + error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, + tmp.newval, tmp.newlen); + unlock_kernel(); + return error; +} + +/* + * ctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current->euid) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((mode & op & 0007) == op) + return 0; + return -EACCES; +} + +static inline int ctl_perm(ctl_table *table, int op) +{ + int error; + error = security_sysctl(table, op); + if (error) + return error; + return test_perm(table->mode, op); +} + +static int parse_table(int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + ctl_table *table, void **context) +{ + int n; +repeat: + if (!nlen) + return -ENOTDIR; + if (get_user(n, name)) + return -EFAULT; + for ( ; table->ctl_name; table++) { + if (n == table->ctl_name || table->ctl_name == CTL_ANY) { + int error; + if (table->child) { + if (ctl_perm(table, 001)) + return -EPERM; + if (table->strategy) { + error = table->strategy( + table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + if (error) + return error; + } + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + return error; + } + } + return -ENOTDIR; +} + +/* Perform the actual read/write of a sysctl table entry. */ +int do_sysctl_strategy (ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + int op = 0, rc; + size_t len; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + if (ctl_perm(table, op)) + return -EPERM; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen, context); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + } + } + return 0; +} + +/** + * register_sysctl_table - register a sysctl hierarchy + * @table: the top-level table structure + * @insert_at_head: whether the entry should be inserted in front or at the end + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. An entry with a ctl_name of 0 terminates the table. + * + * The members of the &ctl_table structure are used as follows: + * + * ctl_name - This is the numeric sysctl value used by sysctl(2). The number + * must be unique within that level of sysctl + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. + * + * proc_handler - the text handler routine (described below) + * + * strategy - the strategy routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * More sophisticated management can be enabled by the provision of a + * strategy routine with the table entry. This will be called before + * any automatic read or write of the data is performed. + * + * The strategy routine may return + * + * < 0 - Error occurred (error is passed to user process) + * + * 0 - OK - proceed with automatic read or write. + * + * > 0 - OK - read or write has been done by the strategy routine, so + * return immediately. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + struct ctl_table_header *tmp; + tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); + if (!tmp) + return NULL; + tmp->ctl_table = table; + INIT_LIST_HEAD(&tmp->ctl_entry); + if (insert_at_head) + list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); + else + list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); +#ifdef CONFIG_PROC_FS + register_proc_table(table, proc_sys_root); +#endif + return tmp; +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + list_del(&header->ctl_entry); +#ifdef CONFIG_PROC_FS + unregister_proc_table(header->ctl_table, proc_sys_root); +#endif + kfree(header); +} + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_FS + +/* Scan the sysctl entries in table and add them all into /proc */ +static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de; + int len; + mode_t mode; + + for (; table->ctl_name; table++) { + /* Can't do anything without a proc name. */ + if (!table->procname) + continue; + /* Maybe we can't do anything with it... */ + if (!table->proc_handler && !table->child) { + printk(KERN_WARNING "SYSCTL: Can't register %s\n", + table->procname); + continue; + } + + len = strlen(table->procname); + mode = table->mode; + + de = NULL; + if (table->proc_handler) + mode |= S_IFREG; + else { + mode |= S_IFDIR; + for (de = root->subdir; de; de = de->next) { + if (proc_match(len, table->procname, de)) + break; + } + /* If the subdir exists already, de is non-NULL */ + } + + if (!de) { + de = create_proc_entry(table->procname, mode, root); + if (!de) + continue; + de->data = (void *) table; + if (table->proc_handler) + de->proc_fops = &proc_sys_file_operations; + } + table->de = de; + if (de->mode & S_IFDIR) + register_proc_table(table->child, de); + } +} + +/* + * Unregister a /proc sysctl table and any subdirectories. + */ +static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de; + for (; table->ctl_name; table++) { + if (!(de = table->de)) + continue; + if (de->mode & S_IFDIR) { + if (!table->child) { + printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); + continue; + } + unregister_proc_table(table->child, de); + + /* Don't unregister directories which still have entries.. */ + if (de->subdir) + continue; + } + + /* Don't unregister proc entries that are still being used.. */ + if (atomic_read(&de->count)) + continue; + + table->de = NULL; + remove_proc_entry(table->procname, root); + } +} + +static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + int op; + struct proc_dir_entry *de; + struct ctl_table *table; + size_t res; + ssize_t error; + + de = PDE(file->f_dentry->d_inode); + if (!de || !de->data) + return -ENOTDIR; + table = (struct ctl_table *) de->data; + if (!table || !table->proc_handler) + return -ENOTDIR; + op = (write ? 002 : 004); + if (ctl_perm(table, op)) + return -EPERM; + + res = count; + + error = (*table->proc_handler) (table, write, file, buf, &res, ppos); + if (error) + return error; + return res; +} + +static int proc_opensys(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_WRITE) { + /* + * sysctl entries that are not writable, + * are _NOT_ writable, capabilities or not. + */ + if (!(inode->i_mode & S_IWUSR)) + return -EPERM; + } + + return 0; +} + +static ssize_t proc_readsys(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(0, file, buf, count, ppos); +} + +static ssize_t proc_writesys(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(1, file, (char __user *) buf, count, ppos); +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t len; + char __user *p; + char c; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + if (get_user(c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= table->maxlen) + len = table->maxlen-1; + if(copy_from_user(table->data, buffer, len)) + return -EFAULT; + ((char *) table->data)[len] = 0; + *ppos += *lenp; + } else { + len = strlen(table->data); + if (len > table->maxlen) + len = table->maxlen; + if (len > *lenp) + len = *lenp; + if (len) + if(copy_to_user(buffer, table->data, len)) + return -EFAULT; + if (len < *lenp) { + if(put_user('\n', ((char __user *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + *ppos += len; + } + return 0; +} + +/* + * Special case of dostring for the UTS structure. This has locks + * to observe. Should this be in kernel/sys.c ???? + */ + +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + + if (!write) { + down_read(&uts_sem); + r=proc_dostring(table,0,filp,buffer,lenp, ppos); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=proc_dostring(table,1,filp,buffer,lenp, ppos); + up_write(&uts_sem); + } + return r; +} + +static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = *negp ? -*lvalp : *lvalp; + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ +#define TMPBUFLEN 21 + int *i, vleft, first=1, neg, val; + unsigned long lval; + size_t left, len; + + char buf[TMPBUFLEN], *p; + char __user *s = buffer; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) table->data; + vleft = table->maxlen / sizeof(*i); + left = *lenp; + + if (!conv) + conv = do_proc_dointvec_conv; + + for (; left && vleft--; i++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c, s)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + s++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > sizeof(buf) - 1) + len = sizeof(buf) - 1; + if (copy_from_user(buf, s, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + + lval = simple_strtoul(p, &p, 0); + + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + s += len; + left -= len; + + if (conv(&neg, &lval, i, 1, data)) + break; + } else { + p = buf; + if (!first) + *p++ = '\t'; + + if (conv(&neg, &lval, i, 0, data)) + break; + + sprintf(p, "%s%lu", neg ? "-" : "", lval); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; + s += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', s)) + return -EFAULT; + left--, s++; + } + if (write) { + while (left) { + char c; + if (get_user(c, s++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + *ppos += *lenp; + return 0; +#undef TMPBUFLEN +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + NULL,NULL); +} + +#define OP_SET 0 +#define OP_AND 1 +#define OP_OR 2 +#define OP_MAX 3 +#define OP_MIN 4 + +static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int op = *(int *)data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + switch(op) { + case OP_SET: *valp = val; break; + case OP_AND: *valp &= val; break; + case OP_OR: *valp |= val; break; + case OP_MAX: if(*valp < val) + *valp = val; + break; + case OP_MIN: if(*valp > val) + *valp = val; + break; + } + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/* + * init may raise the set. + */ + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int op; + + if (!capable(CAP_SYS_MODULE)) { + return -EPERM; + } + + op = (current->pid == 1) ? OP_SET : OP_AND; + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_bset_conv,&op); +} + +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + struct do_proc_dointvec_minmax_conv_param *param = data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + do_proc_dointvec_minmax_conv, ¶m); +} + +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ +#define TMPBUFLEN 21 + unsigned long *i, *min, *max, val; + int vleft, first=1, neg; + size_t len, left; + char buf[TMPBUFLEN], *p; + char __user *s = buffer; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned long *) table->data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; + + for (; left && vleft--; i++, min++, max++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c, s)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + s++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + if (copy_from_user(buf, s, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoul(p, &p, 0) * convmul / convdiv ; + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + s += len; + left -= len; + + if(neg) + continue; + if ((min && val < *min) || (max && val > *max)) + continue; + *i = val; + } else { + p = buf; + if (!first) + *p++ = '\t'; + sprintf(p, "%lu", convdiv * (*i) / convmul); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; + s += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', s)) + return -EFAULT; + left--, s++; + } + if (write) { + while (left) { + char c; + if (get_user(c, s++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + *ppos += *lenp; + return 0; +#undef TMPBUFLEN +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, filp, buffer, + lenp, ppos, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_msecs(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_jiffies_conv,NULL); +} + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_userhz_jiffies_conv,NULL); +} + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_conv, NULL); +} + +#else /* CONFIG_PROC_FS */ + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + + +#endif /* CONFIG_PROC_FS */ + + +/* + * General sysctl support routines + */ + +/* The generic string strategy routine: */ +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + size_t l, len; + + if (!table->data || !table->maxlen) + return -ENOTDIR; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + l = strlen(table->data); + if (len > l) len = l; + if (len >= table->maxlen) + len = table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(0, ((char __user *) oldval) + len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + if (len == table->maxlen) + len--; + ((char *) table->data)[len] = 0; + } + return 0; +} + +/* + * This function makes sure that all of the integers in the vector + * are between the minimum and maximum values given in the arrays + * table->extra1 and table->extra2, respectively. + */ +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + + if (newval && newlen) { + int __user *vec = (int __user *) newval; + int *min = (int *) table->extra1; + int *max = (int *) table->extra2; + size_t length; + int i; + + if (newlen % sizeof(int) != 0) + return -EINVAL; + + if (!table->extra1 && !table->extra2) + return 0; + + if (newlen > table->maxlen) + newlen = table->maxlen; + length = newlen / sizeof(int); + + for (i = 0; i < length; i++) { + int value; + if (get_user(value, vec + i)) + return -EFAULT; + if (min && value < min[i]) + return -EINVAL; + if (max && value > max[i]) + return -EINVAL; + } + } + return 0; +} + +/* Strategy function to convert jiffies to seconds */ +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + if (oldval) { + size_t olen; + if (oldlenp) { + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen!=sizeof(int)) + return -EINVAL; + } + if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) || + (oldlenp && put_user(sizeof(int),oldlenp))) + return -EFAULT; + } + if (newval && newlen) { + int new; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int __user *)newval)) + return -EFAULT; + *(int *)(table->data) = new*HZ; + } + return 1; +} + +/* Strategy function to convert jiffies to seconds */ +int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + if (oldval) { + size_t olen; + if (oldlenp) { + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen!=sizeof(int)) + return -EINVAL; + } + if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) || + (oldlenp && put_user(sizeof(int),oldlenp))) + return -EFAULT; + } + if (newval && newlen) { + int new; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int __user *)newval)) + return -EFAULT; + *(int *)(table->data) = msecs_to_jiffies(new); + } + return 1; +} + +#else /* CONFIG_SYSCTL */ + + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +{ + return -ENOSYS; +} + +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + return NULL; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +#endif /* CONFIG_SYSCTL */ + +/* + * No sense putting this after each symbol definition, twice, + * exception granted :-) + */ +EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_dointvec_jiffies); +EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); +EXPORT_SYMBOL(proc_dostring); +EXPORT_SYMBOL(proc_doulongvec_minmax); +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(sysctl_intvec); +EXPORT_SYMBOL(sysctl_jiffies); +EXPORT_SYMBOL(sysctl_ms_jiffies); +EXPORT_SYMBOL(sysctl_string); +EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/time.c b/kernel/time.c new file mode 100644 index 00000000000..96fd0f49963 --- /dev/null +++ b/kernel/time.c @@ -0,0 +1,599 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include <linux/module.h> +#include <linux/timex.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/fs.h> +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +asmlinkage long sys_time(time_t __user * tloc) +{ + time_t i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; + + if (tloc) { + if (put_user(i,tloc)) + i = -EFAULT; + } + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ + +asmlinkage long sys_stime(time_t __user *tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) +{ + if (likely(tv != NULL)) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +inline static void warp_clock(void) +{ + write_seqlock_irq(&xtime_lock); + wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; + xtime.tv_sec += sys_tz.tz_minuteswest * 60; + time_interpolator_reset(); + write_sequnlock_irq(&xtime_lock); + clock_was_set(); +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + error = security_settime(tv, tz); + if (error) + return error; + + if (tz) { + /* SMP safe, global irq locking makes it work. */ + sys_tz = *tz; + if (firsttime) { + firsttime = 0; + if (!tv) + warp_clock(); + } + } + if (tv) + { + /* SMP safe, again the code in arch/foo/time.c should + * globally block out interrupts when it runs. + */ + return do_settimeofday(tv); + } + return 0; +} + +asmlinkage long sys_settimeofday(struct timeval __user *tv, + struct timezone __user *tz) +{ + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +long pps_offset; /* pps time offset (us) */ +long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ + +long pps_freq; /* frequency offset (scaled ppm) */ +long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ + +long pps_valid = PPS_VALID; /* pps signal watchdog counter */ + +int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ + +long pps_jitcnt; /* jitter limit exceeded */ +long pps_calcnt; /* calibration intervals */ +long pps_errcnt; /* calibration errors */ +long pps_stbcnt; /* stability limit exceeded */ + +/* hook for a loadable hardpps kernel module */ +void (*hardpps_ptr)(struct timeval *); + +/* we call this to notify the arch when the clock is being + * controlled. If no such arch routine, do nothing. + */ +void __attribute__ ((weak)) notify_arch_cmos_timer(void) +{ + return; +} + +/* adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + long ltemp, mtemp, save_adjust; + int result; + + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* Now we validate the data before disabling interrupts */ + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + /* singleshot must not be used with any other mode bits */ + if (txc->modes != ADJ_OFFSET_SINGLESHOT) + return -EINVAL; + + if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) + /* adjustment Offset limited to +- .512 seconds */ + if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) + return -EINVAL; + + /* if the quartz is off by more than 10% something is VERY wrong ! */ + if (txc->modes & ADJ_TICK) + if (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + result = time_state; /* mostly `TIME_OK' */ + + /* Save for later - semantics of adjtime is to return old value */ + save_adjust = time_next_adjust ? time_next_adjust : time_adjust; + +#if 0 /* STA_CLOCKERR is never set yet */ + time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ +#endif + /* If there are input parameters, then process them */ + if (txc->modes) + { + if (txc->modes & ADJ_STATUS) /* only set allowed bits */ + time_status = (txc->status & ~STA_RONLY) | + (time_status & STA_RONLY); + + if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ + if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { + result = -EINVAL; + goto leave; + } + time_freq = txc->freq - pps_freq; + } + + if (txc->modes & ADJ_MAXERROR) { + if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_maxerror = txc->maxerror; + } + + if (txc->modes & ADJ_ESTERROR) { + if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_esterror = txc->esterror; + } + + if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ + if (txc->constant < 0) { /* NTP v4 uses values > 6 */ + result = -EINVAL; + goto leave; + } + time_constant = txc->constant; + } + + if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ + if (txc->modes == ADJ_OFFSET_SINGLESHOT) { + /* adjtime() is independent from ntp_adjtime() */ + if ((time_next_adjust = txc->offset) == 0) + time_adjust = 0; + } + else if ( time_status & (STA_PLL | STA_PPSTIME) ) { + ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == + (STA_PPSTIME | STA_PPSSIGNAL) ? + pps_offset : txc->offset; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp << SHIFT_UPDATE; + + /* + * Select whether the frequency is to be controlled + * and in which mode (PLL or FLL). Clamp to the operating + * range. Ugly multiply/divide should be replaced someday. + */ + + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = (time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } else /* calibration interval too short (p. 12) */ + result = TIME_ERROR; + } else { /* PLL mode */ + if (mtemp < MAXSEC) { + ltemp *= mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> (time_constant + + time_constant + + SHIFT_KF - SHIFT_USEC); + else + time_freq += ltemp >> (time_constant + + time_constant + + SHIFT_KF - SHIFT_USEC); + } else /* calibration interval too long (p. 12) */ + result = TIME_ERROR; + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; + } /* STA_PLL || STA_PPSTIME */ + } /* txc->modes & ADJ_OFFSET */ + if (txc->modes & ADJ_TICK) { + tick_usec = txc->tick; + tick_nsec = TICK_USEC_TO_NSEC(tick_usec); + } + } /* txc->modes */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 + || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0 + && (time_status & STA_PPSSIGNAL) == 0) + /* p. 24, (b) */ + || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* p. 24, (c) */ + || ((time_status & STA_PPSFREQ) != 0 + && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) + /* p. 24, (d) */ + result = TIME_ERROR; + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + txc->offset = save_adjust; + else { + if (time_offset < 0) + txc->offset = -(-time_offset >> SHIFT_UPDATE); + else + txc->offset = time_offset >> SHIFT_UPDATE; + } + txc->freq = time_freq + pps_freq; + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = time_precision; + txc->tolerance = time_tolerance; + txc->tick = tick_usec; + txc->ppsfreq = pps_freq; + txc->jitter = pps_jitter >> PPS_AVG; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; + write_sequnlock_irq(&xtime_lock); + do_gettimeofday(&txc->time); + notify_arch_cmos_timer(); + return(result); +} + +asmlinkage long sys_adjtimex(struct timex __user *txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +inline struct timespec current_kernel_time(void) +{ + struct timespec now; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime; + } while (read_seqretry(&xtime_lock, seq)); + + return now; +} + +EXPORT_SYMBOL(current_kernel_time); + +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granuality supported by + * the fs. + */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/** + * timespec_trunc - Truncate timespec to a granuality + * @t: Timespec + * @gran: Granuality in ns. + * + * Truncate a timespec to a granuality. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the later. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. + */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +#ifdef CONFIG_TIME_INTERPOLATION +void getnstimeofday (struct timespec *tv) +{ + unsigned long seq,sec,nsec; + + do { + seq = read_seqbegin(&xtime_lock); + sec = xtime.tv_sec; + nsec = xtime.tv_nsec+time_interpolator_get_offset(); + } while (unlikely(read_seqretry(&xtime_lock, seq))); + + while (unlikely(nsec >= NSEC_PER_SEC)) { + nsec -= NSEC_PER_SEC; + ++sec; + } + tv->tv_sec = sec; + tv->tv_nsec = nsec; +} +EXPORT_SYMBOL_GPL(getnstimeofday); + +int do_settimeofday (struct timespec *tv) +{ + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + { + /* + * This is revolting. We need to set "xtime" correctly. However, the value + * in this location is the value at the most recent update of wall time. + * Discover what correction gettimeofday would have done, and then undo + * it! + */ + nsec -= time_interpolator_get_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + time_interpolator_reset(); + } + write_sequnlock_irq(&xtime_lock); + clock_was_set(); + return 0; +} + +void do_gettimeofday (struct timeval *tv) +{ + unsigned long seq, nsec, usec, sec, offset; + do { + seq = read_seqbegin(&xtime_lock); + offset = time_interpolator_get_offset(); + sec = xtime.tv_sec; + nsec = xtime.tv_nsec; + } while (unlikely(read_seqretry(&xtime_lock, seq))); + + usec = (nsec + offset) / 1000; + + while (unlikely(usec >= USEC_PER_SEC)) { + usec -= USEC_PER_SEC; + ++sec; + } + + tv->tv_sec = sec; + tv->tv_usec = usec; +} + +EXPORT_SYMBOL(do_gettimeofday); + + +#else +/* + * Simulate gettimeofday using do_gettimeofday which only allows a timeval + * and therefore only yields usec accuracy + */ +void getnstimeofday(struct timespec *tv) +{ + struct timeval x; + + do_gettimeofday(&x); + tv->tv_sec = x.tv_sec; + tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; +} +#endif + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} + +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); diff --git a/kernel/timer.c b/kernel/timer.c new file mode 100644 index 00000000000..ecb3d67c0e1 --- /dev/null +++ b/kernel/timer.c @@ -0,0 +1,1611 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, kernel timekeeping, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#ifdef CONFIG_TIME_INTERPOLATION +static void time_interpolator_update(long delta_nsec); +#else +#define time_interpolator_update(x) +#endif + +/* + * per-CPU timer vector definitions: + */ + +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +typedef struct tvec_s { + struct list_head vec[TVN_SIZE]; +} tvec_t; + +typedef struct tvec_root_s { + struct list_head vec[TVR_SIZE]; +} tvec_root_t; + +struct tvec_t_base_s { + spinlock_t lock; + unsigned long timer_jiffies; + struct timer_list *running_timer; + tvec_root_t tv1; + tvec_t tv2; + tvec_t tv3; + tvec_t tv4; + tvec_t tv5; +} ____cacheline_aligned_in_smp; + +typedef struct tvec_t_base_s tvec_base_t; + +static inline void set_running_timer(tvec_base_t *base, + struct timer_list *timer) +{ +#ifdef CONFIG_SMP + base->running_timer = timer; +#endif +} + +/* Fake initialization */ +static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; + +static void check_timer_failed(struct timer_list *timer) +{ + static int whine_count; + if (whine_count < 16) { + whine_count++; + printk("Uninitialised timer!\n"); + printk("This is just a warning. Your computer is OK\n"); + printk("function=0x%p, data=0x%lx\n", + timer->function, timer->data); + dump_stack(); + } + /* + * Now fix it up + */ + spin_lock_init(&timer->lock); + timer->magic = TIMER_MAGIC; +} + +static inline void check_timer(struct timer_list *timer) +{ + if (timer->magic != TIMER_MAGIC) + check_timer_failed(timer); +} + + +static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) +{ + unsigned long expires = timer->expires; + unsigned long idx = expires - base->timer_jiffies; + struct list_head *vec; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if ((signed long) idx < 0) { + /* + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; + /* If the timeout is larger than 0xffffffff on 64-bit + * architectures then we use the maximum timeout: + */ + if (idx > 0xffffffffUL) { + idx = 0xffffffffUL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + /* + * Timers are FIFO: + */ + list_add_tail(&timer->entry, vec); +} + +int __mod_timer(struct timer_list *timer, unsigned long expires) +{ + tvec_base_t *old_base, *new_base; + unsigned long flags; + int ret = 0; + + BUG_ON(!timer->function); + + check_timer(timer); + + spin_lock_irqsave(&timer->lock, flags); + new_base = &__get_cpu_var(tvec_bases); +repeat: + old_base = timer->base; + + /* + * Prevent deadlocks via ordering by old_base < new_base. + */ + if (old_base && (new_base != old_base)) { + if (old_base < new_base) { + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + } else { + spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + } + /* + * The timer base might have been cancelled while we were + * trying to take the lock(s): + */ + if (timer->base != old_base) { + spin_unlock(&new_base->lock); + spin_unlock(&old_base->lock); + goto repeat; + } + } else { + spin_lock(&new_base->lock); + if (timer->base != old_base) { + spin_unlock(&new_base->lock); + goto repeat; + } + } + + /* + * Delete the previous timeout (if there was any), and install + * the new one: + */ + if (old_base) { + list_del(&timer->entry); + ret = 1; + } + timer->expires = expires; + internal_add_timer(new_base, timer); + timer->base = new_base; + + if (old_base && (new_base != old_base)) + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); + spin_unlock_irqrestore(&timer->lock, flags); + + return ret; +} + +EXPORT_SYMBOL(__mod_timer); + +/*** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + tvec_base_t *base = &per_cpu(tvec_bases, cpu); + unsigned long flags; + + BUG_ON(timer_pending(timer) || !timer->function); + + check_timer(timer); + + spin_lock_irqsave(&base->lock, flags); + internal_add_timer(base, timer); + timer->base = base; + spin_unlock_irqrestore(&base->lock, flags); +} + + +/*** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * + * mod_timer is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + BUG_ON(!timer->function); + + check_timer(timer); + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires); +} + +EXPORT_SYMBOL(mod_timer); + +/*** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + unsigned long flags; + tvec_base_t *base; + + check_timer(timer); + +repeat: + base = timer->base; + if (!base) + return 0; + spin_lock_irqsave(&base->lock, flags); + if (base != timer->base) { + spin_unlock_irqrestore(&base->lock, flags); + goto repeat; + } + list_del(&timer->entry); + /* Need to make sure that anybody who sees a NULL base also sees the list ops */ + smp_wmb(); + timer->base = NULL; + spin_unlock_irqrestore(&base->lock, flags); + + return 1; +} + +EXPORT_SYMBOL(del_timer); + +#ifdef CONFIG_SMP +/*** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts. The caller must not hold locks which would prevent + * completion of the timer's handler. Upon exit the timer is not queued and + * the handler is not running on any CPU. + * + * The function returns whether it has deactivated a pending timer or not. + * + * del_timer_sync() is slow and complicated because it copes with timer + * handlers which re-arm the timer (periodic timers). If the timer handler + * is known to not do this (a single shot timer) then use + * del_singleshot_timer_sync() instead. + */ +int del_timer_sync(struct timer_list *timer) +{ + tvec_base_t *base; + int i, ret = 0; + + check_timer(timer); + +del_again: + ret += del_timer(timer); + + for_each_online_cpu(i) { + base = &per_cpu(tvec_bases, i); + if (base->running_timer == timer) { + while (base->running_timer == timer) { + cpu_relax(); + preempt_check_resched(); + } + break; + } + } + smp_rmb(); + if (timer_pending(timer)) + goto del_again; + + return ret; +} +EXPORT_SYMBOL(del_timer_sync); + +/*** + * del_singleshot_timer_sync - deactivate a non-recursive timer + * @timer: the timer to be deactivated + * + * This function is an optimization of del_timer_sync for the case where the + * caller can guarantee the timer does not reschedule itself in its timer + * function. + * + * Synchronization rules: callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts. The caller must not hold locks which wold prevent + * completion of the timer's handler. Upon exit the timer is not queued and + * the handler is not running on any CPU. + * + * The function returns whether it has deactivated a pending timer or not. + */ +int del_singleshot_timer_sync(struct timer_list *timer) +{ + int ret = del_timer(timer); + + if (!ret) { + ret = del_timer_sync(timer); + BUG_ON(ret); + } + + return ret; +} +EXPORT_SYMBOL(del_singleshot_timer_sync); +#endif + +static int cascade(tvec_base_t *base, tvec_t *tv, int index) +{ + /* cascade all the timers from tv up one level */ + struct list_head *head, *curr; + + head = tv->vec + index; + curr = head->next; + /* + * We are removing _all_ timers from the list, so we don't have to + * detach them individually, just clear the list afterwards. + */ + while (curr != head) { + struct timer_list *tmp; + + tmp = list_entry(curr, struct timer_list, entry); + BUG_ON(tmp->base != base); + curr = curr->next; + internal_add_timer(base, tmp); + } + INIT_LIST_HEAD(head); + + return index; +} + +/*** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. + */ +#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK + +static inline void __run_timers(tvec_base_t *base) +{ + struct timer_list *timer; + + spin_lock_irq(&base->lock); + while (time_after_eq(jiffies, base->timer_jiffies)) { + struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + + /* + * Cascade timers: + */ + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + ++base->timer_jiffies; + list_splice_init(base->tv1.vec + index, &work_list); +repeat: + if (!list_empty(head)) { + void (*fn)(unsigned long); + unsigned long data; + + timer = list_entry(head->next,struct timer_list,entry); + fn = timer->function; + data = timer->data; + + list_del(&timer->entry); + set_running_timer(base, timer); + smp_wmb(); + timer->base = NULL; + spin_unlock_irq(&base->lock); + { + u32 preempt_count = preempt_count(); + fn(data); + if (preempt_count != preempt_count()) { + printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); + BUG(); + } + } + spin_lock_irq(&base->lock); + goto repeat; + } + } + set_running_timer(base, NULL); + spin_unlock_irq(&base->lock); +} + +#ifdef CONFIG_NO_IDLE_HZ +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when a cpus is idle. + * This functions needs to be called disabled. + */ +unsigned long next_timer_interrupt(void) +{ + tvec_base_t *base; + struct list_head *list; + struct timer_list *nte; + unsigned long expires; + tvec_t *varray[4]; + int i, j; + + base = &__get_cpu_var(tvec_bases); + spin_lock(&base->lock); + expires = base->timer_jiffies + (LONG_MAX >> 1); + list = 0; + + /* Look for timer events in tv1. */ + j = base->timer_jiffies & TVR_MASK; + do { + list_for_each_entry(nte, base->tv1.vec + j, entry) { + expires = nte->expires; + if (j < (base->timer_jiffies & TVR_MASK)) + list = base->tv2.vec + (INDEX(0)); + goto found; + } + j = (j + 1) & TVR_MASK; + } while (j != (base->timer_jiffies & TVR_MASK)); + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; + varray[1] = &base->tv3; + varray[2] = &base->tv4; + varray[3] = &base->tv5; + for (i = 0; i < 4; i++) { + j = INDEX(i); + do { + if (list_empty(varray[i]->vec + j)) { + j = (j + 1) & TVN_MASK; + continue; + } + list_for_each_entry(nte, varray[i]->vec + j, entry) + if (time_before(nte->expires, expires)) + expires = nte->expires; + if (j < (INDEX(i)) && i < 3) + list = varray[i + 1]->vec + (INDEX(i + 1)); + goto found; + } while (j != (INDEX(i))); + } +found: + if (list) { + /* + * The search wrapped. We need to look at the next list + * from next tv element that would cascade into tv element + * where we found the timer element. + */ + list_for_each_entry(nte, list, entry) { + if (time_before(nte->expires, expires)) + expires = nte->expires; + } + } + spin_unlock(&base->lock); + return expires; +} +#endif + +/******************************************************************/ + +/* + * Timekeeping variables + */ +unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ +unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ + +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + */ +struct timespec xtime __attribute__ ((aligned (16))); +struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + +EXPORT_SYMBOL(xtime); + +/* Don't completely fail for HZ > 500. */ +int tickadj = 500/HZ ? : 1; /* microsecs */ + + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +static long time_phase; /* phase offset (scaled us) */ +long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; + /* frequency offset (scaled ppm)*/ +static long time_adj; /* tick adjust (scaled 1 / HZ) */ +long time_reftime; /* time at last adjustment (s) */ +long time_adjust; +long time_next_adjust; + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + */ +static void second_overflow(void) +{ + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if ( time_maxerror > NTP_PHASE_LIMIT ) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* The timer interpolator will make time change gradually instead + * of an immediate jump by one second. + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* Use of time interpolator for a gradual change of time */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if HZ == 100 + /* Compensate for (HZ==100) != (1 << SHIFT_HZ). + * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 2) + (-time_adj >> 5); + else + time_adj += (time_adj >> 2) + (time_adj >> 5); +#endif +#if HZ == 1000 + /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). + * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 6) + (-time_adj >> 7); + else + time_adj += (time_adj >> 6) + (time_adj >> 7); +#endif +} + +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + if ( (time_adjust_step = time_adjust) != 0 ) { + /* We are doing an adjtime thing. + * + * Prepare time_adjust_step to be within bounds. + * Note that a positive time_adjust means we want the clock + * to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + if (time_adjust > tickadj) + time_adjust_step = tickadj; + else if (time_adjust < -tickadj) + time_adjust_step = -tickadj; + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + delta_nsec = tick_nsec + time_adjust_step * 1000; + /* + * Advance the phase, once it gets to one microsecond, then + * advance the tick more. + */ + time_phase += time_adj; + if (time_phase <= -FINENSEC) { + long ltemp = -time_phase >> (SHIFT_SCALE - 10); + time_phase += ltemp << (SHIFT_SCALE - 10); + delta_nsec -= ltemp; + } + else if (time_phase >= FINENSEC) { + long ltemp = time_phase >> (SHIFT_SCALE - 10); + time_phase -= ltemp << (SHIFT_SCALE - 10); + delta_nsec += ltemp; + } + xtime.tv_nsec += delta_nsec; + time_interpolator_update(delta_nsec); + + /* Changes by adjtime() do not take effect till next tick. */ + if (time_next_adjust != 0) { + time_adjust = time_next_adjust; + time_next_adjust = 0; + } +} + +/* + * Using a loop looks inefficient, but "ticks" is + * usually just one (we shouldn't be losing ticks, + * we're doing this this way mainly for interrupt + * latency reasons, not because we think we'll + * have lots of lost timer ticks + */ +static void update_wall_time(unsigned long ticks) +{ + do { + ticks--; + update_wall_time_one_tick(); + if (xtime.tv_nsec >= 1000000000) { + xtime.tv_nsec -= 1000000000; + xtime.tv_sec++; + second_overflow(); + } + } while (ticks); +} + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + /* Note: this timer irq context must be accounted for as well. */ + if (user_tick) + account_user_time(p, jiffies_to_cputime(1)); + else + account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_tick); + scheduler_tick(); + run_posix_cpu_timers(p); +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + return (nr_running() + nr_uninterruptible()) * FIXED_1; +} + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + * + * Requires xtime_lock to access. + */ +unsigned long avenrun[3]; + +EXPORT_SYMBOL(avenrun); + +/* + * calc_load - given tick count, update the avenrun load estimates. + * This is called while holding a write_lock on xtime_lock. + */ +static inline void calc_load(unsigned long ticks) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + count -= ticks; + if (count < 0) { + count += LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + } +} + +/* jiffies at the most recent update of wall time */ +unsigned long wall_jiffies = INITIAL_JIFFIES; + +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime and avenrun. + */ +#ifndef ARCH_HAVE_XTIME_LOCK +seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; + +EXPORT_SYMBOL(xtime_lock); +#endif + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + tvec_base_t *base = &__get_cpu_var(tvec_bases); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + raise_softirq(TIMER_SOFTIRQ); +} + +/* + * Called by the timer interrupt. xtime_lock must already be taken + * by the timer IRQ! + */ +static inline void update_times(void) +{ + unsigned long ticks; + + ticks = jiffies - wall_jiffies; + if (ticks) { + wall_jiffies += ticks; + update_wall_time(ticks); + } + calc_load(ticks); +} + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ + +void do_timer(struct pt_regs *regs) +{ + jiffies_64++; + update_times(); +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +asmlinkage unsigned long sys_alarm(unsigned int seconds) +{ + struct itimerval it_new, it_old; + unsigned int oldalarm; + + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + do_setitimer(ITIMER_REAL, &it_new, &it_old); + oldalarm = it_old.it_value.tv_sec; + /* ehhh.. We can't return 0 if we have an alarm pending.. */ + /* And we'd better return too much than too little anyway */ + if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) + oldalarm++; + return oldalarm; +} + +#endif + +#ifndef __alpha__ + +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ + +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +asmlinkage long sys_getpid(void) +{ + return current->tgid; +} + +/* + * Accessing ->group_leader->real_parent is not SMP-safe, it could + * change from under us. However, rather than getting any lock + * we can use an optimistic algorithm: get the parent + * pid, and go back and check that the parent is still + * the same. If it has changed (which is extremely unlikely + * indeed), we just try again.. + * + * NOTE! This depends on the fact that even if we _do_ + * get an old value of "parent", we can happily dereference + * the pointer (it was and remains a dereferencable kernel pointer + * no matter what): we just can't necessarily trust the result + * until we know that the parent pointer is valid. + * + * NOTE2: ->group_leader never changes from under us. + */ +asmlinkage long sys_getppid(void) +{ + int pid; + struct task_struct *me = current; + struct task_struct *parent; + + parent = me->group_leader->real_parent; + for (;;) { + pid = parent->tgid; +#ifdef CONFIG_SMP +{ + struct task_struct *old = parent; + + /* + * Make sure we read the pid before re-reading the + * parent pointer: + */ + rmb(); + parent = me->group_leader->real_parent; + if (old != parent) + continue; +} +#endif + break; + } + return pid; +} + +asmlinkage long sys_getuid(void) +{ + /* Only we change this so SMP safe */ + return current->uid; +} + +asmlinkage long sys_geteuid(void) +{ + /* Only we change this so SMP safe */ + return current->euid; +} + +asmlinkage long sys_getgid(void) +{ + /* Only we change this so SMP safe */ + return current->gid; +} + +asmlinkage long sys_getegid(void) +{ + /* Only we change this so SMP safe */ + return current->egid; +} + +#endif + +static void process_timeout(unsigned long __data) +{ + wake_up_process((task_t *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +fastcall signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) + { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx from %p\n", timeout, + __builtin_return_address(0)); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + init_timer(&timer); + timer.expires = expire; + timer.data = (unsigned long) current; + timer.function = process_timeout; + + add_timer(&timer); + schedule(); + del_singleshot_timer_sync(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} + +EXPORT_SYMBOL(schedule_timeout); + +/* Thread ID - the internal kernel "pid" */ +asmlinkage long sys_gettid(void) +{ + return current->pid; +} + +static long __sched nanosleep_restart(struct restart_block *restart) +{ + unsigned long expire = restart->arg0, now = jiffies; + struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; + long ret; + + /* Did it expire while we handled signals? */ + if (!time_after(expire, now)) + return 0; + + current->state = TASK_INTERRUPTIBLE; + expire = schedule_timeout(expire - now); + + ret = 0; + if (expire) { + struct timespec t; + jiffies_to_timespec(expire, &t); + + ret = -ERESTART_RESTARTBLOCK; + if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) + ret = -EFAULT; + /* The 'restart' block is already filled in */ + } + return ret; +} + +asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) +{ + struct timespec t; + unsigned long expire; + long ret; + + if (copy_from_user(&t, rqtp, sizeof(t))) + return -EFAULT; + + if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) + return -EINVAL; + + expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); + current->state = TASK_INTERRUPTIBLE; + expire = schedule_timeout(expire); + + ret = 0; + if (expire) { + struct restart_block *restart; + jiffies_to_timespec(expire, &t); + if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) + return -EFAULT; + + restart = ¤t_thread_info()->restart_block; + restart->fn = nanosleep_restart; + restart->arg0 = jiffies + expire; + restart->arg1 = (unsigned long) rmtp; + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +/* + * sys_sysinfo - fill in sysinfo struct + */ +asmlinkage long sys_sysinfo(struct sysinfo __user *info) +{ + struct sysinfo val; + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + unsigned long seq; + + memset((char *)&val, 0, sizeof(struct sysinfo)); + + do { + struct timespec tp; + seq = read_seqbegin(&xtime_lock); + + /* + * This is annoying. The below is the same thing + * posix_get_clock_monotonic() does, but it wants to + * take the lock which we want to cover the loads stuff + * too. + */ + + getnstimeofday(&tp); + tp.tv_sec += wall_to_monotonic.tv_sec; + tp.tv_nsec += wall_to_monotonic.tv_nsec; + if (tp.tv_nsec - NSEC_PER_SEC >= 0) { + tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; + tp.tv_sec++; + } + val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + val.procs = nr_threads; + } while (read_seqretry(&xtime_lock, seq)); + + si_meminfo(&val); + si_swapinfo(&val); + + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen <andersee@debian.org> + */ + + mem_total = val.totalram + val.totalswap; + if (mem_total < val.totalram || mem_total < val.totalswap) + goto out; + bitcount = 0; + mem_unit = val.mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; + } + + /* + * If mem_total did not overflow, multiply all memory values by + * val.mem_unit and set it to 1. This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... + */ + + val.mem_unit = 1; + val.totalram <<= bitcount; + val.freeram <<= bitcount; + val.sharedram <<= bitcount; + val.bufferram <<= bitcount; + val.totalswap <<= bitcount; + val.freeswap <<= bitcount; + val.totalhigh <<= bitcount; + val.freehigh <<= bitcount; + + out: + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +static void __devinit init_timers_cpu(int cpu) +{ + int j; + tvec_base_t *base; + + base = &per_cpu(tvec_bases, cpu); + spin_lock_init(&base->lock); + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) +{ + struct timer_list *timer; + + while (!list_empty(head)) { + timer = list_entry(head->next, struct timer_list, entry); + /* We're locking backwards from __mod_timer order here, + beware deadlock. */ + if (!spin_trylock(&timer->lock)) + return 0; + list_del(&timer->entry); + internal_add_timer(new_base, timer); + timer->base = new_base; + spin_unlock(&timer->lock); + } + return 1; +} + +static void __devinit migrate_timers(int cpu) +{ + tvec_base_t *old_base; + tvec_base_t *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = &per_cpu(tvec_bases, cpu); + new_base = &get_cpu_var(tvec_bases); + + local_irq_disable(); +again: + /* Prevent deadlocks via ordering by old_base < new_base. */ + if (old_base < new_base) { + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + } else { + spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + } + + if (old_base->running_timer) + BUG(); + for (i = 0; i < TVR_SIZE; i++) + if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) + goto unlock_again; + for (i = 0; i < TVN_SIZE; i++) + if (!migrate_timer_list(new_base, old_base->tv2.vec + i) + || !migrate_timer_list(new_base, old_base->tv3.vec + i) + || !migrate_timer_list(new_base, old_base->tv4.vec + i) + || !migrate_timer_list(new_base, old_base->tv5.vec + i)) + goto unlock_again; + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); + local_irq_enable(); + put_cpu_var(tvec_bases); + return; + +unlock_again: + /* Avoid deadlock with __mod_timer, by backing off. */ + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); + cpu_relax(); + goto again; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __devinit timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + init_timers_cpu(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + migrate_timers(cpu); + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata timers_nb = { + .notifier_call = timer_cpu_notify, +}; + + +void __init init_timers(void) +{ + timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&timers_nb); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); +} + +#ifdef CONFIG_TIME_INTERPOLATION + +struct time_interpolator *time_interpolator; +static struct time_interpolator *time_interpolator_list; +static DEFINE_SPINLOCK(time_interpolator_lock); + +static inline u64 time_interpolator_get_cycles(unsigned int src) +{ + unsigned long (*x)(void); + + switch (src) + { + case TIME_SOURCE_FUNCTION: + x = time_interpolator->addr; + return x(); + + case TIME_SOURCE_MMIO64 : + return readq((void __iomem *) time_interpolator->addr); + + case TIME_SOURCE_MMIO32 : + return readl((void __iomem *) time_interpolator->addr); + + default: return get_cycles(); + } +} + +static inline u64 time_interpolator_get_counter(void) +{ + unsigned int src = time_interpolator->source; + + if (time_interpolator->jitter) + { + u64 lcycle; + u64 now; + + do { + lcycle = time_interpolator->last_cycle; + now = time_interpolator_get_cycles(src); + if (lcycle && time_after(lcycle, now)) + return lcycle; + /* Keep track of the last timer value returned. The use of cmpxchg here + * will cause contention in an SMP environment. + */ + } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); + return now; + } + else + return time_interpolator_get_cycles(src); +} + +void time_interpolator_reset(void) +{ + time_interpolator->offset = 0; + time_interpolator->last_counter = time_interpolator_get_counter(); +} + +#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) + +unsigned long time_interpolator_get_offset(void) +{ + /* If we do not have a time interpolator set up then just return zero */ + if (!time_interpolator) + return 0; + + return time_interpolator->offset + + GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); +} + +#define INTERPOLATOR_ADJUST 65536 +#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST + +static void time_interpolator_update(long delta_nsec) +{ + u64 counter; + unsigned long offset; + + /* If there is no time interpolator set up then do nothing */ + if (!time_interpolator) + return; + + /* The interpolator compensates for late ticks by accumulating + * the late time in time_interpolator->offset. A tick earlier than + * expected will lead to a reset of the offset and a corresponding + * jump of the clock forward. Again this only works if the + * interpolator clock is running slightly slower than the regular clock + * and the tuning logic insures that. + */ + + counter = time_interpolator_get_counter(); + offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); + + if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) + time_interpolator->offset = offset - delta_nsec; + else { + time_interpolator->skips++; + time_interpolator->ns_skipped += delta_nsec - offset; + time_interpolator->offset = 0; + } + time_interpolator->last_counter = counter; + + /* Tuning logic for time interpolator invoked every minute or so. + * Decrease interpolator clock speed if no skips occurred and an offset is carried. + * Increase interpolator clock speed if we skip too much time. + */ + if (jiffies % INTERPOLATOR_ADJUST == 0) + { + if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) + time_interpolator->nsec_per_cyc--; + if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) + time_interpolator->nsec_per_cyc++; + time_interpolator->skips = 0; + time_interpolator->ns_skipped = 0; + } +} + +static inline int +is_better_time_interpolator(struct time_interpolator *new) +{ + if (!time_interpolator) + return 1; + return new->frequency > 2*time_interpolator->frequency || + (unsigned long)new->drift < (unsigned long)time_interpolator->drift; +} + +void +register_time_interpolator(struct time_interpolator *ti) +{ + unsigned long flags; + + /* Sanity check */ + if (ti->frequency == 0 || ti->mask == 0) + BUG(); + + ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; + spin_lock(&time_interpolator_lock); + write_seqlock_irqsave(&xtime_lock, flags); + if (is_better_time_interpolator(ti)) { + time_interpolator = ti; + time_interpolator_reset(); + } + write_sequnlock_irqrestore(&xtime_lock, flags); + + ti->next = time_interpolator_list; + time_interpolator_list = ti; + spin_unlock(&time_interpolator_lock); +} + +void +unregister_time_interpolator(struct time_interpolator *ti) +{ + struct time_interpolator *curr, **prev; + unsigned long flags; + + spin_lock(&time_interpolator_lock); + prev = &time_interpolator_list; + for (curr = *prev; curr; curr = curr->next) { + if (curr == ti) { + *prev = curr->next; + break; + } + prev = &curr->next; + } + + write_seqlock_irqsave(&xtime_lock, flags); + if (ti == time_interpolator) { + /* we lost the best time-interpolator: */ + time_interpolator = NULL; + /* find the next-best interpolator */ + for (curr = time_interpolator_list; curr; curr = curr->next) + if (is_better_time_interpolator(curr)) + time_interpolator = curr; + time_interpolator_reset(); + } + write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock(&time_interpolator_lock); +} +#endif /* CONFIG_TIME_INTERPOLATION */ + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) { + set_current_state(TASK_UNINTERRUPTIBLE); + timeout = schedule_timeout(timeout); + } +} + +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) { + set_current_state(TASK_INTERRUPTIBLE); + timeout = schedule_timeout(timeout); + } + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); diff --git a/kernel/uid16.c b/kernel/uid16.c new file mode 100644 index 00000000000..f669941e8b2 --- /dev/null +++ b/kernel/uid16.c @@ -0,0 +1,196 @@ +/* + * Wrapper functions for 16bit uid back compatibility. All nicely tied + * together in the faint hope we can take the out in five years time. + */ + +#include <linux/mm.h> +#include <linux/utsname.h> +#include <linux/mman.h> +#include <linux/smp_lock.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/prctl.h> +#include <linux/init.h> +#include <linux/highuid.h> +#include <linux/security.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> + +asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) +{ + return sys_chown(filename, low2highuid(user), low2highgid(group)); +} + +asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) +{ + return sys_lchown(filename, low2highuid(user), low2highgid(group)); +} + +asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) +{ + return sys_fchown(fd, low2highuid(user), low2highgid(group)); +} + +asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) +{ + return sys_setregid(low2highgid(rgid), low2highgid(egid)); +} + +asmlinkage long sys_setgid16(old_gid_t gid) +{ + return sys_setgid(low2highgid(gid)); +} + +asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) +{ + return sys_setreuid(low2highuid(ruid), low2highuid(euid)); +} + +asmlinkage long sys_setuid16(old_uid_t uid) +{ + return sys_setuid(low2highuid(uid)); +} + +asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) +{ + return sys_setresuid(low2highuid(ruid), low2highuid(euid), + low2highuid(suid)); +} + +asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) +{ + int retval; + + if (!(retval = put_user(high2lowuid(current->uid), ruid)) && + !(retval = put_user(high2lowuid(current->euid), euid))) + retval = put_user(high2lowuid(current->suid), suid); + + return retval; +} + +asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) +{ + return sys_setresgid(low2highgid(rgid), low2highgid(egid), + low2highgid(sgid)); +} + +asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) +{ + int retval; + + if (!(retval = put_user(high2lowgid(current->gid), rgid)) && + !(retval = put_user(high2lowgid(current->egid), egid))) + retval = put_user(high2lowgid(current->sgid), sgid); + + return retval; +} + +asmlinkage long sys_setfsuid16(old_uid_t uid) +{ + return sys_setfsuid(low2highuid(uid)); +} + +asmlinkage long sys_setfsgid16(old_gid_t gid) +{ + return sys_setfsgid(low2highgid(gid)); +} + +static int groups16_to_user(old_gid_t __user *grouplist, + struct group_info *group_info) +{ + int i; + old_gid_t group; + + for (i = 0; i < group_info->ngroups; i++) { + group = high2lowgid(GROUP_AT(group_info, i)); + if (put_user(group, grouplist+i)) + return -EFAULT; + } + + return 0; +} + +static int groups16_from_user(struct group_info *group_info, + old_gid_t __user *grouplist) +{ + int i; + old_gid_t group; + + for (i = 0; i < group_info->ngroups; i++) { + if (get_user(group, grouplist+i)) + return -EFAULT; + GROUP_AT(group_info, i) = low2highgid(group); + } + + return 0; +} + +asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) +{ + int i = 0; + + if (gidsetsize < 0) + return -EINVAL; + + get_group_info(current->group_info); + i = current->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups16_to_user(grouplist, current->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + put_group_info(current->group_info); + return i; +} + +asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) +{ + struct group_info *group_info; + int retval; + + if (!capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups16_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +asmlinkage long sys_getuid16(void) +{ + return high2lowuid(current->uid); +} + +asmlinkage long sys_geteuid16(void) +{ + return high2lowuid(current->euid); +} + +asmlinkage long sys_getgid16(void) +{ + return high2lowgid(current->gid); +} + +asmlinkage long sys_getegid16(void) +{ + return high2lowgid(current->egid); +} diff --git a/kernel/user.c b/kernel/user.c new file mode 100644 index 00000000000..734575d5576 --- /dev/null +++ b/kernel/user.c @@ -0,0 +1,189 @@ +/* + * The "user cache". + * + * (C) Copyright 1991-2000 Linus Torvalds + * + * We have a per-user structure to keep track of how many + * processes, files etc the user has claimed, in order to be + * able to have per-user limits for system resources. + */ + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/key.h> + +/* + * UID task count cache, to get fast user lookup in "alloc_uid" + * when changing user ID's (ie setuid() and friends). + */ + +#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8) +#define UIDHASH_SZ (1 << UIDHASH_BITS) +#define UIDHASH_MASK (UIDHASH_SZ - 1) +#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) +#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) + +static kmem_cache_t *uid_cachep; +static struct list_head uidhash_table[UIDHASH_SZ]; +static DEFINE_SPINLOCK(uidhash_lock); + +struct user_struct root_user = { + .__count = ATOMIC_INIT(1), + .processes = ATOMIC_INIT(1), + .files = ATOMIC_INIT(0), + .sigpending = ATOMIC_INIT(0), + .mq_bytes = 0, + .locked_shm = 0, +#ifdef CONFIG_KEYS + .uid_keyring = &root_user_keyring, + .session_keyring = &root_session_keyring, +#endif +}; + +/* + * These routines must be called with the uidhash spinlock held! + */ +static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +{ + list_add(&up->uidhash_list, hashent); +} + +static inline void uid_hash_remove(struct user_struct *up) +{ + list_del(&up->uidhash_list); +} + +static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +{ + struct list_head *up; + + list_for_each(up, hashent) { + struct user_struct *user; + + user = list_entry(up, struct user_struct, uidhash_list); + + if(user->uid == uid) { + atomic_inc(&user->__count); + return user; + } + } + + return NULL; +} + +/* + * Locate the user_struct for the passed UID. If found, take a ref on it. The + * caller must undo that ref with free_uid(). + * + * If the user_struct could not be found, return NULL. + */ +struct user_struct *find_user(uid_t uid) +{ + struct user_struct *ret; + + spin_lock(&uidhash_lock); + ret = uid_hash_find(uid, uidhashentry(uid)); + spin_unlock(&uidhash_lock); + return ret; +} + +void free_uid(struct user_struct *up) +{ + if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + uid_hash_remove(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); + spin_unlock(&uidhash_lock); + } +} + +struct user_struct * alloc_uid(uid_t uid) +{ + struct list_head *hashent = uidhashentry(uid); + struct user_struct *up; + + spin_lock(&uidhash_lock); + up = uid_hash_find(uid, hashent); + spin_unlock(&uidhash_lock); + + if (!up) { + struct user_struct *new; + + new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); + if (!new) + return NULL; + new->uid = uid; + atomic_set(&new->__count, 1); + atomic_set(&new->processes, 0); + atomic_set(&new->files, 0); + atomic_set(&new->sigpending, 0); + + new->mq_bytes = 0; + new->locked_shm = 0; + + if (alloc_uid_keyring(new) < 0) { + kmem_cache_free(uid_cachep, new); + return NULL; + } + + /* + * Before adding this, check whether we raced + * on adding the same user already.. + */ + spin_lock(&uidhash_lock); + up = uid_hash_find(uid, hashent); + if (up) { + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + } else { + uid_hash_insert(new, hashent); + up = new; + } + spin_unlock(&uidhash_lock); + + } + return up; +} + +void switch_uid(struct user_struct *new_user) +{ + struct user_struct *old_user; + + /* What if a process setreuid()'s and this brings the + * new uid over his NPROC rlimit? We can check this now + * cheaply with the new uid cache, so if it matters + * we should be checking for it. -DaveM + */ + old_user = current->user; + atomic_inc(&new_user->processes); + atomic_dec(&old_user->processes); + switch_uid_keyring(new_user); + current->user = new_user; + free_uid(old_user); + suid_keys(current); +} + + +static int __init uid_cache_init(void) +{ + int n; + + uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + + for(n = 0; n < UIDHASH_SZ; ++n) + INIT_LIST_HEAD(uidhash_table + n); + + /* Insert the root user immediately (init already runs as root) */ + spin_lock(&uidhash_lock); + uid_hash_insert(&root_user, uidhashentry(0)); + spin_unlock(&uidhash_lock); + + return 0; +} + +module_init(uid_cache_init); diff --git a/kernel/wait.c b/kernel/wait.c new file mode 100644 index 00000000000..791681cfea9 --- /dev/null +++ b/kernel/wait.c @@ -0,0 +1,246 @@ +/* + * Generic waiting primitives. + * + * (C) 2004 William Irwin, Oracle + */ +#include <linux/config.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/wait.h> +#include <linux/hash.h> + +void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the the critical region). + */ +void fastcall +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + /* + * don't alter the task state if this is just going to + * queue an async wait queue callback + */ + if (is_sync_wait(wait)) + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void fastcall +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + /* + * don't alter the task state if this is just going to + * queue an async wait queue callback + */ + if (is_sync_wait(wait)) + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched fastcall +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched fastcall +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) { + if ((ret = (*action)(q->key.flags))) + break; + } + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void fastcall wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(bit_waitqueue); diff --git a/kernel/workqueue.c b/kernel/workqueue.c new file mode 100644 index 00000000000..52ef419d274 --- /dev/null +++ b/kernel/workqueue.c @@ -0,0 +1,555 @@ +/* + * linux/kernel/workqueue.c + * + * Generic mechanism for defining kernel helper threads for running + * arbitrary tasks in process context. + * + * Started by Ingo Molnar, Copyright (C) 2002 + * + * Derived from the taskqueue/keventd code by: + * + * David Woodhouse <dwmw2@infradead.org> + * Andrew Morton <andrewm@uow.edu.au> + * Kai Petzke <wpp@marie.physik.tu-berlin.de> + * Theodore Ts'o <tytso@mit.edu> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/completion.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/kthread.h> + +/* + * The per-CPU workqueue (if single thread, we always use cpu 0's). + * + * The sequence counters are for flush_scheduled_work(). It wants to wait + * until until all currently-scheduled works are completed, but it doesn't + * want to be livelocked by new, incoming ones. So it waits until + * remove_sequence is >= the insert_sequence which pertained when + * flush_scheduled_work() was called. + */ +struct cpu_workqueue_struct { + + spinlock_t lock; + + long remove_sequence; /* Least-recently added (next to run) */ + long insert_sequence; /* Next to add */ + + struct list_head worklist; + wait_queue_head_t more_work; + wait_queue_head_t work_done; + + struct workqueue_struct *wq; + task_t *thread; + + int run_depth; /* Detect run_workqueue() recursion depth */ +} ____cacheline_aligned; + +/* + * The externally visible workqueue abstraction is an array of + * per-CPU workqueues: + */ +struct workqueue_struct { + struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + const char *name; + struct list_head list; /* Empty if single thread */ +}; + +/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove + threads to each one as cpus come/go. */ +static DEFINE_SPINLOCK(workqueue_lock); +static LIST_HEAD(workqueues); + +/* If it's single threaded, it isn't in the list of workqueues. */ +static inline int is_single_threaded(struct workqueue_struct *wq) +{ + return list_empty(&wq->list); +} + +/* Preempt must be disabled. */ +static void __queue_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + unsigned long flags; + + spin_lock_irqsave(&cwq->lock, flags); + work->wq_data = cwq; + list_add_tail(&work->entry, &cwq->worklist); + cwq->insert_sequence++; + wake_up(&cwq->more_work); + spin_unlock_irqrestore(&cwq->lock, flags); +} + +/* + * Queue work on a workqueue. Return non-zero if it was successfully + * added. + * + * We queue the work to the CPU it was submitted, but there is no + * guarantee that it will be processed by that CPU. + */ +int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0, cpu = get_cpu(); + + if (!test_and_set_bit(0, &work->pending)) { + if (unlikely(is_single_threaded(wq))) + cpu = 0; + BUG_ON(!list_empty(&work->entry)); + __queue_work(wq->cpu_wq + cpu, work); + ret = 1; + } + put_cpu(); + return ret; +} + +static void delayed_work_timer_fn(unsigned long __data) +{ + struct work_struct *work = (struct work_struct *)__data; + struct workqueue_struct *wq = work->wq_data; + int cpu = smp_processor_id(); + + if (unlikely(is_single_threaded(wq))) + cpu = 0; + + __queue_work(wq->cpu_wq + cpu, work); +} + +int fastcall queue_delayed_work(struct workqueue_struct *wq, + struct work_struct *work, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &work->timer; + + if (!test_and_set_bit(0, &work->pending)) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + /* This stores wq for the moment, for the timer_fn */ + work->wq_data = wq; + timer->expires = jiffies + delay; + timer->data = (unsigned long)work; + timer->function = delayed_work_timer_fn; + add_timer(timer); + ret = 1; + } + return ret; +} + +static inline void run_workqueue(struct cpu_workqueue_struct *cwq) +{ + unsigned long flags; + + /* + * Keep taking off work from the queue until + * done. + */ + spin_lock_irqsave(&cwq->lock, flags); + cwq->run_depth++; + if (cwq->run_depth > 3) { + /* morton gets to eat his hat */ + printk("%s: recursion depth exceeded: %d\n", + __FUNCTION__, cwq->run_depth); + dump_stack(); + } + while (!list_empty(&cwq->worklist)) { + struct work_struct *work = list_entry(cwq->worklist.next, + struct work_struct, entry); + void (*f) (void *) = work->func; + void *data = work->data; + + list_del_init(cwq->worklist.next); + spin_unlock_irqrestore(&cwq->lock, flags); + + BUG_ON(work->wq_data != cwq); + clear_bit(0, &work->pending); + f(data); + + spin_lock_irqsave(&cwq->lock, flags); + cwq->remove_sequence++; + wake_up(&cwq->work_done); + } + cwq->run_depth--; + spin_unlock_irqrestore(&cwq->lock, flags); +} + +static int worker_thread(void *__cwq) +{ + struct cpu_workqueue_struct *cwq = __cwq; + DECLARE_WAITQUEUE(wait, current); + struct k_sigaction sa; + sigset_t blocked; + + current->flags |= PF_NOFREEZE; + + set_user_nice(current, -5); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* SIG_IGN makes children autoreap: see do_notify_parent(). */ + sa.sa.sa_handler = SIG_IGN; + sa.sa.sa_flags = 0; + siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); + do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + add_wait_queue(&cwq->more_work, &wait); + if (list_empty(&cwq->worklist)) + schedule(); + else + __set_current_state(TASK_RUNNING); + remove_wait_queue(&cwq->more_work, &wait); + + if (!list_empty(&cwq->worklist)) + run_workqueue(cwq); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +{ + if (cwq->thread == current) { + /* + * Probably keventd trying to flush its own queue. So simply run + * it by hand rather than deadlocking. + */ + run_workqueue(cwq); + } else { + DEFINE_WAIT(wait); + long sequence_needed; + + spin_lock_irq(&cwq->lock); + sequence_needed = cwq->insert_sequence; + + while (sequence_needed - cwq->remove_sequence > 0) { + prepare_to_wait(&cwq->work_done, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&cwq->lock); + schedule(); + spin_lock_irq(&cwq->lock); + } + finish_wait(&cwq->work_done, &wait); + spin_unlock_irq(&cwq->lock); + } +} + +/* + * flush_workqueue - ensure that any scheduled work has run to completion. + * + * Forces execution of the workqueue and blocks until its completion. + * This is typically used in driver shutdown handlers. + * + * This function will sample each workqueue's current insert_sequence number and + * will sleep until the head sequence is greater than or equal to that. This + * means that we sleep until all works which were queued on entry have been + * handled, but we are not livelocked by new incoming ones. + * + * This function used to run the workqueues itself. Now we just wait for the + * helper threads to do it. + */ +void fastcall flush_workqueue(struct workqueue_struct *wq) +{ + might_sleep(); + + if (is_single_threaded(wq)) { + /* Always use cpu 0's area. */ + flush_cpu_workqueue(wq->cpu_wq + 0); + } else { + int cpu; + + lock_cpu_hotplug(); + for_each_online_cpu(cpu) + flush_cpu_workqueue(wq->cpu_wq + cpu); + unlock_cpu_hotplug(); + } +} + +static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, + int cpu) +{ + struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct task_struct *p; + + spin_lock_init(&cwq->lock); + cwq->wq = wq; + cwq->thread = NULL; + cwq->insert_sequence = 0; + cwq->remove_sequence = 0; + INIT_LIST_HEAD(&cwq->worklist); + init_waitqueue_head(&cwq->more_work); + init_waitqueue_head(&cwq->work_done); + + if (is_single_threaded(wq)) + p = kthread_create(worker_thread, cwq, "%s", wq->name); + else + p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); + if (IS_ERR(p)) + return NULL; + cwq->thread = p; + return p; +} + +struct workqueue_struct *__create_workqueue(const char *name, + int singlethread) +{ + int cpu, destroy = 0; + struct workqueue_struct *wq; + struct task_struct *p; + + BUG_ON(strlen(name) > 10); + + wq = kmalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; + memset(wq, 0, sizeof(*wq)); + + wq->name = name; + /* We don't need the distraction of CPUs appearing and vanishing. */ + lock_cpu_hotplug(); + if (singlethread) { + INIT_LIST_HEAD(&wq->list); + p = create_workqueue_thread(wq, 0); + if (!p) + destroy = 1; + else + wake_up_process(p); + } else { + spin_lock(&workqueue_lock); + list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); + for_each_online_cpu(cpu) { + p = create_workqueue_thread(wq, cpu); + if (p) { + kthread_bind(p, cpu); + wake_up_process(p); + } else + destroy = 1; + } + } + unlock_cpu_hotplug(); + + /* + * Was there any error during startup? If yes then clean up: + */ + if (destroy) { + destroy_workqueue(wq); + wq = NULL; + } + return wq; +} + +static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) +{ + struct cpu_workqueue_struct *cwq; + unsigned long flags; + struct task_struct *p; + + cwq = wq->cpu_wq + cpu; + spin_lock_irqsave(&cwq->lock, flags); + p = cwq->thread; + cwq->thread = NULL; + spin_unlock_irqrestore(&cwq->lock, flags); + if (p) + kthread_stop(p); +} + +void destroy_workqueue(struct workqueue_struct *wq) +{ + int cpu; + + flush_workqueue(wq); + + /* We don't need the distraction of CPUs appearing and vanishing. */ + lock_cpu_hotplug(); + if (is_single_threaded(wq)) + cleanup_workqueue_thread(wq, 0); + else { + for_each_online_cpu(cpu) + cleanup_workqueue_thread(wq, cpu); + spin_lock(&workqueue_lock); + list_del(&wq->list); + spin_unlock(&workqueue_lock); + } + unlock_cpu_hotplug(); + kfree(wq); +} + +static struct workqueue_struct *keventd_wq; + +int fastcall schedule_work(struct work_struct *work) +{ + return queue_work(keventd_wq, work); +} + +int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) +{ + return queue_delayed_work(keventd_wq, work, delay); +} + +int schedule_delayed_work_on(int cpu, + struct work_struct *work, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &work->timer; + + if (!test_and_set_bit(0, &work->pending)) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + /* This stores keventd_wq for the moment, for the timer_fn */ + work->wq_data = keventd_wq; + timer->expires = jiffies + delay; + timer->data = (unsigned long)work; + timer->function = delayed_work_timer_fn; + add_timer_on(timer, cpu); + ret = 1; + } + return ret; +} + +void flush_scheduled_work(void) +{ + flush_workqueue(keventd_wq); +} + +/** + * cancel_rearming_delayed_workqueue - reliably kill off a delayed + * work whose handler rearms the delayed work. + * @wq: the controlling workqueue structure + * @work: the delayed work struct + */ +static void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, + struct work_struct *work) +{ + while (!cancel_delayed_work(work)) + flush_workqueue(wq); +} + +/** + * cancel_rearming_delayed_work - reliably kill off a delayed keventd + * work whose handler rearms the delayed work. + * @work: the delayed work struct + */ +void cancel_rearming_delayed_work(struct work_struct *work) +{ + cancel_rearming_delayed_workqueue(keventd_wq, work); +} +EXPORT_SYMBOL(cancel_rearming_delayed_work); + +int keventd_up(void) +{ + return keventd_wq != NULL; +} + +int current_is_keventd(void) +{ + struct cpu_workqueue_struct *cwq; + int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int ret = 0; + + BUG_ON(!keventd_wq); + + cwq = keventd_wq->cpu_wq + cpu; + if (current == cwq->thread) + ret = 1; + + return ret; + +} + +#ifdef CONFIG_HOTPLUG_CPU +/* Take the work from this (downed) CPU. */ +static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) +{ + struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + LIST_HEAD(list); + struct work_struct *work; + + spin_lock_irq(&cwq->lock); + list_splice_init(&cwq->worklist, &list); + + while (!list_empty(&list)) { + printk("Taking work for %s\n", wq->name); + work = list_entry(list.next,struct work_struct,entry); + list_del(&work->entry); + __queue_work(wq->cpu_wq + smp_processor_id(), work); + } + spin_unlock_irq(&cwq->lock); +} + +/* We're holding the cpucontrol mutex here */ +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + struct workqueue_struct *wq; + + switch (action) { + case CPU_UP_PREPARE: + /* Create a new workqueue thread for it. */ + list_for_each_entry(wq, &workqueues, list) { + if (create_workqueue_thread(wq, hotcpu) < 0) { + printk("workqueue for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + } + break; + + case CPU_ONLINE: + /* Kick off worker threads. */ + list_for_each_entry(wq, &workqueues, list) { + kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); + wake_up_process(wq->cpu_wq[hotcpu].thread); + } + break; + + case CPU_UP_CANCELED: + list_for_each_entry(wq, &workqueues, list) { + /* Unbind so it can run. */ + kthread_bind(wq->cpu_wq[hotcpu].thread, + smp_processor_id()); + cleanup_workqueue_thread(wq, hotcpu); + } + break; + + case CPU_DEAD: + list_for_each_entry(wq, &workqueues, list) + cleanup_workqueue_thread(wq, hotcpu); + list_for_each_entry(wq, &workqueues, list) + take_over_work(wq, hotcpu); + break; + } + + return NOTIFY_OK; +} +#endif + +void init_workqueues(void) +{ + hotcpu_notifier(workqueue_cpu_callback, 0); + keventd_wq = create_workqueue("events"); + BUG_ON(!keventd_wq); +} + +EXPORT_SYMBOL_GPL(__create_workqueue); +EXPORT_SYMBOL_GPL(queue_work); +EXPORT_SYMBOL_GPL(queue_delayed_work); +EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL_GPL(destroy_workqueue); + +EXPORT_SYMBOL(schedule_work); +EXPORT_SYMBOL(schedule_delayed_work); +EXPORT_SYMBOL(schedule_delayed_work_on); +EXPORT_SYMBOL(flush_scheduled_work); |