From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b752e807add..a42ff0bef70 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -47,4 +47,8 @@ static inline bool trigger_all_cpu_backtrace(void) } #endif +#ifdef CONFIG_NMI_WATCHDOG +int hw_nmi_is_cpu_stuck(struct pt_regs *); +#endif + #endif -- cgit v1.2.3-70-g09d2 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. ested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++----- include/linux/nmi.h | 9 ++++++++ kernel/nmi_watchdog.c | 52 ++++++++++++++++++++++++++++++++++--------- kernel/sysctl.c | 15 ++++++++++++- 5 files changed, 82 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3f334..5b41b0feb6d 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_NMI_WATCHDOG) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a410d0..312d772c5c3 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a42ff0bef70..794e7354c5b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,10 +20,14 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else +#ifndef CONFIG_NMI_WATCHDOG static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); } +#else +extern void touch_nmi_watchdog(void); +#endif static inline void acpi_nmi_disable(void) { } static inline void acpi_nmi_enable(void) { } #endif @@ -49,6 +53,11 @@ static inline bool trigger_all_cpu_backtrace(void) #ifdef CONFIG_NMI_WATCHDOG int hw_nmi_is_cpu_stuck(struct pt_regs *); +u64 hw_nmi_get_sample_period(void); +extern int nmi_watchdog_enabled; +struct ctl_table; +extern int proc_nmi_enabled(struct ctl_table *, int , + void __user *, size_t *, loff_t *); #endif #endif diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 36817b214d6..73c1954a97b 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -30,6 +30,8 @@ static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev); static DEFINE_PER_CPU(int, nmi_watchdog_touch); static DEFINE_PER_CPU(long, alert_counter); +static int panic_on_timeout; + void touch_nmi_watchdog(void) { __raw_get_cpu_var(nmi_watchdog_touch) = 1; @@ -46,19 +48,49 @@ void touch_all_nmi_watchdog(void) touch_softlockup_watchdog(); } +static int __init setup_nmi_watchdog(char *str) +{ + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + return 1; +} +__setup("nmi_watchdog=", setup_nmi_watchdog); + #ifdef CONFIG_SYSCTL /* * proc handler for /proc/sys/kernel/nmi_watchdog */ +int nmi_watchdog_enabled; + int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int cpu; - if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) + if (!write) { + struct perf_event *event; + for_each_online_cpu(cpu) { + event = per_cpu(nmi_watchdog_ev, cpu); + if (event->state > PERF_EVENT_STATE_OFF) { + nmi_watchdog_enabled = 1; + break; + } + } + proc_dointvec(table, write, buffer, length, ppos); + return 0; + } + + if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) { nmi_watchdog_enabled = 0; - else - nmi_watchdog_enabled = 1; + proc_dointvec(table, write, buffer, length, ppos); + printk("NMI watchdog failed configuration, can not be enabled\n"); + return 0; + } touch_all_nmi_watchdog(); proc_dointvec(table, write, buffer, length, ppos); @@ -81,8 +113,6 @@ struct perf_event_attr wd_attr = { .disabled = 1, }; -static int panic_on_timeout; - void wd_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -103,11 +133,11 @@ void wd_overflow(struct perf_event *event, int nmi, */ per_cpu(alert_counter,cpu) += 1; if (per_cpu(alert_counter,cpu) == 5) { - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); + if (panic_on_timeout) { + panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); + } else { + WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); + } } } else { per_cpu(alert_counter,cpu) = 0; @@ -133,7 +163,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: case CPU_ONLINE_FROZEN: /* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */ - wd_attr.sample_period = cpu_khz * 1000; + wd_attr.sample_period = hw_nmi_get_sample_period(); event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); if (IS_ERR(event)) { printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b244846..ac72c9e6bd9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -60,6 +60,10 @@ #include #endif +#ifdef CONFIG_NMI_WATCHDOG +#include +#endif + #if defined(CONFIG_SYSCTL) @@ -692,7 +696,16 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_NMI_WATCHDOG) + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_enabled, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, -- cgit v1.2.3-70-g09d2 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/apic/hw_nmi.c | 14 +++++------ arch/x86/kernel/traps.c | 6 ++-- include/linux/nmi.h | 2 - kernel/nmi_watchdog.c | 51 ++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) --- arch/x86/kernel/apic/hw_nmi.c | 14 ++++++------ arch/x86/kernel/traps.c | 6 ++--- include/linux/nmi.h | 2 +- kernel/nmi_watchdog.c | 51 +++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205a6b8..e8b78a0be5d 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4f044..bdc7fab3ef3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 794e7354c5b..22cc7960b64 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,7 +57,7 @@ u64 hw_nmi_get_sample_period(void); extern int nmi_watchdog_enabled; struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); #endif #endif diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 3c75cbf3acb..0a6f57f537a 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -50,31 +50,31 @@ void touch_all_nmi_watchdog(void) static int __init setup_nmi_watchdog(char *str) { - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - return 1; + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, }; struct perf_event_attr wd_sw_attr = { - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_CPU_CLOCK, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, }; void wd_overflow(struct perf_event *event, int nmi, @@ -95,16 +95,15 @@ void wd_overflow(struct perf_event *event, int nmi, * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - per_cpu(alert_counter,cpu) += 1; - if (per_cpu(alert_counter,cpu) == 5) { - if (panic_on_timeout) { + per_cpu(alert_counter, cpu) += 1; + if (per_cpu(alert_counter, cpu) == 5) { + if (panic_on_timeout) panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); - } else { + else WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); - } } } else { - per_cpu(alert_counter,cpu) = 0; + per_cpu(alert_counter, cpu) = 0; } return; @@ -126,7 +125,7 @@ static int enable_nmi_watchdog(int cpu) event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); if (IS_ERR(event)) { /* hardware doesn't exist or not supported, fallback to software events */ - printk("nmi_watchdog: hardware not available, trying software events\n"); + printk(KERN_INFO "nmi_watchdog: hardware not available, trying software events\n"); wd_attr = &wd_sw_attr; wd_attr->sample_period = NSEC_PER_SEC; event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); @@ -182,7 +181,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, if (nmi_watchdog_enabled) { for_each_online_cpu(cpu) if (enable_nmi_watchdog(cpu)) { - printk("NMI watchdog failed configuration, " + printk(KERN_ERR "NMI watchdog failed configuration, " " can not be enabled\n"); } } else { -- cgit v1.2.3-70-g09d2 From 4e639fdf0d0d745648aa62228ab8a0d9c03a563f Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Thu, 25 Feb 2010 15:37:17 -0500 Subject: ibft: Update iBFT handling for v1.03 of the spec. - Use struct acpi_table_ibft instead of struct ibft_table_header - Don't do reserve_ibft_region() on UEFI machines (section 1.4.3.1) - If ibft_addr isn't initialized when ibft_init() is called, check for ACPI-based tables. - Fix compiler error when CONFIG_ACPI is not defined. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Peter Jones Signed-off-by: Mike Christie --- drivers/firmware/iscsi_ibft.c | 30 ++++++++++++++++++------------ drivers/firmware/iscsi_ibft_find.c | 35 ++++++++++++++++++++++++++++++----- include/linux/iscsi_ibft.h | 12 ++---------- 3 files changed, 50 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c index ed2801c378d..b3ab24f9d78 100644 --- a/drivers/firmware/iscsi_ibft.c +++ b/drivers/firmware/iscsi_ibft.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Red Hat, Inc. + * Copyright 2007-2010 Red Hat, Inc. * by Peter Jones * Copyright 2008 IBM, Inc. * by Konrad Rzeszutek @@ -19,6 +19,9 @@ * * Changelog: * + * 06 Jan 2010 - Peter Jones + * New changelog entries are in the git log from now on. Not here. + * * 14 Mar 2008 - Konrad Rzeszutek * Updated comments and copyrights. (v0.4.9) * @@ -78,9 +81,10 @@ #include #include #include +#include -#define IBFT_ISCSI_VERSION "0.4.9" -#define IBFT_ISCSI_DATE "2008-Mar-14" +#define IBFT_ISCSI_VERSION "0.5.0" +#define IBFT_ISCSI_DATE "2010-Feb-25" MODULE_AUTHOR("Peter Jones and \ Konrad Rzeszutek "); @@ -238,7 +242,7 @@ static const char *ibft_initiator_properties[] = */ struct ibft_kobject { - struct ibft_table_header *header; + struct acpi_table_ibft *header; union { struct ibft_initiator *initiator; struct ibft_nic *nic; @@ -536,12 +540,13 @@ static int __init ibft_check_device(void) u8 *pos; u8 csum = 0; - len = ibft_addr->length; + len = ibft_addr->header.length; /* Sanity checking of iBFT. */ - if (ibft_addr->revision != 1) { + if (ibft_addr->header.revision != 1) { printk(KERN_ERR "iBFT module supports only revision 1, " \ - "while this is %d.\n", ibft_addr->revision); + "while this is %d.\n", + ibft_addr->header.revision); return -ENOENT; } for (pos = (u8 *)ibft_addr; pos < (u8 *)ibft_addr + len; pos++) @@ -558,7 +563,7 @@ static int __init ibft_check_device(void) /* * Helper function for ibft_register_kobjects. */ -static int __init ibft_create_kobject(struct ibft_table_header *header, +static int __init ibft_create_kobject(struct acpi_table_ibft *header, struct ibft_hdr *hdr, struct list_head *list) { @@ -596,7 +601,7 @@ static int __init ibft_create_kobject(struct ibft_table_header *header, default: printk(KERN_ERR "iBFT has unknown structure type (%d). " \ "Report this bug to %.6s!\n", hdr->id, - header->oem_id); + header->header.oem_id); rc = 1; break; } @@ -649,7 +654,7 @@ out_invalid_struct: * found add them on the passed-in list. We do not support the other * fields at this point, so they are skipped. */ -static int __init ibft_register_kobjects(struct ibft_table_header *header, +static int __init ibft_register_kobjects(struct acpi_table_ibft *header, struct list_head *list) { struct ibft_control *control = NULL; @@ -660,7 +665,7 @@ static int __init ibft_register_kobjects(struct ibft_table_header *header, control = (void *)header + sizeof(*header); end = (void *)control + control->hdr.length; - eot_offset = (void *)header + header->length - (void *)control; + eot_offset = (void *)header + header->header.length - (void *)control; rc = ibft_verify_hdr("control", (struct ibft_hdr *)control, id_control, sizeof(*control)); @@ -672,7 +677,8 @@ static int __init ibft_register_kobjects(struct ibft_table_header *header, } for (ptr = &control->initiator_off; ptr < end; ptr += sizeof(u16)) { offset = *(u16 *)ptr; - if (offset && offset < header->length && offset < eot_offset) { + if (offset && offset < header->header.length && + offset < eot_offset) { rc = ibft_create_kobject(header, (void *)header + offset, list); diff --git a/drivers/firmware/iscsi_ibft_find.c b/drivers/firmware/iscsi_ibft_find.c index d6470ef36e4..dd85555d329 100644 --- a/drivers/firmware/iscsi_ibft_find.c +++ b/drivers/firmware/iscsi_ibft_find.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Red Hat, Inc. + * Copyright 2007-2010 Red Hat, Inc. * by Peter Jones * Copyright 2007 IBM, Inc. * by Konrad Rzeszutek @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -30,13 +31,15 @@ #include #include #include +#include +#include #include /* * Physical location of iSCSI Boot Format Table. */ -struct ibft_table_header *ibft_addr; +struct acpi_table_ibft *ibft_addr; EXPORT_SYMBOL_GPL(ibft_addr); #define IBFT_SIGN "iBFT" @@ -46,6 +49,13 @@ EXPORT_SYMBOL_GPL(ibft_addr); #define VGA_MEM 0xA0000 /* VGA buffer */ #define VGA_SIZE 0x20000 /* 128kB */ +#ifdef CONFIG_ACPI +static int __init acpi_find_ibft(struct acpi_table_header *header) +{ + ibft_addr = (struct acpi_table_ibft *)header; + return 0; +} +#endif /* CONFIG_ACPI */ /* * Routine used to find the iSCSI Boot Format Table. The logical @@ -59,6 +69,11 @@ unsigned long __init find_ibft_region(unsigned long *sizep) ibft_addr = NULL; + /* iBFT 1.03 section 1.4.3.1 mandates that UEFI machines will + * only use ACPI for this */ + if (efi_enabled) + return 0; + for (pos = IBFT_START; pos < IBFT_END; pos += 16) { /* The table can't be inside the VGA BIOS reserved space, * so skip that area */ @@ -72,14 +87,24 @@ unsigned long __init find_ibft_region(unsigned long *sizep) /* if the length of the table extends past 1M, * the table cannot be valid. */ if (pos + len <= (IBFT_END-1)) { - ibft_addr = (struct ibft_table_header *)virt; + ibft_addr = (struct acpi_table_ibft *)virt; break; } } } +#ifdef CONFIG_ACPI + /* + * One spec says "IBFT", the other says "iBFT". We have to check + * for both. + */ + if (!ibft_addr) + acpi_table_parse(ACPI_SIG_IBFT, acpi_find_ibft); + if (!ibft_addr) + acpi_table_parse("iBFT", acpi_find_ibft); +#endif /* CONFIG_ACPI */ if (ibft_addr) { - *sizep = PAGE_ALIGN(len); - return pos; + *sizep = PAGE_ALIGN(ibft_addr->header.length); + return (u64)isa_virt_to_bus(ibft_addr); } *sizep = 0; diff --git a/include/linux/iscsi_ibft.h b/include/linux/iscsi_ibft.h index d2e4042f8f5..8ba7e5b9d62 100644 --- a/include/linux/iscsi_ibft.h +++ b/include/linux/iscsi_ibft.h @@ -21,21 +21,13 @@ #ifndef ISCSI_IBFT_H #define ISCSI_IBFT_H -struct ibft_table_header { - char signature[4]; - u32 length; - u8 revision; - u8 checksum; - char oem_id[6]; - char oem_table_id[8]; - char reserved[24]; -} __attribute__((__packed__)); +#include /* * Logical location of iSCSI Boot Format Table. * If the value is NULL there is no iBFT on the machine. */ -extern struct ibft_table_header *ibft_addr; +extern struct acpi_table_ibft *ibft_addr; /* * Routine used to find and reserve the iSCSI Boot Format Table. The -- cgit v1.2.3-70-g09d2 From ba4ee30c6c797de148dcc7254cf6d531aba71d9b Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 12 Apr 2010 18:06:17 +0000 Subject: ibft: separate ibft parsing from sysfs interface Not all iscsi drivers support ibft. For drivers like be2iscsi that do not but are bootable through a vendor firmware specific format/process this patch moves the sysfs interface from the ibft code to a lib module. This then allows userspace tools to search for iscsi boot info in a common place and in a common format. ibft iscsi boot info is exported in the same place as it was before: /sys/firmware/ibft. vendor/fw boot info gets export in /sys/firmware/iscsi_bootX, where X is the scsi host number of the HBA. Underneath these parent dirs, the target, ethernet, and initiator dirs are the same as they were before. Signed-off-by: Mike Christie Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Peter Jones --- drivers/firmware/Kconfig | 8 + drivers/firmware/Makefile | 1 + drivers/firmware/iscsi_boot_sysfs.c | 481 ++++++++++++++++++++++++++++++++++++ include/linux/iscsi_boot_sysfs.h | 123 +++++++++ 4 files changed, 613 insertions(+) create mode 100644 drivers/firmware/iscsi_boot_sysfs.c create mode 100644 include/linux/iscsi_boot_sysfs.h (limited to 'include') diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 1b03ba1d083..571d2182613 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -122,6 +122,14 @@ config ISCSI_IBFT_FIND is necessary for iSCSI Boot Firmware Table Attributes module to work properly. +config ISCSI_BOOT_SYSFS + tristate "iSCSI Boot Sysfs Interface" + default n + help + This option enables support for exposing iSCSI boot information + via sysfs to userspace. If you wish to export this information, + say Y. Otherwise, say N. + config ISCSI_IBFT tristate "iSCSI Boot Firmware Table Attributes module" depends on ISCSI_IBFT_FIND diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index 1c3c17343db..5fe7e166292 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -10,4 +10,5 @@ obj-$(CONFIG_DCDBAS) += dcdbas.o obj-$(CONFIG_DMIID) += dmi-id.o obj-$(CONFIG_ISCSI_IBFT_FIND) += iscsi_ibft_find.o obj-$(CONFIG_ISCSI_IBFT) += iscsi_ibft.o +obj-$(CONFIG_ISCSI_BOOT_SYSFS) += iscsi_boot_sysfs.o obj-$(CONFIG_FIRMWARE_MEMMAP) += memmap.o diff --git a/drivers/firmware/iscsi_boot_sysfs.c b/drivers/firmware/iscsi_boot_sysfs.c new file mode 100644 index 00000000000..df6bff7366c --- /dev/null +++ b/drivers/firmware/iscsi_boot_sysfs.c @@ -0,0 +1,481 @@ +/* + * Export the iSCSI boot info to userland via sysfs. + * + * Copyright (C) 2010 Red Hat, Inc. All rights reserved. + * Copyright (C) 2010 Mike Christie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + + +MODULE_AUTHOR("Mike Christie "); +MODULE_DESCRIPTION("sysfs interface and helpers to export iSCSI boot information"); +MODULE_LICENSE("GPL"); +/* + * The kobject and attribute structures. + */ +struct iscsi_boot_attr { + struct attribute attr; + int type; + ssize_t (*show) (void *data, int type, char *buf); +}; + +/* + * The routine called for all sysfs attributes. + */ +static ssize_t iscsi_boot_show_attribute(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct iscsi_boot_kobj *boot_kobj = + container_of(kobj, struct iscsi_boot_kobj, kobj); + struct iscsi_boot_attr *boot_attr = + container_of(attr, struct iscsi_boot_attr, attr); + ssize_t ret = -EIO; + char *str = buf; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (boot_kobj->show) + ret = boot_kobj->show(boot_kobj->data, boot_attr->type, str); + return ret; +} + +static const struct sysfs_ops iscsi_boot_attr_ops = { + .show = iscsi_boot_show_attribute, +}; + +static void iscsi_boot_kobj_release(struct kobject *kobj) +{ + struct iscsi_boot_kobj *boot_kobj = + container_of(kobj, struct iscsi_boot_kobj, kobj); + + kfree(boot_kobj->data); + kfree(boot_kobj); +} + +static struct kobj_type iscsi_boot_ktype = { + .release = iscsi_boot_kobj_release, + .sysfs_ops = &iscsi_boot_attr_ops, +}; + +#define iscsi_boot_rd_attr(fnname, sysfs_name, attr_type) \ +static struct iscsi_boot_attr iscsi_boot_attr_##fnname = { \ + .attr = { .name = __stringify(sysfs_name), .mode = 0444 }, \ + .type = attr_type, \ +} + +/* Target attrs */ +iscsi_boot_rd_attr(tgt_index, index, ISCSI_BOOT_TGT_INDEX); +iscsi_boot_rd_attr(tgt_flags, flags, ISCSI_BOOT_TGT_FLAGS); +iscsi_boot_rd_attr(tgt_ip, ip-addr, ISCSI_BOOT_TGT_IP_ADDR); +iscsi_boot_rd_attr(tgt_port, port, ISCSI_BOOT_TGT_PORT); +iscsi_boot_rd_attr(tgt_lun, lun, ISCSI_BOOT_TGT_LUN); +iscsi_boot_rd_attr(tgt_chap, chap-type, ISCSI_BOOT_TGT_CHAP_TYPE); +iscsi_boot_rd_attr(tgt_nic, nic-assoc, ISCSI_BOOT_TGT_NIC_ASSOC); +iscsi_boot_rd_attr(tgt_name, target-name, ISCSI_BOOT_TGT_NAME); +iscsi_boot_rd_attr(tgt_chap_name, chap-name, ISCSI_BOOT_TGT_CHAP_NAME); +iscsi_boot_rd_attr(tgt_chap_secret, chap-secret, ISCSI_BOOT_TGT_CHAP_SECRET); +iscsi_boot_rd_attr(tgt_chap_rev_name, rev-chap-name, + ISCSI_BOOT_TGT_REV_CHAP_NAME); +iscsi_boot_rd_attr(tgt_chap_rev_secret, rev-chap-name-secret, + ISCSI_BOOT_TGT_REV_CHAP_SECRET); + +static struct attribute *target_attrs[] = { + &iscsi_boot_attr_tgt_index.attr, + &iscsi_boot_attr_tgt_flags.attr, + &iscsi_boot_attr_tgt_ip.attr, + &iscsi_boot_attr_tgt_port.attr, + &iscsi_boot_attr_tgt_lun.attr, + &iscsi_boot_attr_tgt_chap.attr, + &iscsi_boot_attr_tgt_nic.attr, + &iscsi_boot_attr_tgt_name.attr, + &iscsi_boot_attr_tgt_chap_name.attr, + &iscsi_boot_attr_tgt_chap_secret.attr, + &iscsi_boot_attr_tgt_chap_rev_name.attr, + &iscsi_boot_attr_tgt_chap_rev_secret.attr, + NULL +}; + +static mode_t iscsi_boot_tgt_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct iscsi_boot_kobj *boot_kobj = + container_of(kobj, struct iscsi_boot_kobj, kobj); + + if (attr == &iscsi_boot_attr_tgt_index.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_INDEX); + else if (attr == &iscsi_boot_attr_tgt_flags.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_FLAGS); + else if (attr == &iscsi_boot_attr_tgt_ip.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_IP_ADDR); + else if (attr == &iscsi_boot_attr_tgt_port.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_PORT); + else if (attr == &iscsi_boot_attr_tgt_lun.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_LUN); + else if (attr == &iscsi_boot_attr_tgt_chap.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_CHAP_TYPE); + else if (attr == &iscsi_boot_attr_tgt_nic.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_NIC_ASSOC); + else if (attr == &iscsi_boot_attr_tgt_name.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_NAME); + else if (attr == &iscsi_boot_attr_tgt_chap_name.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_CHAP_NAME); + else if (attr == &iscsi_boot_attr_tgt_chap_secret.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_CHAP_SECRET); + else if (attr == &iscsi_boot_attr_tgt_chap_rev_name.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_REV_CHAP_NAME); + else if (attr == &iscsi_boot_attr_tgt_chap_rev_secret.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_TGT_REV_CHAP_SECRET); + return 0; +} + +static struct attribute_group iscsi_boot_target_attr_group = { + .attrs = target_attrs, + .is_visible = iscsi_boot_tgt_attr_is_visible, +}; + +/* Ethernet attrs */ +iscsi_boot_rd_attr(eth_index, index, ISCSI_BOOT_ETH_INDEX); +iscsi_boot_rd_attr(eth_flags, flags, ISCSI_BOOT_ETH_FLAGS); +iscsi_boot_rd_attr(eth_ip, ip-addr, ISCSI_BOOT_ETH_IP_ADDR); +iscsi_boot_rd_attr(eth_subnet, subnet-mask, ISCSI_BOOT_ETH_SUBNET_MASK); +iscsi_boot_rd_attr(eth_origin, origin, ISCSI_BOOT_ETH_ORIGIN); +iscsi_boot_rd_attr(eth_gateway, gateway, ISCSI_BOOT_ETH_GATEWAY); +iscsi_boot_rd_attr(eth_primary_dns, primary-dns, ISCSI_BOOT_ETH_PRIMARY_DNS); +iscsi_boot_rd_attr(eth_secondary_dns, secondary-dns, + ISCSI_BOOT_ETH_SECONDARY_DNS); +iscsi_boot_rd_attr(eth_dhcp, dhcp, ISCSI_BOOT_ETH_DHCP); +iscsi_boot_rd_attr(eth_vlan, vlan, ISCSI_BOOT_ETH_VLAN); +iscsi_boot_rd_attr(eth_mac, mac, ISCSI_BOOT_ETH_MAC); +iscsi_boot_rd_attr(eth_hostname, hostname, ISCSI_BOOT_ETH_HOSTNAME); + +static struct attribute *ethernet_attrs[] = { + &iscsi_boot_attr_eth_index.attr, + &iscsi_boot_attr_eth_flags.attr, + &iscsi_boot_attr_eth_ip.attr, + &iscsi_boot_attr_eth_subnet.attr, + &iscsi_boot_attr_eth_origin.attr, + &iscsi_boot_attr_eth_gateway.attr, + &iscsi_boot_attr_eth_primary_dns.attr, + &iscsi_boot_attr_eth_secondary_dns.attr, + &iscsi_boot_attr_eth_dhcp.attr, + &iscsi_boot_attr_eth_vlan.attr, + &iscsi_boot_attr_eth_mac.attr, + &iscsi_boot_attr_eth_hostname.attr, + NULL +}; + +static mode_t iscsi_boot_eth_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct iscsi_boot_kobj *boot_kobj = + container_of(kobj, struct iscsi_boot_kobj, kobj); + + if (attr == &iscsi_boot_attr_eth_index.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_INDEX); + else if (attr == &iscsi_boot_attr_eth_flags.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_FLAGS); + else if (attr == &iscsi_boot_attr_eth_ip.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_IP_ADDR); + else if (attr == &iscsi_boot_attr_eth_subnet.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_SUBNET_MASK); + else if (attr == &iscsi_boot_attr_eth_origin.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_ORIGIN); + else if (attr == &iscsi_boot_attr_eth_gateway.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_GATEWAY); + else if (attr == &iscsi_boot_attr_eth_primary_dns.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_PRIMARY_DNS); + else if (attr == &iscsi_boot_attr_eth_secondary_dns.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_SECONDARY_DNS); + else if (attr == &iscsi_boot_attr_eth_dhcp.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_DHCP); + else if (attr == &iscsi_boot_attr_eth_vlan.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_VLAN); + else if (attr == &iscsi_boot_attr_eth_mac.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_MAC); + else if (attr == &iscsi_boot_attr_eth_hostname.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_ETH_HOSTNAME); + return 0; +} + +static struct attribute_group iscsi_boot_ethernet_attr_group = { + .attrs = ethernet_attrs, + .is_visible = iscsi_boot_eth_attr_is_visible, +}; + +/* Initiator attrs */ +iscsi_boot_rd_attr(ini_index, index, ISCSI_BOOT_INI_INDEX); +iscsi_boot_rd_attr(ini_flags, flags, ISCSI_BOOT_INI_FLAGS); +iscsi_boot_rd_attr(ini_isns, isns-server, ISCSI_BOOT_INI_ISNS_SERVER); +iscsi_boot_rd_attr(ini_slp, slp-server, ISCSI_BOOT_INI_SLP_SERVER); +iscsi_boot_rd_attr(ini_primary_radius, pri-radius-server, + ISCSI_BOOT_INI_PRI_RADIUS_SERVER); +iscsi_boot_rd_attr(ini_secondary_radius, sec-radius-server, + ISCSI_BOOT_INI_SEC_RADIUS_SERVER); +iscsi_boot_rd_attr(ini_name, initiator-name, ISCSI_BOOT_INI_INITIATOR_NAME); + +static struct attribute *initiator_attrs[] = { + &iscsi_boot_attr_ini_index.attr, + &iscsi_boot_attr_ini_flags.attr, + &iscsi_boot_attr_ini_isns.attr, + &iscsi_boot_attr_ini_slp.attr, + &iscsi_boot_attr_ini_primary_radius.attr, + &iscsi_boot_attr_ini_secondary_radius.attr, + &iscsi_boot_attr_ini_name.attr, + NULL +}; + +static mode_t iscsi_boot_ini_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct iscsi_boot_kobj *boot_kobj = + container_of(kobj, struct iscsi_boot_kobj, kobj); + + if (attr == &iscsi_boot_attr_ini_index.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_INDEX); + if (attr == &iscsi_boot_attr_ini_flags.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_FLAGS); + if (attr == &iscsi_boot_attr_ini_isns.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_ISNS_SERVER); + if (attr == &iscsi_boot_attr_ini_slp.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_SLP_SERVER); + if (attr == &iscsi_boot_attr_ini_primary_radius.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_PRI_RADIUS_SERVER); + if (attr == &iscsi_boot_attr_ini_secondary_radius.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_SEC_RADIUS_SERVER); + if (attr == &iscsi_boot_attr_ini_name.attr) + return boot_kobj->is_visible(boot_kobj->data, + ISCSI_BOOT_INI_INITIATOR_NAME); + + return 0; +} + +static struct attribute_group iscsi_boot_initiator_attr_group = { + .attrs = initiator_attrs, + .is_visible = iscsi_boot_ini_attr_is_visible, +}; + +static struct iscsi_boot_kobj * +iscsi_boot_create_kobj(struct iscsi_boot_kset *boot_kset, + struct attribute_group *attr_group, + const char *name, int index, void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)) +{ + struct iscsi_boot_kobj *boot_kobj; + + boot_kobj = kzalloc(sizeof(*boot_kobj), GFP_KERNEL); + if (!boot_kobj) + return NULL; + INIT_LIST_HEAD(&boot_kobj->list); + + boot_kobj->kobj.kset = boot_kset->kset; + if (kobject_init_and_add(&boot_kobj->kobj, &iscsi_boot_ktype, + NULL, name, index)) { + kfree(boot_kobj); + return NULL; + } + boot_kobj->data = data; + boot_kobj->show = show; + boot_kobj->is_visible = is_visible; + + if (sysfs_create_group(&boot_kobj->kobj, attr_group)) { + /* + * We do not want to free this because the caller + * will assume that since the creation call failed + * the boot kobj was not setup and the normal release + * path is not being run. + */ + boot_kobj->data = NULL; + kobject_put(&boot_kobj->kobj); + return NULL; + } + boot_kobj->attr_group = attr_group; + + kobject_uevent(&boot_kobj->kobj, KOBJ_ADD); + /* Nothing broke so lets add it to the list. */ + list_add_tail(&boot_kobj->list, &boot_kset->kobj_list); + return boot_kobj; +} + +static void iscsi_boot_remove_kobj(struct iscsi_boot_kobj *boot_kobj) +{ + list_del(&boot_kobj->list); + sysfs_remove_group(&boot_kobj->kobj, boot_kobj->attr_group); + kobject_put(&boot_kobj->kobj); +} + +/** + * iscsi_boot_create_target() - create boot target sysfs dir + * @boot_kset: boot kset + * @index: the target id + * @data: driver specific data for target + * @show: attr show function + * @is_visible: attr visibility function + * + * Note: The boot sysfs lib will free the data passed in for the caller + * when all refs to the target kobject have been released. + */ +struct iscsi_boot_kobj * +iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index, + void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)) +{ + return iscsi_boot_create_kobj(boot_kset, &iscsi_boot_target_attr_group, + "target%d", index, data, show, is_visible); +} +EXPORT_SYMBOL_GPL(iscsi_boot_create_target); + +/** + * iscsi_boot_create_initiator() - create boot initiator sysfs dir + * @boot_kset: boot kset + * @index: the initiator id + * @data: driver specific data + * @show: attr show function + * @is_visible: attr visibility function + * + * Note: The boot sysfs lib will free the data passed in for the caller + * when all refs to the initiator kobject have been released. + */ +struct iscsi_boot_kobj * +iscsi_boot_create_initiator(struct iscsi_boot_kset *boot_kset, int index, + void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)) +{ + return iscsi_boot_create_kobj(boot_kset, + &iscsi_boot_initiator_attr_group, + "initiator", index, data, show, + is_visible); +} +EXPORT_SYMBOL_GPL(iscsi_boot_create_initiator); + +/** + * iscsi_boot_create_ethernet() - create boot ethernet sysfs dir + * @boot_kset: boot kset + * @index: the ethernet device id + * @data: driver specific data + * @show: attr show function + * @is_visible: attr visibility function + * + * Note: The boot sysfs lib will free the data passed in for the caller + * when all refs to the ethernet kobject have been released. + */ +struct iscsi_boot_kobj * +iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index, + void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)) +{ + return iscsi_boot_create_kobj(boot_kset, + &iscsi_boot_ethernet_attr_group, + "ethernet%d", index, data, show, + is_visible); +} +EXPORT_SYMBOL_GPL(iscsi_boot_create_ethernet); + +/** + * iscsi_boot_create_kset() - creates root sysfs tree + * @set_name: name of root dir + */ +struct iscsi_boot_kset *iscsi_boot_create_kset(const char *set_name) +{ + struct iscsi_boot_kset *boot_kset; + + boot_kset = kzalloc(sizeof(*boot_kset), GFP_KERNEL); + if (!boot_kset) + return NULL; + + boot_kset->kset = kset_create_and_add(set_name, NULL, firmware_kobj); + if (!boot_kset->kset) { + kfree(boot_kset); + return NULL; + } + + INIT_LIST_HEAD(&boot_kset->kobj_list); + return boot_kset; +} +EXPORT_SYMBOL_GPL(iscsi_boot_create_kset); + +/** + * iscsi_boot_create_host_kset() - creates root sysfs tree for a scsi host + * @hostno: host number of scsi host + */ +struct iscsi_boot_kset *iscsi_boot_create_host_kset(unsigned int hostno) +{ + struct iscsi_boot_kset *boot_kset; + char *set_name; + + set_name = kasprintf(GFP_KERNEL, "iscsi_boot%u", hostno); + if (!set_name) + return NULL; + + boot_kset = iscsi_boot_create_kset(set_name); + kfree(set_name); + return boot_kset; +} +EXPORT_SYMBOL_GPL(iscsi_boot_create_host_kset); + +/** + * iscsi_boot_destroy_kset() - destroy kset and kobjects under it + * @boot_kset: boot kset + * + * This will remove the kset and kobjects and attrs under it. + */ +void iscsi_boot_destroy_kset(struct iscsi_boot_kset *boot_kset) +{ + struct iscsi_boot_kobj *boot_kobj, *tmp_kobj; + + list_for_each_entry_safe(boot_kobj, tmp_kobj, + &boot_kset->kobj_list, list) + iscsi_boot_remove_kobj(boot_kobj); + + kset_unregister(boot_kset->kset); +} +EXPORT_SYMBOL_GPL(iscsi_boot_destroy_kset); diff --git a/include/linux/iscsi_boot_sysfs.h b/include/linux/iscsi_boot_sysfs.h new file mode 100644 index 00000000000..f1e6c184f14 --- /dev/null +++ b/include/linux/iscsi_boot_sysfs.h @@ -0,0 +1,123 @@ +/* + * Export the iSCSI boot info to userland via sysfs. + * + * Copyright (C) 2010 Red Hat, Inc. All rights reserved. + * Copyright (C) 2010 Mike Christie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef _ISCSI_BOOT_SYSFS_ +#define _ISCSI_BOOT_SYSFS_ + +/* + * The text attributes names for each of the kobjects. +*/ +enum iscsi_boot_eth_properties_enum { + ISCSI_BOOT_ETH_INDEX, + ISCSI_BOOT_ETH_FLAGS, + ISCSI_BOOT_ETH_IP_ADDR, + ISCSI_BOOT_ETH_SUBNET_MASK, + ISCSI_BOOT_ETH_ORIGIN, + ISCSI_BOOT_ETH_GATEWAY, + ISCSI_BOOT_ETH_PRIMARY_DNS, + ISCSI_BOOT_ETH_SECONDARY_DNS, + ISCSI_BOOT_ETH_DHCP, + ISCSI_BOOT_ETH_VLAN, + ISCSI_BOOT_ETH_MAC, + /* eth_pci_bdf - this is replaced by link to the device itself. */ + ISCSI_BOOT_ETH_HOSTNAME, + ISCSI_BOOT_ETH_END_MARKER, +}; + +enum iscsi_boot_tgt_properties_enum { + ISCSI_BOOT_TGT_INDEX, + ISCSI_BOOT_TGT_FLAGS, + ISCSI_BOOT_TGT_IP_ADDR, + ISCSI_BOOT_TGT_PORT, + ISCSI_BOOT_TGT_LUN, + ISCSI_BOOT_TGT_CHAP_TYPE, + ISCSI_BOOT_TGT_NIC_ASSOC, + ISCSI_BOOT_TGT_NAME, + ISCSI_BOOT_TGT_CHAP_NAME, + ISCSI_BOOT_TGT_CHAP_SECRET, + ISCSI_BOOT_TGT_REV_CHAP_NAME, + ISCSI_BOOT_TGT_REV_CHAP_SECRET, + ISCSI_BOOT_TGT_END_MARKER, +}; + +enum iscsi_boot_initiator_properties_enum { + ISCSI_BOOT_INI_INDEX, + ISCSI_BOOT_INI_FLAGS, + ISCSI_BOOT_INI_ISNS_SERVER, + ISCSI_BOOT_INI_SLP_SERVER, + ISCSI_BOOT_INI_PRI_RADIUS_SERVER, + ISCSI_BOOT_INI_SEC_RADIUS_SERVER, + ISCSI_BOOT_INI_INITIATOR_NAME, + ISCSI_BOOT_INI_END_MARKER, +}; + +struct attribute_group; + +struct iscsi_boot_kobj { + struct kobject kobj; + struct attribute_group *attr_group; + struct list_head list; + + /* + * Pointer to store driver specific info. If set this will + * be freed for the LLD when the kobj release function is called. + */ + void *data; + /* + * Driver specific show function. + * + * The enum of the type. This can be any value of the above + * properties. + */ + ssize_t (*show) (void *data, int type, char *buf); + + /* + * Drivers specific visibility function. + * The function should return if they the attr should be readable + * writable or should not be shown. + * + * The enum of the type. This can be any value of the above + * properties. + */ + mode_t (*is_visible) (void *data, int type); +}; + +struct iscsi_boot_kset { + struct list_head kobj_list; + struct kset *kset; +}; + +struct iscsi_boot_kobj * +iscsi_boot_create_initiator(struct iscsi_boot_kset *boot_kset, int index, + void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)); + +struct iscsi_boot_kobj * +iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index, + void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)); +struct iscsi_boot_kobj * +iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index, + void *data, + ssize_t (*show) (void *data, int type, char *buf), + mode_t (*is_visible) (void *data, int type)); + +struct iscsi_boot_kset *iscsi_boot_create_kset(const char *set_name); +struct iscsi_boot_kset *iscsi_boot_create_host_kset(unsigned int hostno); +void iscsi_boot_destroy_kset(struct iscsi_boot_kset *boot_kset); + +#endif -- cgit v1.2.3-70-g09d2 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - seperated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 2 + arch/x86/include/asm/nmi.h | 2 +- arch/x86/kernel/apic/Makefile | 4 +- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 +- include/linux/nmi.h | 8 +- include/linux/sched.h | 6 + init/Kconfig | 5 +- kernel/Makefile | 3 +- kernel/sysctl.c | 21 +- kernel/watchdog.c | 592 ++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 30 +- 12 files changed, 650 insertions(+), 29 deletions(-) create mode 100644 kernel/watchdog.c (limited to 'include') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 839b21b0699..dfe8d1c226c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file nousb [USB] Disable the USB subsystem + nowatchdog [KNL] Disable the lockup detector. + nowb [ARM] nox2apic [X86-64,APIC] Do not enable x2APIC mode. diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5b41b0feb6d..932f0f86b4b 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); -#if !defined(CONFIG_NMI_WATCHDOG) +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; #endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e48d2..52f32e0ea19 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0be5d..79425f96fce 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab3ef3..bd347c2b34d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 22cc7960b64..abd48aacaf7 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); @@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void) } #endif -#ifdef CONFIG_NMI_WATCHDOG +#ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(void); -extern int nmi_watchdog_enabled; +extern int watchdog_enabled; struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , +extern int proc_dowatchdog_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf..37efe8fa530 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_LOCKUP_DETECTOR +extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/init/Kconfig b/init/Kconfig index c6c8903cb53..e44e25422f2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -944,8 +944,11 @@ config PERF_USE_VMALLOC config PERF_EVENTS_NMI bool + depends on PERF_EVENTS help - Arch has support for nmi_watchdog + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. menu "Kernel Performance Events And Counters" diff --git a/kernel/Makefile b/kernel/Makefile index d5c30060ac1..6adeafc3e25 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o -obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a38af430f0d..0f9adda85f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -74,7 +74,7 @@ #include #endif -#ifdef CONFIG_NMI_WATCHDOG +#ifdef CONFIG_LOCKUP_DETECTOR #include #endif @@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_NMI_WATCHDOG) +#if defined(CONFIG_LOCKUP_DETECTOR) { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, + .procname = "watchdog", + .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_nmi_enabled, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, }, #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 00000000000..6b7fad8497a --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,592 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_PERF_EVENTS_NMI +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __read_mostly did_panic; +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_PERF_EVENTS_NMI +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = raw_smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_watchdog(void) +{ + __get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_watchdog); + +void touch_all_watchdog(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? + * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +void touch_nmi_watchdog(void) +{ + touch_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +void touch_all_nmi_watchdog(void) +{ + touch_all_watchdog(); +} + +void touch_softlockup_watchdog(void) +{ + touch_watchdog(); +} + +void touch_all_softlockup_watchdogs(void) +{ + touch_all_watchdog(); +} + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +void softlockup_tick(void) +{ +} + +#ifdef CONFIG_PERF_EVENTS_NMI +/* watchdog detector functions */ +static int is_hardlockup(int cpu) +{ + unsigned long hrint = per_cpu(hrtimer_interrupts, cpu); + + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + return 1; + + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts, int cpu) +{ + unsigned long now = get_timestamp(cpu); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +static int +watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = watchdog_panic, +}; + +#ifdef CONFIG_PERF_EVENTS_NMI +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_ts = per_cpu(watchdog_touch_ts, this_cpu); + + if (touch_ts == 0) { + __touch_watchdog(); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup(this_cpu)) { + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_PERF_EVENTS_NMI */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(per_cpu(softlockup_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlockup_touch_sync, this_cpu) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts, this_cpu); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + this_cpu, duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *__bind_cpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, (unsigned long)__bind_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in softlockup_tick(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_PERF_EVENTS_NMI +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return -1; + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_PERF_EVENTS_NMI */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + + /* enable the perf event */ + if (watchdog_nmi_enable(cpu) != 0) + return -1; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return -1; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +/* stub functions */ +int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos); +} +/* end of stub functions */ +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (watchdog_prepare_cpu(hotcpu)) + return NOTIFY_BAD; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enable(hotcpu)) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(err == NOTIFY_BAD); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 220ae6063b6..49e285dcaf5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -153,7 +153,7 @@ config DEBUG_SHIRQ points; some don't and need to be caught. config DETECT_SOFTLOCKUP - bool "Detect Soft Lockups" + bool depends on DEBUG_KERNEL && !S390 default y help @@ -171,17 +171,27 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) -config NMI_WATCHDOG - bool "Detect Hard Lockups with an NMI Watchdog" - depends on DEBUG_KERNEL && PERF_EVENTS && PERF_EVENTS_NMI +config LOCKUP_DETECTOR + bool "Detect Hard and Soft Lockups" + depends on DEBUG_KERNEL + default DETECT_SOFTLOCKUP help - Say Y here to enable the kernel to use the NMI as a watchdog - to detect hard lockups. This is useful when a cpu hangs for no - reason but can still respond to NMIs. A backtrace is displayed - for reviewing and reporting. + Say Y here to enable the kernel to act as a watchdog to detect + hard and soft lockups. + + Softlockups are bugs that cause the kernel to loop in kernel + mode for more than 60 seconds, without giving other tasks a + chance to run. The current stack trace is displayed upon + detection and the system will stay locked up. + + Hardlockups are bugs that cause the CPU to loop in kernel mode + for more than 60 seconds, without letting other interrupts have a + chance to run. The current stack trace is displayed upon detection + and the system will stay locked up. - The overhead should be minimal, just an extra NMI every few - seconds. + The overhead should be minimal. A periodic hrtimer runs to + generate interrupts and kick the watchdog task every 10-12 seconds. + An NMI is generated every 60 seconds or so to check for hardlockups. config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" -- cgit v1.2.3-70-g09d2 From 332fbdbca3f7716c5620970755ae054d213bcc4e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:45 -0400 Subject: lockup_detector: Touch_softlockup cleanups and softlockup_tick removal Just some code cleanup to make touch_softlockup clearer and remove the softlockup_tick function as it is no longer needed. Also remove the /proc softlockup_thres call as it has been changed to watchdog_thres. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-3-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 16 +++------------- kernel/sysctl.c | 9 --------- kernel/timer.c | 1 - kernel/watchdog.c | 35 +++-------------------------------- 4 files changed, 6 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 37efe8fa530..33f9b2ad0bb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -312,19 +312,15 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP -extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); -extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; #else -static inline void softlockup_tick(void) -{ -} static inline void touch_softlockup_watchdog(void) { } @@ -346,12 +342,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); #endif -#ifdef CONFIG_LOCKUP_DETECTOR -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -#endif - /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f9adda85f9..999bc3fccf4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -817,15 +817,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, #endif #ifdef CONFIG_DETECT_HUNG_TASK { diff --git a/kernel/timer.c b/kernel/timer.c index aeb6a54f277..e8de5eb07a0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1225,7 +1225,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6b7fad8497a..f1541b7e324 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -119,13 +119,12 @@ static void __touch_watchdog(void) __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); } -void touch_watchdog(void) +void touch_softlockup_watchdog(void) { __get_cpu_var(watchdog_touch_ts) = 0; } -EXPORT_SYMBOL(touch_watchdog); -void touch_all_watchdog(void) +void touch_all_softlockup_watchdogs(void) { int cpu; @@ -140,35 +139,16 @@ void touch_all_watchdog(void) void touch_nmi_watchdog(void) { - touch_watchdog(); + touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); -void touch_all_nmi_watchdog(void) -{ - touch_all_watchdog(); -} - -void touch_softlockup_watchdog(void) -{ - touch_watchdog(); -} - -void touch_all_softlockup_watchdogs(void) -{ - touch_all_watchdog(); -} - void touch_softlockup_watchdog_sync(void) { __raw_get_cpu_var(softlockup_touch_sync) = true; __raw_get_cpu_var(watchdog_touch_ts) = 0; } -void softlockup_tick(void) -{ -} - #ifdef CONFIG_PERF_EVENTS_NMI /* watchdog detector functions */ static int is_hardlockup(int cpu) @@ -522,15 +502,6 @@ int proc_dowatchdog_thresh(struct ctl_table *table, int write, { return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } - -/* stub functions */ -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos); -} -/* end of stub functions */ #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3-70-g09d2 From 19cc36c0f0457e5c6629ec24036fbbe8255c88ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 May 2010 02:30:49 +0200 Subject: lockup_detector: Fix forgotten config conversion Fix forgotten CONFIG_DETECT_SOFTLOCKUP -> CONFIG_LOCKUP_DETECTOR in sched.h Fixes: arch/x86/built-in.o: In function `touch_nmi_watchdog': (.text+0x1bd59): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `show_state_filter': (.text+0x10d01): undefined reference to `touch_all_softlockup_watchdogs' kernel/built-in.o: In function `sched_clock_idle_wakeup_event': (.text+0x362f9): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `timekeeping_resume': timekeeping.c:(.text+0x38757): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `tick_nohz_handler': tick-sched.c:(.text+0x3e5b9): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `tick_sched_timer': tick-sched.c:(.text+0x3e671): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `tick_check_idle': (.text+0x3e90b): undefined reference to `touch_softlockup_watchdog' Signed-off-by: Frederic Weisbecker Cc: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 33f9b2ad0bb..3958e0cd24f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -311,7 +311,7 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); -- cgit v1.2.3-70-g09d2 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- include/linux/nmi.h | 2 +- kernel/watchdog.c | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0ea19..910f20b457c 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/include/linux/nmi.h b/include/linux/nmi.h index abd48aacaf7..06aab5eee13 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_LOCKUP_DETECTOR +#ifndef CONFIG_HARDLOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 83fb63155cb..e53622c1465 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,13 +31,13 @@ int watchdog_enabled; int __read_mostly softlockup_thresh = 60; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, soft_watchdog_warn); #ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); @@ -139,6 +139,7 @@ void touch_all_softlockup_watchdogs(void) per_cpu(watchdog_touch_ts, cpu) = 0; } +#ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { __get_cpu_var(watchdog_nmi_touch) = true; @@ -146,6 +147,8 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); +#endif + void touch_softlockup_watchdog_sync(void) { __raw_get_cpu_var(softlockup_touch_sync) = true; -- cgit v1.2.3-70-g09d2 From b6f4bb383d69cac46f17e2305720f9a3d426c5ed Mon Sep 17 00:00:00 2001 From: "apatard@mandriva.com" Date: Sat, 15 May 2010 17:30:01 +0200 Subject: ASoC: Add SOC_DOUBLE_R_SX_TLV control This patch is adding a new control which has the following capabilities: - tlv - variable data size (for instance, 7 ou 8 bit) - double mixer - data range centered around 0 Signed-off-by: Arnaud Patard Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc.h | 21 ++++++++++++ sound/soc/soc-core.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 697e7ffe39d..65e9d03ed4f 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -170,6 +170,21 @@ .get = xhandler_get, .put = xhandler_put, \ .private_value = (unsigned long)&xenum } +#define SOC_DOUBLE_R_SX_TLV(xname, xreg_left, xreg_right, xshift,\ + xmin, xmax, tlv_array) \ +{ .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname), \ + .access = SNDRV_CTL_ELEM_ACCESS_TLV_READ | \ + SNDRV_CTL_ELEM_ACCESS_READWRITE, \ + .tlv.p = (tlv_array), \ + .info = snd_soc_info_volsw_2r_sx, \ + .get = snd_soc_get_volsw_2r_sx, \ + .put = snd_soc_put_volsw_2r_sx, \ + .private_value = (unsigned long)&(struct soc_mixer_control) \ + {.reg = xreg_left, \ + .rreg = xreg_right, .shift = xshift, \ + .min = xmin, .max = xmax} } + + /* * Simplified versions of above macros, declaring a struct and calculating * ARRAY_SIZE internally @@ -329,6 +344,12 @@ int snd_soc_put_volsw_s8(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol); int snd_soc_limit_volume(struct snd_soc_codec *codec, const char *name, int max); +int snd_soc_info_volsw_2r_sx(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo); +int snd_soc_get_volsw_2r_sx(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol); +int snd_soc_put_volsw_2r_sx(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol); /** * struct snd_soc_jack_pin - Describes a pin to update based on jack detection diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index e1043f64473..6220bc1ee42 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2351,6 +2351,101 @@ int snd_soc_limit_volume(struct snd_soc_codec *codec, } EXPORT_SYMBOL_GPL(snd_soc_limit_volume); +/** + * snd_soc_info_volsw_2r_sx - double with tlv and variable data size + * mixer info callback + * @kcontrol: mixer control + * @uinfo: control element information + * + * Returns 0 for success. + */ +int snd_soc_info_volsw_2r_sx(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + struct soc_mixer_control *mc = + (struct soc_mixer_control *)kcontrol->private_value; + int max = mc->max; + int min = mc->min; + + uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 2; + uinfo->value.integer.min = 0; + uinfo->value.integer.max = max-min; + + return 0; +} +EXPORT_SYMBOL_GPL(snd_soc_info_volsw_2r_sx); + +/** + * snd_soc_get_volsw_2r_sx - double with tlv and variable data size + * mixer get callback + * @kcontrol: mixer control + * @uinfo: control element information + * + * Returns 0 for success. + */ +int snd_soc_get_volsw_2r_sx(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mc = + (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + unsigned int mask = (1<shift)-1; + int min = mc->min; + int val = snd_soc_read(codec, mc->reg) & mask; + int valr = snd_soc_read(codec, mc->rreg) & mask; + + ucontrol->value.integer.value[0] = ((val & 0xff)-min); + ucontrol->value.integer.value[1] = ((valr & 0xff)-min); + return 0; +} +EXPORT_SYMBOL_GPL(snd_soc_get_volsw_2r_sx); + +/** + * snd_soc_put_volsw_2r_sx - double with tlv and variable data size + * mixer put callback + * @kcontrol: mixer control + * @uinfo: control element information + * + * Returns 0 for success. + */ +int snd_soc_put_volsw_2r_sx(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mc = + (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + unsigned int mask = (1<shift)-1; + int min = mc->min; + int ret; + unsigned int val, valr, oval, ovalr; + + val = ((ucontrol->value.integer.value[0]+min) & 0xff); + val &= mask; + valr = ((ucontrol->value.integer.value[1]+min) & 0xff); + valr &= mask; + + oval = snd_soc_read(codec, mc->reg) & mask; + ovalr = snd_soc_read(codec, mc->rreg) & mask; + + ret = 0; + if (oval != val) { + ret = snd_soc_write(codec, mc->reg, val); + if (ret < 0) + return 0; + ret = 1; + } + if (ovalr != valr) { + ret = snd_soc_write(codec, mc->rreg, valr); + if (ret < 0) + return 0; + ret = 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(snd_soc_put_volsw_2r_sx); + /** * snd_soc_dai_set_sysclk - configure DAI system or master clock. * @dai: DAI -- cgit v1.2.3-70-g09d2 From 15c0cee6c809a137e0fc7f1d2b0867cc03473c0c Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Fri, 28 May 2010 11:43:45 -0400 Subject: ALSA: pcm: Define G723 3-bit and 5-bit formats This defines the 24bps and 40bps (8khz sample rate) G.723 codec formats. They are going to be used once I submit the driver for an mpeg4/g723 compression card. I've updated the signed value to -1 as per Takashi's comments since these are non-linear formats. Signed-off-by: Ben Collins Signed-off-by: Takashi Iwai --- include/sound/asound.h | 6 +++++- include/sound/pcm.h | 4 ++++ sound/core/pcm_misc.c | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/asound.h b/include/sound/asound.h index 9f1eecf99e6..a1803ecea34 100644 --- a/include/sound/asound.h +++ b/include/sound/asound.h @@ -212,7 +212,11 @@ typedef int __bitwise snd_pcm_format_t; #define SNDRV_PCM_FORMAT_S18_3BE ((__force snd_pcm_format_t) 41) /* in three bytes */ #define SNDRV_PCM_FORMAT_U18_3LE ((__force snd_pcm_format_t) 42) /* in three bytes */ #define SNDRV_PCM_FORMAT_U18_3BE ((__force snd_pcm_format_t) 43) /* in three bytes */ -#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_U18_3BE +#define SNDRV_PCM_FORMAT_G723_24 ((__force snd_pcm_format_t) 44) /* 8 samples in 3 bytes */ +#define SNDRV_PCM_FORMAT_G723_24_1B ((__force snd_pcm_format_t) 45) /* 1 sample in 1 byte */ +#define SNDRV_PCM_FORMAT_G723_40 ((__force snd_pcm_format_t) 46) /* 8 Samples in 5 bytes */ +#define SNDRV_PCM_FORMAT_G723_40_1B ((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */ +#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_G723_40_1B #ifdef SNDRV_LITTLE_ENDIAN #define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE diff --git a/include/sound/pcm.h b/include/sound/pcm.h index dd76cdede64..07fd630db88 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -174,6 +174,10 @@ struct snd_pcm_ops { #define SNDRV_PCM_FMTBIT_U18_3LE (1ULL << SNDRV_PCM_FORMAT_U18_3LE) #define SNDRV_PCM_FMTBIT_S18_3BE (1ULL << SNDRV_PCM_FORMAT_S18_3BE) #define SNDRV_PCM_FMTBIT_U18_3BE (1ULL << SNDRV_PCM_FORMAT_U18_3BE) +#define SNDRV_PCM_FMTBIT_G723_24 (1ULL << SNDRV_PCM_FORMAT_G723_24) +#define SNDRV_PCM_FMTBIT_G723_24_1B (1ULL << SNDRV_PCM_FORMAT_G723_24_1B) +#define SNDRV_PCM_FMTBIT_G723_40 (1ULL << SNDRV_PCM_FORMAT_G723_40) +#define SNDRV_PCM_FMTBIT_G723_40_1B (1ULL << SNDRV_PCM_FORMAT_G723_40_1B) #ifdef SNDRV_LITTLE_ENDIAN #define SNDRV_PCM_FMTBIT_S16 SNDRV_PCM_FMTBIT_S16_LE diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c index ea2bf82c937..434af3c56d5 100644 --- a/sound/core/pcm_misc.c +++ b/sound/core/pcm_misc.c @@ -128,6 +128,14 @@ static struct pcm_format_data pcm_formats[SNDRV_PCM_FORMAT_LAST+1] = { .width = 4, .phys = 4, .le = -1, .signd = -1, .silence = {}, }, + [SNDRV_PCM_FORMAT_G723_24] = { + .width = 3, .phys = 3, .le = -1, .signd = -1, + .silence = {}, + }, + [SNDRV_PCM_FORMAT_G723_40] = { + .width = 5, .phys = 5, .le = -1, .signd = -1, + .silence = {}, + }, /* FIXME: the following three formats are not defined properly yet */ [SNDRV_PCM_FORMAT_MPEG] = { .le = -1, .signd = -1, @@ -186,6 +194,14 @@ static struct pcm_format_data pcm_formats[SNDRV_PCM_FORMAT_LAST+1] = { .width = 18, .phys = 24, .le = 0, .signd = 0, .silence = { 0x02, 0x00, 0x00 }, }, + [SNDRV_PCM_FORMAT_G723_24_1B] = { + .width = 3, .phys = 8, .le = -1, .signd = -1, + .silence = {}, + }, + [SNDRV_PCM_FORMAT_G723_40_1B] = { + .width = 5, .phys = 8, .le = -1, .signd = -1, + .silence = {}, + }, }; -- cgit v1.2.3-70-g09d2 From 06c4648d46d1b757d6b9591a86810be79818b60c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 26 May 2010 00:09:42 +0000 Subject: arp_notify: allow drivers to explicitly request a notification event. Currently such notifications are only generated when the device comes up or the address changes. However one use case for these notifications is to enable faster network recovery after a virtual machine migration (by causing switches to relearn their MAC tables). A migration appears to the network stack as a temporary loss of carrier and therefore does not trigger either of the current conditions. Rather than adding carrier up as a trigger (which can cause issues when interfaces a flapping) simply add an interface which the driver can use to explicitly trigger the notification. Signed-off-by: Ian Campbell Cc: Stephen Hemminger Cc: Jeremy Fitzhardinge Cc: David S. Miller Cc: netdev@vger.kernel.org Cc: stable@kernel.org Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ include/linux/notifier.h | 1 + net/ipv4/devinet.c | 1 + net/sched/sch_generic.c | 18 ++++++++++++++++++ 4 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 40291f37502..a24916156f4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1772,6 +1772,8 @@ extern void netif_carrier_on(struct net_device *dev); extern void netif_carrier_off(struct net_device *dev); +extern void netif_notify_peers(struct net_device *dev); + /** * netif_dormant_on - mark device as dormant. * @dev: network device diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 540703b555c..22c2abb6197 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -210,6 +210,7 @@ static inline int notifier_to_errno(int ret) #define NETDEV_POST_INIT 0x0010 #define NETDEV_UNREGISTER_BATCH 0x0011 #define NETDEV_BONDING_DESLAVE 0x0012 +#define NETDEV_NOTIFY_PEERS 0x0012 #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 382bc768ed5..da14c49284f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1081,6 +1081,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } ip_mc_up(in_dev); /* fall through */ + case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEADDR: /* Send gratuitous ARP to notify of link change */ if (IN_DEV_ARP_NOTIFY(in_dev)) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a63029ef3ed..bd1892fe4b2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -327,6 +327,24 @@ void netif_carrier_off(struct net_device *dev) } EXPORT_SYMBOL(netif_carrier_off); +/** + * netif_notify_peers - notify network peers about existence of @dev + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void netif_notify_peers(struct net_device *dev) +{ + rtnl_lock(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(netif_notify_peers); + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. -- cgit v1.2.3-70-g09d2 From 38117d1495e587fbb10d6e55733139a27893cef5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 31 May 2010 00:28:35 -0700 Subject: net: Fix NETDEV_NOTIFY_PEERS to not conflict with NETDEV_BONDING_DESLAVE. Signed-off-by: David S. Miller --- include/linux/notifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 22c2abb6197..b2f1a4d8355 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -210,7 +210,7 @@ static inline int notifier_to_errno(int ret) #define NETDEV_POST_INIT 0x0010 #define NETDEV_UNREGISTER_BATCH 0x0011 #define NETDEV_BONDING_DESLAVE 0x0012 -#define NETDEV_NOTIFY_PEERS 0x0012 +#define NETDEV_NOTIFY_PEERS 0x0013 #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN -- cgit v1.2.3-70-g09d2 From ea762b047e13ba1cba4d58323b5c00a566610198 Mon Sep 17 00:00:00 2001 From: "apatard@mandriva.com" Date: Thu, 27 May 2010 14:57:40 +0200 Subject: ASoC: Add SND_SOC_DAPM_PRE_POST_PMD event Some systems codecs need to configure some registers before and after powering down some of their part. As a convenience add a macro for that. Signed-off-by: Arnaud Patard Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 66ff4c124db..c5d9987bc89 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -273,6 +273,8 @@ #define SND_SOC_DAPM_POST_PMD 0x8 /* after widget power down */ #define SND_SOC_DAPM_PRE_REG 0x10 /* before audio path setup */ #define SND_SOC_DAPM_POST_REG 0x20 /* after audio path setup */ +#define SND_SOC_DAPM_PRE_POST_PMD \ + (SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD) /* convenience event type detection */ #define SND_SOC_DAPM_EVENT_ON(e) \ -- cgit v1.2.3-70-g09d2 From 01d73a6967f12fe6c4bbde1834a9fe662264a2eb Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Thu, 27 May 2010 13:40:24 -0600 Subject: drm: Remove drm_resource wrappers Remove the drm_resource wrappers and directly use the actual PCI and/or platform functions in their place. [airlied: fixup nouveau properly to build] Signed-off-by: Jordan Crouse Reviewed-by: Matt Turner Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_bufs.c | 13 ------------- drivers/gpu/drm/i915/i915_dma.c | 6 +++--- drivers/gpu/drm/mga/mga_dma.c | 4 ++-- drivers/gpu/drm/nouveau/nouveau_bo.c | 2 +- drivers/gpu/drm/nouveau/nouveau_channel.c | 3 ++- drivers/gpu/drm/nouveau/nouveau_mem.c | 16 +++++++++------- drivers/gpu/drm/nouveau/nv20_graph.c | 4 ++-- drivers/gpu/drm/nouveau/nv40_graph.c | 2 +- drivers/gpu/drm/nouveau/nv50_instmem.c | 2 +- drivers/gpu/drm/radeon/evergreen.c | 4 ++-- drivers/gpu/drm/radeon/r100.c | 4 ++-- drivers/gpu/drm/radeon/r600.c | 4 ++-- drivers/gpu/drm/radeon/radeon_bios.c | 2 +- drivers/gpu/drm/radeon/radeon_cp.c | 8 ++++---- drivers/gpu/drm/radeon/radeon_device.c | 4 ++-- drivers/gpu/drm/radeon/rs600.c | 4 ++-- drivers/gpu/drm/radeon/rs690.c | 4 ++-- drivers/gpu/drm/radeon/rv770.c | 4 ++-- drivers/gpu/drm/savage/savage_bci.c | 24 +++++++++++++----------- include/drm/drmP.h | 4 ---- 20 files changed, 53 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c index f7ba82ebf65..7783035871e 100644 --- a/drivers/gpu/drm/drm_bufs.c +++ b/drivers/gpu/drm/drm_bufs.c @@ -39,19 +39,6 @@ #include #include "drmP.h" -resource_size_t drm_get_resource_start(struct drm_device *dev, unsigned int resource) -{ - return pci_resource_start(dev->pdev, resource); -} -EXPORT_SYMBOL(drm_get_resource_start); - -resource_size_t drm_get_resource_len(struct drm_device *dev, unsigned int resource) -{ - return pci_resource_len(dev->pdev, resource); -} - -EXPORT_SYMBOL(drm_get_resource_len); - static struct drm_map_list *drm_find_matching_map(struct drm_device *dev, struct drm_local_map *map) { diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 2a6b5de5ae5..9fe2d08d9e9 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1429,7 +1429,7 @@ static int i915_load_modeset_init(struct drm_device *dev, int fb_bar = IS_I9XX(dev) ? 2 : 0; int ret = 0; - dev->mode_config.fb_base = drm_get_resource_start(dev, fb_bar) & + dev->mode_config.fb_base = pci_resource_start(dev->pdev, fb_bar) & 0xff000000; /* Basic memrange allocator for stolen space (aka vram) */ @@ -1612,8 +1612,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) /* Add register map (needed for suspend/resume) */ mmio_bar = IS_I9XX(dev) ? 0 : 1; - base = drm_get_resource_start(dev, mmio_bar); - size = drm_get_resource_len(dev, mmio_bar); + base = pci_resource_start(dev->pdev, mmio_bar); + size = pci_resource_len(dev->pdev, mmio_bar); if (i915_get_bridge_dev(dev)) { ret = -EIO; diff --git a/drivers/gpu/drm/mga/mga_dma.c b/drivers/gpu/drm/mga/mga_dma.c index 3c917fb3a60..ccc129c328a 100644 --- a/drivers/gpu/drm/mga/mga_dma.c +++ b/drivers/gpu/drm/mga/mga_dma.c @@ -405,8 +405,8 @@ int mga_driver_load(struct drm_device * dev, unsigned long flags) dev_priv->usec_timeout = MGA_DEFAULT_USEC_TIMEOUT; dev_priv->chipset = flags; - dev_priv->mmio_base = drm_get_resource_start(dev, 1); - dev_priv->mmio_size = drm_get_resource_len(dev, 1); + dev_priv->mmio_base = pci_resource_start(dev->pdev, 1); + dev_priv->mmio_size = pci_resource_len(dev->pdev, 1); dev->counters += 3; dev->types[6] = _DRM_STAT_IRQ; diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 6f3c1952237..9f5ab467775 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -783,7 +783,7 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *mem) break; case TTM_PL_VRAM: mem->bus.offset = mem->mm_node->start << PAGE_SHIFT; - mem->bus.base = drm_get_resource_start(dev, 1); + mem->bus.base = pci_resource_start(dev->pdev, 1); mem->bus.is_iomem = true; break; default: diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c b/drivers/gpu/drm/nouveau/nouveau_channel.c index 1fc57ef5829..06555c7cde5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_channel.c +++ b/drivers/gpu/drm/nouveau/nouveau_channel.c @@ -62,7 +62,8 @@ nouveau_channel_pushbuf_ctxdma_init(struct nouveau_channel *chan) * VRAM. */ ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY, - drm_get_resource_start(dev, 1), + pci_resource_start(dev->pdev, + 1), dev_priv->fb_available_size, NV_DMA_ACCESS_RO, NV_DMA_TARGET_PCI, &pushbuf); diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c index 775a7017af6..37c7bf8e829 100644 --- a/drivers/gpu/drm/nouveau/nouveau_mem.c +++ b/drivers/gpu/drm/nouveau/nouveau_mem.c @@ -471,8 +471,9 @@ void nouveau_mem_close(struct drm_device *dev) } if (dev_priv->fb_mtrr) { - drm_mtrr_del(dev_priv->fb_mtrr, drm_get_resource_start(dev, 1), - drm_get_resource_len(dev, 1), DRM_MTRR_WC); + drm_mtrr_del(dev_priv->fb_mtrr, + pci_resource_start(dev->pdev, 1), + pci_resource_len(dev->pdev, 1), DRM_MTRR_WC); dev_priv->fb_mtrr = 0; } } @@ -632,7 +633,7 @@ nouveau_mem_init(struct drm_device *dev) struct ttm_bo_device *bdev = &dev_priv->ttm.bdev; int ret, dma_bits = 32; - dev_priv->fb_phys = drm_get_resource_start(dev, 1); + dev_priv->fb_phys = pci_resource_start(dev->pdev, 1); dev_priv->gart_info.type = NOUVEAU_GART_NONE; if (dev_priv->card_type >= NV_50 && @@ -664,8 +665,9 @@ nouveau_mem_init(struct drm_device *dev) dev_priv->fb_available_size = dev_priv->vram_size; dev_priv->fb_mappable_pages = dev_priv->fb_available_size; - if (dev_priv->fb_mappable_pages > drm_get_resource_len(dev, 1)) - dev_priv->fb_mappable_pages = drm_get_resource_len(dev, 1); + if (dev_priv->fb_mappable_pages > pci_resource_len(dev->pdev, 1)) + dev_priv->fb_mappable_pages = + pci_resource_len(dev->pdev, 1); dev_priv->fb_mappable_pages >>= PAGE_SHIFT; /* remove reserved space at end of vram from available amount */ @@ -717,8 +719,8 @@ nouveau_mem_init(struct drm_device *dev) return ret; } - dev_priv->fb_mtrr = drm_mtrr_add(drm_get_resource_start(dev, 1), - drm_get_resource_len(dev, 1), + dev_priv->fb_mtrr = drm_mtrr_add(pci_resource_start(dev->pdev, 1), + pci_resource_len(dev->pdev, 1), DRM_MTRR_WC); return 0; diff --git a/drivers/gpu/drm/nouveau/nv20_graph.c b/drivers/gpu/drm/nouveau/nv20_graph.c index d6fc0a82f03..fe2349b115f 100644 --- a/drivers/gpu/drm/nouveau/nv20_graph.c +++ b/drivers/gpu/drm/nouveau/nv20_graph.c @@ -616,7 +616,7 @@ nv20_graph_init(struct drm_device *dev) nv_wr32(dev, NV10_PGRAPH_SURFACE, tmp); /* begin RAM config */ - vramsz = drm_get_resource_len(dev, 0) - 1; + vramsz = pci_resource_len(dev->pdev, 0) - 1; nv_wr32(dev, 0x4009A4, nv_rd32(dev, NV04_PFB_CFG0)); nv_wr32(dev, 0x4009A8, nv_rd32(dev, NV04_PFB_CFG1)); nv_wr32(dev, NV10_PGRAPH_RDI_INDEX, 0x00EA0000); @@ -717,7 +717,7 @@ nv30_graph_init(struct drm_device *dev) nv_wr32(dev, 0x0040075c , 0x00000001); /* begin RAM config */ - /* vramsz = drm_get_resource_len(dev, 0) - 1; */ + /* vramsz = pci_resource_len(dev->pdev, 0) - 1; */ nv_wr32(dev, 0x4009A4, nv_rd32(dev, NV04_PFB_CFG0)); nv_wr32(dev, 0x4009A8, nv_rd32(dev, NV04_PFB_CFG1)); if (dev_priv->chipset != 0x34) { diff --git a/drivers/gpu/drm/nouveau/nv40_graph.c b/drivers/gpu/drm/nouveau/nv40_graph.c index 704a25d04ac..65b13b54c5a 100644 --- a/drivers/gpu/drm/nouveau/nv40_graph.c +++ b/drivers/gpu/drm/nouveau/nv40_graph.c @@ -367,7 +367,7 @@ nv40_graph_init(struct drm_device *dev) nv40_graph_set_region_tiling(dev, i, 0, 0, 0); /* begin RAM config */ - vramsz = drm_get_resource_len(dev, 0) - 1; + vramsz = pci_resource_len(dev->pdev, 0) - 1; switch (dev_priv->chipset) { case 0x40: nv_wr32(dev, 0x4009A4, nv_rd32(dev, NV04_PFB_CFG0)); diff --git a/drivers/gpu/drm/nouveau/nv50_instmem.c b/drivers/gpu/drm/nouveau/nv50_instmem.c index 5f21df31f3a..71c01b6e573 100644 --- a/drivers/gpu/drm/nouveau/nv50_instmem.c +++ b/drivers/gpu/drm/nouveau/nv50_instmem.c @@ -241,7 +241,7 @@ nv50_instmem_init(struct drm_device *dev) return ret; BAR0_WI32(priv->fb_bar->gpuobj, 0x00, 0x7fc00000); BAR0_WI32(priv->fb_bar->gpuobj, 0x04, 0x40000000 + - drm_get_resource_len(dev, 1) - 1); + pci_resource_len(dev->pdev, 1) - 1); BAR0_WI32(priv->fb_bar->gpuobj, 0x08, 0x40000000); BAR0_WI32(priv->fb_bar->gpuobj, 0x0c, 0x00000000); BAR0_WI32(priv->fb_bar->gpuobj, 0x10, 0x00000000); diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index 8c8e4d3cbaa..a4745e49ecf 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -1300,8 +1300,8 @@ int evergreen_mc_init(struct radeon_device *rdev) } rdev->mc.vram_width = numchan * chansize; /* Could aper size report 0 ? */ - rdev->mc.aper_base = drm_get_resource_start(rdev->ddev, 0); - rdev->mc.aper_size = drm_get_resource_len(rdev->ddev, 0); + rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); + rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); /* Setup GPU memory space */ /* size in MB on evergreen */ rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE) * 1024 * 1024; diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index cc004b05d63..c485c2cec4d 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -2284,8 +2284,8 @@ void r100_vram_init_sizes(struct radeon_device *rdev) u64 config_aper_size; /* work out accessible VRAM */ - rdev->mc.aper_base = drm_get_resource_start(rdev->ddev, 0); - rdev->mc.aper_size = drm_get_resource_len(rdev->ddev, 0); + rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); + rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); rdev->mc.visible_vram_size = r100_get_accessible_vram(rdev); /* FIXME we don't use the second aperture yet when we could use it */ if (rdev->mc.visible_vram_size > rdev->mc.aper_size) diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 44e96a2ae25..4959619f885 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -1118,8 +1118,8 @@ int r600_mc_init(struct radeon_device *rdev) } rdev->mc.vram_width = numchan * chansize; /* Could aper size report 0 ? */ - rdev->mc.aper_base = drm_get_resource_start(rdev->ddev, 0); - rdev->mc.aper_size = drm_get_resource_len(rdev->ddev, 0); + rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); + rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); /* Setup GPU memory space */ rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE); diff --git a/drivers/gpu/drm/radeon/radeon_bios.c b/drivers/gpu/drm/radeon/radeon_bios.c index fbba938f804..91f5b5a29a9 100644 --- a/drivers/gpu/drm/radeon/radeon_bios.c +++ b/drivers/gpu/drm/radeon/radeon_bios.c @@ -49,7 +49,7 @@ static bool igp_read_bios_from_vram(struct radeon_device *rdev) resource_size_t size = 256 * 1024; /* ??? */ rdev->bios = NULL; - vram_base = drm_get_resource_start(rdev->ddev, 0); + vram_base = pci_resource_start(rdev->pdev, 0); bios = ioremap(vram_base, size); if (!bios) { return false; diff --git a/drivers/gpu/drm/radeon/radeon_cp.c b/drivers/gpu/drm/radeon/radeon_cp.c index 2f042a3c0e6..eb6b9eed734 100644 --- a/drivers/gpu/drm/radeon/radeon_cp.c +++ b/drivers/gpu/drm/radeon/radeon_cp.c @@ -2120,8 +2120,8 @@ int radeon_driver_load(struct drm_device *dev, unsigned long flags) else dev_priv->flags |= RADEON_IS_PCI; - ret = drm_addmap(dev, drm_get_resource_start(dev, 2), - drm_get_resource_len(dev, 2), _DRM_REGISTERS, + ret = drm_addmap(dev, pci_resource_start(dev->pdev, 2), + pci_resource_len(dev->pdev, 2), _DRM_REGISTERS, _DRM_READ_ONLY | _DRM_DRIVER, &dev_priv->mmio); if (ret != 0) return ret; @@ -2194,9 +2194,9 @@ int radeon_driver_firstopen(struct drm_device *dev) dev_priv->gart_info.table_size = RADEON_PCIGART_TABLE_SIZE; - dev_priv->fb_aper_offset = drm_get_resource_start(dev, 0); + dev_priv->fb_aper_offset = pci_resource_start(dev->pdev, 0); ret = drm_addmap(dev, dev_priv->fb_aper_offset, - drm_get_resource_len(dev, 0), _DRM_FRAME_BUFFER, + pci_resource_len(dev->pdev, 0), _DRM_FRAME_BUFFER, _DRM_WRITE_COMBINING, &map); if (ret != 0) return ret; diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index fdc3fdf78ac..2a897a7ca26 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -648,8 +648,8 @@ int radeon_device_init(struct radeon_device *rdev, /* Registers mapping */ /* TODO: block userspace mapping of io register */ - rdev->rmmio_base = drm_get_resource_start(rdev->ddev, 2); - rdev->rmmio_size = drm_get_resource_len(rdev->ddev, 2); + rdev->rmmio_base = pci_resource_start(rdev->pdev, 2); + rdev->rmmio_size = pci_resource_len(rdev->pdev, 2); rdev->rmmio = ioremap(rdev->rmmio_base, rdev->rmmio_size); if (rdev->rmmio == NULL) { return -ENOMEM; diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c index 79887cac5b5..340c7611f2a 100644 --- a/drivers/gpu/drm/radeon/rs600.c +++ b/drivers/gpu/drm/radeon/rs600.c @@ -685,8 +685,8 @@ void rs600_mc_init(struct radeon_device *rdev) { u64 base; - rdev->mc.aper_base = drm_get_resource_start(rdev->ddev, 0); - rdev->mc.aper_size = drm_get_resource_len(rdev->ddev, 0); + rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); + rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); rdev->mc.vram_is_ddr = true; rdev->mc.vram_width = 128; rdev->mc.real_vram_size = RREG32(RADEON_CONFIG_MEMSIZE); diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c index bcc33195ebc..a18ba98885f 100644 --- a/drivers/gpu/drm/radeon/rs690.c +++ b/drivers/gpu/drm/radeon/rs690.c @@ -151,8 +151,8 @@ void rs690_mc_init(struct radeon_device *rdev) rdev->mc.vram_width = 128; rdev->mc.real_vram_size = RREG32(RADEON_CONFIG_MEMSIZE); rdev->mc.mc_vram_size = rdev->mc.real_vram_size; - rdev->mc.aper_base = drm_get_resource_start(rdev->ddev, 0); - rdev->mc.aper_size = drm_get_resource_len(rdev->ddev, 0); + rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); + rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); rdev->mc.visible_vram_size = rdev->mc.aper_size; base = RREG32_MC(R_000100_MCCFG_FB_LOCATION); base = G_000100_MC_FB_START(base) << 16; diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c index 253f24aec03..5c7f0b97c6a 100644 --- a/drivers/gpu/drm/radeon/rv770.c +++ b/drivers/gpu/drm/radeon/rv770.c @@ -908,8 +908,8 @@ int rv770_mc_init(struct radeon_device *rdev) } rdev->mc.vram_width = numchan * chansize; /* Could aper size report 0 ? */ - rdev->mc.aper_base = drm_get_resource_start(rdev->ddev, 0); - rdev->mc.aper_size = drm_get_resource_len(rdev->ddev, 0); + rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); + rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); /* Setup GPU memory space */ rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE); diff --git a/drivers/gpu/drm/savage/savage_bci.c b/drivers/gpu/drm/savage/savage_bci.c index 2d0c9ca484c..f576232846c 100644 --- a/drivers/gpu/drm/savage/savage_bci.c +++ b/drivers/gpu/drm/savage/savage_bci.c @@ -573,13 +573,13 @@ int savage_driver_firstopen(struct drm_device *dev) dev_priv->mtrr[2].handle = -1; if (S3_SAVAGE3D_SERIES(dev_priv->chipset)) { fb_rsrc = 0; - fb_base = drm_get_resource_start(dev, 0); + fb_base = pci_resource_start(dev->pdev, 0); fb_size = SAVAGE_FB_SIZE_S3; mmio_base = fb_base + SAVAGE_FB_SIZE_S3; aper_rsrc = 0; aperture_base = fb_base + SAVAGE_APERTURE_OFFSET; /* this should always be true */ - if (drm_get_resource_len(dev, 0) == 0x08000000) { + if (pci_resource_len(dev->pdev, 0) == 0x08000000) { /* Don't make MMIO write-cobining! We need 3 * MTRRs. */ dev_priv->mtrr[0].base = fb_base; @@ -599,18 +599,19 @@ int savage_driver_firstopen(struct drm_device *dev) dev_priv->mtrr[2].size, DRM_MTRR_WC); } else { DRM_ERROR("strange pci_resource_len %08llx\n", - (unsigned long long)drm_get_resource_len(dev, 0)); + (unsigned long long) + pci_resource_len(dev->pdev, 0)); } } else if (dev_priv->chipset != S3_SUPERSAVAGE && dev_priv->chipset != S3_SAVAGE2000) { - mmio_base = drm_get_resource_start(dev, 0); + mmio_base = pci_resource_start(dev->pdev, 0); fb_rsrc = 1; - fb_base = drm_get_resource_start(dev, 1); + fb_base = pci_resource_start(dev->pdev, 1); fb_size = SAVAGE_FB_SIZE_S4; aper_rsrc = 1; aperture_base = fb_base + SAVAGE_APERTURE_OFFSET; /* this should always be true */ - if (drm_get_resource_len(dev, 1) == 0x08000000) { + if (pci_resource_len(dev->pdev, 1) == 0x08000000) { /* Can use one MTRR to cover both fb and * aperture. */ dev_priv->mtrr[0].base = fb_base; @@ -620,15 +621,16 @@ int savage_driver_firstopen(struct drm_device *dev) dev_priv->mtrr[0].size, DRM_MTRR_WC); } else { DRM_ERROR("strange pci_resource_len %08llx\n", - (unsigned long long)drm_get_resource_len(dev, 1)); + (unsigned long long) + pci_resource_len(dev->pdev, 1)); } } else { - mmio_base = drm_get_resource_start(dev, 0); + mmio_base = pci_resource_start(dev->pdev, 0); fb_rsrc = 1; - fb_base = drm_get_resource_start(dev, 1); - fb_size = drm_get_resource_len(dev, 1); + fb_base = pci_resource_start(dev->pdev, 1); + fb_size = pci_resource_len(dev->pdev, 1); aper_rsrc = 2; - aperture_base = drm_get_resource_start(dev, 2); + aperture_base = pci_resource_start(dev->pdev, 2); /* Automatic MTRR setup will do the right thing. */ } diff --git a/include/drm/drmP.h b/include/drm/drmP.h index c1b987158df..8f7f5cb4a86 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -1273,10 +1273,6 @@ extern int drm_freebufs(struct drm_device *dev, void *data, extern int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_order(unsigned long size); -extern resource_size_t drm_get_resource_start(struct drm_device *dev, - unsigned int resource); -extern resource_size_t drm_get_resource_len(struct drm_device *dev, - unsigned int resource); /* DMA support (drm_dma.h) */ extern int drm_dma_setup(struct drm_device *dev); -- cgit v1.2.3-70-g09d2 From dcdb167402cbdca1d021bdfa5f63995ee0a79317 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Thu, 27 May 2010 13:40:25 -0600 Subject: drm: Add support for platform devices to register as DRM devices Allow platform devices without PCI resources to be DRM devices. [airlied: fixup warnings with dev pointers] Signed-off-by: Jordan Crouse Signed-off-by: Dave Airlie --- drivers/gpu/drm/Kconfig | 4 +- drivers/gpu/drm/Makefile | 2 +- drivers/gpu/drm/drm_drv.c | 37 ++------- drivers/gpu/drm/drm_edid.c | 4 +- drivers/gpu/drm/drm_info.c | 23 ++++-- drivers/gpu/drm/drm_ioctl.c | 71 +++++++++++------ drivers/gpu/drm/drm_irq.c | 15 ++-- drivers/gpu/drm/drm_pci.c | 143 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/drm_platform.c | 122 +++++++++++++++++++++++++++++ drivers/gpu/drm/drm_stub.c | 89 +-------------------- drivers/gpu/drm/drm_sysfs.c | 3 +- drivers/gpu/drm/i915/i915_dma.c | 1 + drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/gpu/drm/nouveau/nouveau_drv.c | 2 +- drivers/gpu/drm/radeon/radeon_drv.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +- include/drm/drmP.h | 52 +++++++++++-- 17 files changed, 402 insertions(+), 172 deletions(-) create mode 100644 drivers/gpu/drm/drm_platform.c (limited to 'include') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 88910e5a2c7..520ab23d8a3 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -6,7 +6,7 @@ # menuconfig DRM tristate "Direct Rendering Manager (XFree86 4.1.0 and higher DRI support)" - depends on (AGP || AGP=n) && PCI && !EMULATED_CMPXCHG && MMU + depends on (AGP || AGP=n) && !EMULATED_CMPXCHG && MMU select I2C select I2C_ALGOBIT select SLOW_WORK @@ -17,7 +17,7 @@ menuconfig DRM These modules provide support for synchronization, security, and DMA transfers. Please see for more details. You should also select and configure AGP - (/dev/agpgart) support. + (/dev/agpgart) support if it is available for your platform. config DRM_KMS_HELPER tristate diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index abe3f446ca4..b4b2b480d0c 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -9,7 +9,7 @@ drm-y := drm_auth.o drm_buffer.o drm_bufs.o drm_cache.o \ drm_drv.o drm_fops.o drm_gem.o drm_ioctl.o drm_irq.o \ drm_lock.o drm_memory.o drm_proc.o drm_stub.o drm_vm.o \ drm_agpsupport.o drm_scatter.o ati_pcigart.o drm_pci.o \ - drm_sysfs.o drm_hashtab.o drm_sman.o drm_mm.o \ + drm_platform.o drm_sysfs.o drm_hashtab.o drm_sman.o drm_mm.o \ drm_crtc.o drm_modes.o drm_edid.o \ drm_info.o drm_debugfs.o drm_encoder_slave.o diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 4a66201edae..510bc87d98f 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -243,47 +243,20 @@ int drm_lastclose(struct drm_device * dev) * * Initializes an array of drm_device structures, and attempts to * initialize all available devices, using consecutive minors, registering the - * stubs and initializing the AGP device. + * stubs and initializing the device. * * Expands the \c DRIVER_PREINIT and \c DRIVER_POST_INIT macros before and * after the initialization for driver customization. */ int drm_init(struct drm_driver *driver) { - struct pci_dev *pdev = NULL; - const struct pci_device_id *pid; - int i; - DRM_DEBUG("\n"); - INIT_LIST_HEAD(&driver->device_list); - if (driver->driver_features & DRIVER_MODESET) - return pci_register_driver(&driver->pci_driver); - - /* If not using KMS, fall back to stealth mode manual scanning. */ - for (i = 0; driver->pci_driver.id_table[i].vendor != 0; i++) { - pid = &driver->pci_driver.id_table[i]; - - /* Loop around setting up a DRM device for each PCI device - * matching our ID and device class. If we had the internal - * function that pci_get_subsys and pci_get_class used, we'd - * be able to just pass pid in instead of doing a two-stage - * thing. - */ - pdev = NULL; - while ((pdev = - pci_get_subsys(pid->vendor, pid->device, pid->subvendor, - pid->subdevice, pdev)) != NULL) { - if ((pdev->class & pid->class_mask) != pid->class) - continue; - - /* stealth mode requires a manual probe */ - pci_dev_get(pdev); - drm_get_dev(pdev, pid, driver); - } - } - return 0; + if (driver->driver_features & DRIVER_USE_PLATFORM_DEVICE) + return drm_platform_init(driver); + else + return drm_pci_init(driver); } EXPORT_SYMBOL(drm_init); diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index c1981861bbb..83d8072066c 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -282,7 +282,7 @@ drm_do_get_edid(struct drm_connector *connector, struct i2c_adapter *adapter) return block; carp: - dev_warn(&connector->dev->pdev->dev, "%s: EDID block %d invalid.\n", + dev_warn(connector->dev->dev, "%s: EDID block %d invalid.\n", drm_get_connector_name(connector), j); out: @@ -1626,7 +1626,7 @@ int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid) return 0; } if (!drm_edid_is_valid(edid)) { - dev_warn(&connector->dev->pdev->dev, "%s: EDID invalid.\n", + dev_warn(connector->dev->dev, "%s: EDID invalid.\n", drm_get_connector_name(connector)); return 0; } diff --git a/drivers/gpu/drm/drm_info.c b/drivers/gpu/drm/drm_info.c index f0f6c6b93f3..2ef2c782724 100644 --- a/drivers/gpu/drm/drm_info.c +++ b/drivers/gpu/drm/drm_info.c @@ -51,13 +51,24 @@ int drm_name_info(struct seq_file *m, void *data) if (!master) return 0; - if (master->unique) { - seq_printf(m, "%s %s %s\n", - dev->driver->pci_driver.name, - pci_name(dev->pdev), master->unique); + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) { + if (master->unique) { + seq_printf(m, "%s %s %s\n", + dev->driver->platform_device->name, + dev_name(dev->dev), master->unique); + } else { + seq_printf(m, "%s\n", + dev->driver->platform_device->name); + } } else { - seq_printf(m, "%s %s\n", dev->driver->pci_driver.name, - pci_name(dev->pdev)); + if (master->unique) { + seq_printf(m, "%s %s %s\n", + dev->driver->pci_driver.name, + dev_name(dev->dev), master->unique); + } else { + seq_printf(m, "%s %s\n", dev->driver->pci_driver.name, + dev_name(dev->dev)); + } } return 0; diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 9b9ff46c237..76d3d18056d 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -132,32 +132,57 @@ static int drm_set_busid(struct drm_device *dev, struct drm_file *file_priv) struct drm_master *master = file_priv->master; int len; - if (master->unique != NULL) - return -EBUSY; - - master->unique_len = 40; - master->unique_size = master->unique_len; - master->unique = kmalloc(master->unique_size, GFP_KERNEL); - if (master->unique == NULL) - return -ENOMEM; + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) { + master->unique_len = 10 + strlen(dev->platformdev->name); + master->unique = kmalloc(master->unique_len + 1, GFP_KERNEL); + + if (master->unique == NULL) + return -ENOMEM; + + len = snprintf(master->unique, master->unique_len, + "platform:%s", dev->platformdev->name); + + if (len > master->unique_len) + DRM_ERROR("Unique buffer overflowed\n"); + + dev->devname = + kmalloc(strlen(dev->platformdev->name) + + master->unique_len + 2, GFP_KERNEL); + + if (dev->devname == NULL) + return -ENOMEM; + + sprintf(dev->devname, "%s@%s", dev->platformdev->name, + master->unique); + + } else { + master->unique_len = 40; + master->unique_size = master->unique_len; + master->unique = kmalloc(master->unique_size, GFP_KERNEL); + if (master->unique == NULL) + return -ENOMEM; + + len = snprintf(master->unique, master->unique_len, + "pci:%04x:%02x:%02x.%d", + drm_get_pci_domain(dev), + dev->pdev->bus->number, + PCI_SLOT(dev->pdev->devfn), + PCI_FUNC(dev->pdev->devfn)); + if (len >= master->unique_len) + DRM_ERROR("buffer overflow"); + else + master->unique_len = len; - len = snprintf(master->unique, master->unique_len, "pci:%04x:%02x:%02x.%d", - drm_get_pci_domain(dev), - dev->pdev->bus->number, - PCI_SLOT(dev->pdev->devfn), - PCI_FUNC(dev->pdev->devfn)); - if (len >= master->unique_len) - DRM_ERROR("buffer overflow"); - else - master->unique_len = len; + dev->devname = + kmalloc(strlen(dev->driver->pci_driver.name) + + master->unique_len + 2, GFP_KERNEL); - dev->devname = kmalloc(strlen(dev->driver->pci_driver.name) + - master->unique_len + 2, GFP_KERNEL); - if (dev->devname == NULL) - return -ENOMEM; + if (dev->devname == NULL) + return -ENOMEM; - sprintf(dev->devname, "%s@%s", dev->driver->pci_driver.name, - master->unique); + sprintf(dev->devname, "%s@%s", dev->driver->pci_driver.name, + master->unique); + } return 0; } diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index a263b7070fc..6353b625e09 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -57,6 +57,9 @@ int drm_irq_by_busid(struct drm_device *dev, void *data, { struct drm_irq_busid *p = data; + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) + return -EINVAL; + if (!drm_core_check_feature(dev, DRIVER_HAVE_IRQ)) return -EINVAL; @@ -211,7 +214,7 @@ int drm_irq_install(struct drm_device *dev) if (!drm_core_check_feature(dev, DRIVER_HAVE_IRQ)) return -EINVAL; - if (dev->pdev->irq == 0) + if (drm_dev_to_irq(dev) == 0) return -EINVAL; mutex_lock(&dev->struct_mutex); @@ -229,7 +232,7 @@ int drm_irq_install(struct drm_device *dev) dev->irq_enabled = 1; mutex_unlock(&dev->struct_mutex); - DRM_DEBUG("irq=%d\n", dev->pdev->irq); + DRM_DEBUG("irq=%d\n", drm_dev_to_irq(dev)); /* Before installing handler */ dev->driver->irq_preinstall(dev); @@ -302,14 +305,14 @@ int drm_irq_uninstall(struct drm_device * dev) if (!irq_enabled) return -EINVAL; - DRM_DEBUG("irq=%d\n", dev->pdev->irq); + DRM_DEBUG("irq=%d\n", drm_dev_to_irq(dev)); if (!drm_core_check_feature(dev, DRIVER_MODESET)) vga_client_register(dev->pdev, NULL, NULL, NULL); dev->driver->irq_uninstall(dev); - free_irq(dev->pdev->irq, dev); + free_irq(drm_dev_to_irq(dev), dev); return 0; } @@ -341,7 +344,7 @@ int drm_control(struct drm_device *dev, void *data, if (drm_core_check_feature(dev, DRIVER_MODESET)) return 0; if (dev->if_version < DRM_IF_VERSION(1, 2) && - ctl->irq != dev->pdev->irq) + ctl->irq != drm_dev_to_irq(dev)) return -EINVAL; return drm_irq_install(dev); case DRM_UNINST_HANDLER: @@ -651,7 +654,7 @@ int drm_wait_vblank(struct drm_device *dev, void *data, int ret = 0; unsigned int flags, seq, crtc; - if ((!dev->pdev->irq) || (!dev->irq_enabled)) + if ((!drm_dev_to_irq(dev)) || (!dev->irq_enabled)) return -EINVAL; if (vblwait->request.type & _DRM_VBLANK_SIGNAL) diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c index 2ea9ad4a8d6..e20f78b542a 100644 --- a/drivers/gpu/drm/drm_pci.c +++ b/drivers/gpu/drm/drm_pci.c @@ -124,4 +124,147 @@ void drm_pci_free(struct drm_device * dev, drm_dma_handle_t * dmah) EXPORT_SYMBOL(drm_pci_free); +#ifdef CONFIG_PCI +/** + * Register. + * + * \param pdev - PCI device structure + * \param ent entry from the PCI ID table with device type flags + * \return zero on success or a negative number on failure. + * + * Attempt to gets inter module "drm" information. If we are first + * then register the character device and inter module information. + * Try and register, if we fail to register, backout previous work. + */ +int drm_get_pci_dev(struct pci_dev *pdev, const struct pci_device_id *ent, + struct drm_driver *driver) +{ + struct drm_device *dev; + int ret; + + DRM_DEBUG("\n"); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + ret = pci_enable_device(pdev); + if (ret) + goto err_g1; + + pci_set_master(pdev); + + dev->pdev = pdev; + dev->dev = &pdev->dev; + + dev->pci_device = pdev->device; + dev->pci_vendor = pdev->vendor; + +#ifdef __alpha__ + dev->hose = pdev->sysdata; +#endif + + if ((ret = drm_fill_in_dev(dev, ent, driver))) { + printk(KERN_ERR "DRM: Fill_in_dev failed.\n"); + goto err_g2; + } + + if (drm_core_check_feature(dev, DRIVER_MODESET)) { + pci_set_drvdata(pdev, dev); + ret = drm_get_minor(dev, &dev->control, DRM_MINOR_CONTROL); + if (ret) + goto err_g2; + } + + if ((ret = drm_get_minor(dev, &dev->primary, DRM_MINOR_LEGACY))) + goto err_g3; + + if (dev->driver->load) { + ret = dev->driver->load(dev, ent->driver_data); + if (ret) + goto err_g4; + } + + /* setup the grouping for the legacy output */ + if (drm_core_check_feature(dev, DRIVER_MODESET)) { + ret = drm_mode_group_init_legacy_group(dev, + &dev->primary->mode_group); + if (ret) + goto err_g4; + } + + list_add_tail(&dev->driver_item, &driver->device_list); + + DRM_INFO("Initialized %s %d.%d.%d %s for %s on minor %d\n", + driver->name, driver->major, driver->minor, driver->patchlevel, + driver->date, pci_name(pdev), dev->primary->index); + + return 0; + +err_g4: + drm_put_minor(&dev->primary); +err_g3: + if (drm_core_check_feature(dev, DRIVER_MODESET)) + drm_put_minor(&dev->control); +err_g2: + pci_disable_device(pdev); +err_g1: + kfree(dev); + return ret; +} +EXPORT_SYMBOL(drm_get_pci_dev); + +/** + * PCI device initialization. Called via drm_init at module load time, + * + * \return zero on success or a negative number on failure. + * + * Initializes a drm_device structures,registering the + * stubs and initializing the AGP device. + * + * Expands the \c DRIVER_PREINIT and \c DRIVER_POST_INIT macros before and + * after the initialization for driver customization. + */ +int drm_pci_init(struct drm_driver *driver) +{ + struct pci_dev *pdev = NULL; + const struct pci_device_id *pid; + int i; + + if (driver->driver_features & DRIVER_MODESET) + return pci_register_driver(&driver->pci_driver); + + /* If not using KMS, fall back to stealth mode manual scanning. */ + for (i = 0; driver->pci_driver.id_table[i].vendor != 0; i++) { + pid = &driver->pci_driver.id_table[i]; + + /* Loop around setting up a DRM device for each PCI device + * matching our ID and device class. If we had the internal + * function that pci_get_subsys and pci_get_class used, we'd + * be able to just pass pid in instead of doing a two-stage + * thing. + */ + pdev = NULL; + while ((pdev = + pci_get_subsys(pid->vendor, pid->device, pid->subvendor, + pid->subdevice, pdev)) != NULL) { + if ((pdev->class & pid->class_mask) != pid->class) + continue; + + /* stealth mode requires a manual probe */ + pci_dev_get(pdev); + drm_get_pci_dev(pdev, pid, driver); + } + } + return 0; +} + +#else + +int drm_pci_init(struct drm_driver *driver) +{ + return -1; +} + +#endif /*@}*/ diff --git a/drivers/gpu/drm/drm_platform.c b/drivers/gpu/drm/drm_platform.c new file mode 100644 index 00000000000..460e9a3afa8 --- /dev/null +++ b/drivers/gpu/drm/drm_platform.c @@ -0,0 +1,122 @@ +/* + * Derived from drm_pci.c + * + * Copyright 2003 José Fonseca. + * Copyright 2003 Leif Delgass. + * Copyright (c) 2009, Code Aurora Forum. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "drmP.h" + +/** + * Register. + * + * \param platdev - Platform device struture + * \return zero on success or a negative number on failure. + * + * Attempt to gets inter module "drm" information. If we are first + * then register the character device and inter module information. + * Try and register, if we fail to register, backout previous work. + */ + +int drm_get_platform_dev(struct platform_device *platdev, + struct drm_driver *driver) +{ + struct drm_device *dev; + int ret; + + DRM_DEBUG("\n"); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->platformdev = platdev; + dev->dev = &platdev->dev; + + ret = drm_fill_in_dev(dev, NULL, driver); + + if (ret) { + printk(KERN_ERR "DRM: Fill_in_dev failed.\n"); + goto err_g1; + } + + if (drm_core_check_feature(dev, DRIVER_MODESET)) { + dev_set_drvdata(&platdev->dev, dev); + ret = drm_get_minor(dev, &dev->control, DRM_MINOR_CONTROL); + if (ret) + goto err_g1; + } + + ret = drm_get_minor(dev, &dev->primary, DRM_MINOR_LEGACY); + if (ret) + goto err_g2; + + if (dev->driver->load) { + ret = dev->driver->load(dev, 0); + if (ret) + goto err_g3; + } + + /* setup the grouping for the legacy output */ + if (drm_core_check_feature(dev, DRIVER_MODESET)) { + ret = drm_mode_group_init_legacy_group(dev, + &dev->primary->mode_group); + if (ret) + goto err_g3; + } + + list_add_tail(&dev->driver_item, &driver->device_list); + + DRM_INFO("Initialized %s %d.%d.%d %s on minor %d\n", + driver->name, driver->major, driver->minor, driver->patchlevel, + driver->date, dev->primary->index); + + return 0; + +err_g3: + drm_put_minor(&dev->primary); +err_g2: + if (drm_core_check_feature(dev, DRIVER_MODESET)) + drm_put_minor(&dev->control); +err_g1: + kfree(dev); + return ret; +} +EXPORT_SYMBOL(drm_get_platform_dev); + +/** + * Platform device initialization. Called via drm_init at module load time, + * + * \return zero on success or a negative number on failure. + * + * Initializes a drm_device structures,registering the + * stubs + * + * Expands the \c DRIVER_PREINIT and \c DRIVER_POST_INIT macros before and + * after the initialization for driver customization. + */ + +int drm_platform_init(struct drm_driver *driver) +{ + return drm_get_platform_dev(driver->platform_device, driver); +} diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c index a0c365f2e52..63575e2fa88 100644 --- a/drivers/gpu/drm/drm_stub.c +++ b/drivers/gpu/drm/drm_stub.c @@ -224,7 +224,7 @@ int drm_dropmaster_ioctl(struct drm_device *dev, void *data, return 0; } -static int drm_fill_in_dev(struct drm_device * dev, struct pci_dev *pdev, +int drm_fill_in_dev(struct drm_device *dev, const struct pci_device_id *ent, struct drm_driver *driver) { @@ -245,14 +245,6 @@ static int drm_fill_in_dev(struct drm_device * dev, struct pci_dev *pdev, idr_init(&dev->drw_idr); - dev->pdev = pdev; - dev->pci_device = pdev->device; - dev->pci_vendor = pdev->vendor; - -#ifdef __alpha__ - dev->hose = pdev->sysdata; -#endif - if (drm_ht_create(&dev->map_hash, 12)) { return -ENOMEM; } @@ -321,7 +313,7 @@ static int drm_fill_in_dev(struct drm_device * dev, struct pci_dev *pdev, * create the proc init entry via proc_init(). This routines assigns * minor numbers to secondary heads of multi-headed cards */ -static int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type) +int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type) { struct drm_minor *new_minor; int ret; @@ -387,83 +379,6 @@ err_idr: return ret; } -/** - * Register. - * - * \param pdev - PCI device structure - * \param ent entry from the PCI ID table with device type flags - * \return zero on success or a negative number on failure. - * - * Attempt to gets inter module "drm" information. If we are first - * then register the character device and inter module information. - * Try and register, if we fail to register, backout previous work. - */ -int drm_get_dev(struct pci_dev *pdev, const struct pci_device_id *ent, - struct drm_driver *driver) -{ - struct drm_device *dev; - int ret; - - DRM_DEBUG("\n"); - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - ret = pci_enable_device(pdev); - if (ret) - goto err_g1; - - pci_set_master(pdev); - if ((ret = drm_fill_in_dev(dev, pdev, ent, driver))) { - printk(KERN_ERR "DRM: Fill_in_dev failed.\n"); - goto err_g2; - } - - if (drm_core_check_feature(dev, DRIVER_MODESET)) { - pci_set_drvdata(pdev, dev); - ret = drm_get_minor(dev, &dev->control, DRM_MINOR_CONTROL); - if (ret) - goto err_g2; - } - - if ((ret = drm_get_minor(dev, &dev->primary, DRM_MINOR_LEGACY))) - goto err_g3; - - if (dev->driver->load) { - ret = dev->driver->load(dev, ent->driver_data); - if (ret) - goto err_g4; - } - - /* setup the grouping for the legacy output */ - if (drm_core_check_feature(dev, DRIVER_MODESET)) { - ret = drm_mode_group_init_legacy_group(dev, &dev->primary->mode_group); - if (ret) - goto err_g4; - } - - list_add_tail(&dev->driver_item, &driver->device_list); - - DRM_INFO("Initialized %s %d.%d.%d %s for %s on minor %d\n", - driver->name, driver->major, driver->minor, driver->patchlevel, - driver->date, pci_name(pdev), dev->primary->index); - - return 0; - -err_g4: - drm_put_minor(&dev->primary); -err_g3: - if (drm_core_check_feature(dev, DRIVER_MODESET)) - drm_put_minor(&dev->control); -err_g2: - pci_disable_device(pdev); -err_g1: - kfree(dev); - return ret; -} -EXPORT_SYMBOL(drm_get_dev); - /** * Put a secondary minor number. * diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c index 3a3a451d0bf..14d9d829ef2 100644 --- a/drivers/gpu/drm/drm_sysfs.c +++ b/drivers/gpu/drm/drm_sysfs.c @@ -488,7 +488,8 @@ int drm_sysfs_device_add(struct drm_minor *minor) int err; char *minor_str; - minor->kdev.parent = &minor->dev->pdev->dev; + minor->kdev.parent = minor->dev->dev; + minor->kdev.class = drm_class; minor->kdev.release = drm_sysfs_device_release; minor->kdev.devt = minor->device; diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 9fe2d08d9e9..9bed5617e0e 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -34,6 +34,7 @@ #include "i915_drm.h" #include "i915_drv.h" #include "i915_trace.h" +#include #include #include #include diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 5c51e45ab68..b7aecf5ea1f 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -435,7 +435,7 @@ int i965_reset(struct drm_device *dev, u8 flags) static int __devinit i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - return drm_get_dev(pdev, ent, &driver); + return drm_get_pci_dev(pdev, ent, &driver); } static void diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c index c6079e36669..f60a2b2ae44 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.c +++ b/drivers/gpu/drm/nouveau/nouveau_drv.c @@ -132,7 +132,7 @@ static struct drm_driver driver; static int __devinit nouveau_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - return drm_get_dev(pdev, ent, &driver); + return drm_get_pci_dev(pdev, ent, &driver); } static void diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 4afba1eca2a..683e281b409 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -236,7 +236,7 @@ static struct drm_driver kms_driver; static int __devinit radeon_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - return drm_get_dev(pdev, ent, &kms_driver); + return drm_get_pci_dev(pdev, ent, &kms_driver); } static void diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 0c9c0811f42..f7f248dbff5 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -758,7 +758,7 @@ static struct drm_driver driver = { static int vmw_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - return drm_get_dev(pdev, ent, &driver); + return drm_get_pci_dev(pdev, ent, &driver); } static int __init vmwgfx_init(void) diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 8f7f5cb4a86..6235169d595 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -9,6 +9,7 @@ /* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -48,6 +49,7 @@ #include #include #include +#include #include #include #include /* For (un)lock_kernel */ @@ -144,6 +146,7 @@ extern void drm_ut_debug_printk(unsigned int request_level, #define DRIVER_IRQ_VBL2 0x800 #define DRIVER_GEM 0x1000 #define DRIVER_MODESET 0x2000 +#define DRIVER_USE_PLATFORM_DEVICE 0x4000 /***********************************************************************/ /** \name Begin the DRM... */ @@ -823,6 +826,7 @@ struct drm_driver { int num_ioctls; struct file_operations fops; struct pci_driver pci_driver; + struct platform_device *platform_device; /* List of devices hanging off this driver */ struct list_head device_list; }; @@ -1015,12 +1019,16 @@ struct drm_device { struct drm_agp_head *agp; /**< AGP data */ + struct device *dev; /**< Device structure */ struct pci_dev *pdev; /**< PCI device structure */ int pci_vendor; /**< PCI vendor id */ int pci_device; /**< PCI device id */ #ifdef __alpha__ struct pci_controller *hose; #endif + + struct platform_device *platformdev; /**< Platform device struture */ + struct drm_sg_mem *sg; /**< Scatter gather memory */ int num_crtcs; /**< Number of CRTCs on this device */ void *dev_private; /**< device private data */ @@ -1060,17 +1068,21 @@ struct drm_device { }; -static inline int drm_dev_to_irq(struct drm_device *dev) -{ - return dev->pdev->irq; -} - static __inline__ int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 1 : 0); } + +static inline int drm_dev_to_irq(struct drm_device *dev) +{ + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) + return platform_get_irq(dev->platformdev, 0); + else + return dev->pdev->irq; +} + #ifdef __alpha__ #define drm_get_pci_domain(dev) dev->hose->index #else @@ -1347,8 +1359,11 @@ extern int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_master *drm_master_create(struct drm_minor *minor); extern struct drm_master *drm_master_get(struct drm_master *master); extern void drm_master_put(struct drm_master **master); -extern int drm_get_dev(struct pci_dev *pdev, const struct pci_device_id *ent, - struct drm_driver *driver); +extern int drm_get_pci_dev(struct pci_dev *pdev, + const struct pci_device_id *ent, + struct drm_driver *driver); +extern int drm_get_platform_dev(struct platform_device *pdev, + struct drm_driver *driver); extern void drm_put_dev(struct drm_device *dev); extern int drm_put_minor(struct drm_minor **minor); extern unsigned int drm_debug; @@ -1525,6 +1540,9 @@ static __inline__ struct drm_local_map *drm_core_findmap(struct drm_device *dev, static __inline__ int drm_device_is_agp(struct drm_device *dev) { + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) + return 0; + if (dev->driver->device_is_agp != NULL) { int err = (*dev->driver->device_is_agp) (dev); @@ -1538,7 +1556,10 @@ static __inline__ int drm_device_is_agp(struct drm_device *dev) static __inline__ int drm_device_is_pcie(struct drm_device *dev) { - return pci_find_capability(dev->pdev, PCI_CAP_ID_EXP); + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) + return 0; + else + return pci_find_capability(dev->pdev, PCI_CAP_ID_EXP); } static __inline__ void drm_core_dropmap(struct drm_local_map *map) @@ -1546,6 +1567,21 @@ static __inline__ void drm_core_dropmap(struct drm_local_map *map) } #include "drm_mem_util.h" + +static inline void *drm_get_device(struct drm_device *dev) +{ + if (drm_core_check_feature(dev, DRIVER_USE_PLATFORM_DEVICE)) + return dev->platformdev; + else + return dev->pdev; +} + +extern int drm_platform_init(struct drm_driver *driver); +extern int drm_pci_init(struct drm_driver *driver); +extern int drm_fill_in_dev(struct drm_device *dev, + const struct pci_device_id *ent, + struct drm_driver *driver); +int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type); /*@}*/ #endif /* __KERNEL__ */ -- cgit v1.2.3-70-g09d2 From bc135b23d01acf7ee926aaf75b0020c86d3869f9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jun 2010 03:23:51 -0700 Subject: net: Define accessors to manipulate QDISC_STATE_RUNNING Define three helpers to manipulate QDISC_STATE_RUNNIG flag, that a second patch will move on another location. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 2 +- include/net/sch_generic.h | 15 +++++++++++++++ net/core/dev.c | 4 ++-- net/sched/sch_generic.c | 4 ++-- 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9d4d87cc970..d9549af6929 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -95,7 +95,7 @@ extern void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { - if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) + if (qdisc_run_begin(q)) __qdisc_run(q); } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 03ca5d82675..9707daed761 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -76,6 +76,21 @@ struct Qdisc { struct rcu_head rcu_head; }; +static inline bool qdisc_is_running(struct Qdisc *qdisc) +{ + return test_bit(__QDISC_STATE_RUNNING, &qdisc->state); +} + +static inline bool qdisc_run_begin(struct Qdisc *qdisc) +{ + return !test_and_set_bit(__QDISC_STATE_RUNNING, &qdisc->state); +} + +static inline void qdisc_run_end(struct Qdisc *qdisc) +{ + clear_bit(__QDISC_STATE_RUNNING, &qdisc->state); +} + struct Qdisc_class_ops { /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); diff --git a/net/core/dev.c b/net/core/dev.c index 983a3c1d65c..2733226d90b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2047,7 +2047,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, kfree_skb(skb); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) { + qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - @@ -2059,7 +2059,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (sch_direct_xmit(skb, q, dev, txq, root_lock)) __qdisc_run(q); else - clear_bit(__QDISC_STATE_RUNNING, &q->state); + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index bd1892fe4b2..37b86eab677 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -205,7 +205,7 @@ void __qdisc_run(struct Qdisc *q) } } - clear_bit(__QDISC_STATE_RUNNING, &q->state); + qdisc_run_end(q); } unsigned long dev_trans_start(struct net_device *dev) @@ -797,7 +797,7 @@ static bool some_qdisc_is_busy(struct net_device *dev) spin_lock_bh(root_lock); - val = (test_bit(__QDISC_STATE_RUNNING, &q->state) || + val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); -- cgit v1.2.3-70-g09d2 From 371121057607e3127e19b3fa094330181b5b031e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jun 2010 03:24:13 -0700 Subject: net: QDISC_STATE_RUNNING dont need atomic bit ops __QDISC_STATE_RUNNING is always changed while qdisc lock is held. We can avoid two atomic operations in xmit path, if we move this bit in a new __state container. Location of this __state container is carefully chosen so that fast path only dirties one qdisc cache line. THROTTLED bit could later be moved into this __state location too, to avoid dirtying first qdisc cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9707daed761..b3591e4a514 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -23,11 +23,17 @@ struct qdisc_rate_table { }; enum qdisc_state_t { - __QDISC_STATE_RUNNING, __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, }; +/* + * following bits are only changed while qdisc lock is held + */ +enum qdisc___state_t { + __QDISC___STATE_RUNNING, +}; + struct qdisc_size_table { struct list_head list; struct tc_sizespec szopts; @@ -72,23 +78,24 @@ struct Qdisc { unsigned long state; struct sk_buff_head q; struct gnet_stats_basic_packed bstats; + unsigned long __state; struct gnet_stats_queue qstats; struct rcu_head rcu_head; }; static inline bool qdisc_is_running(struct Qdisc *qdisc) { - return test_bit(__QDISC_STATE_RUNNING, &qdisc->state); + return test_bit(__QDISC___STATE_RUNNING, &qdisc->__state); } static inline bool qdisc_run_begin(struct Qdisc *qdisc) { - return !test_and_set_bit(__QDISC_STATE_RUNNING, &qdisc->state); + return !__test_and_set_bit(__QDISC___STATE_RUNNING, &qdisc->__state); } static inline void qdisc_run_end(struct Qdisc *qdisc) { - clear_bit(__QDISC_STATE_RUNNING, &qdisc->state); + __clear_bit(__QDISC___STATE_RUNNING, &qdisc->__state); } struct Qdisc_class_ops { -- cgit v1.2.3-70-g09d2 From 79640a4ca6955e3ebdb7038508fa7a0cd7fa5527 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jun 2010 05:09:29 -0700 Subject: net: add additional lock to qdisc to increase throughput When many cpus compete for sending frames on a given qdisc, the qdisc spinlock suffers from very high contention. The cpu owning __QDISC_STATE_RUNNING bit has same priority to acquire the lock, and cannot dequeue packets fast enough, since it must wait for this lock for each dequeued packet. One solution to this problem is to force all cpus spinning on a second lock before trying to get the main lock, when/if they see __QDISC_STATE_RUNNING already set. The owning cpu then compete with at most one other cpu for the main lock, allowing for higher dequeueing rate. Based on a previous patch from Alexander Duyck. I added the heuristic to avoid the atomic in fast path, and put the new lock far away from the cache line used by the dequeue worker. Also try to release the busylock lock as late as possible. Tests with following script gave a boost from ~50.000 pps to ~600.000 pps on a dual quad core machine (E5450 @3.00GHz), tg3 driver. (A single netperf flow can reach ~800.000 pps on this platform) for j in `seq 0 3`; do for i in `seq 0 7`; do netperf -H 192.168.0.1 -t UDP_STREAM -l 60 -N -T $i -- -m 6 & done done Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- net/core/dev.c | 29 +++++++++++++++++++++++++---- net/sched/sch_generic.c | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b3591e4a514..b35301b0c7b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -80,7 +80,8 @@ struct Qdisc { struct gnet_stats_basic_packed bstats; unsigned long __state; struct gnet_stats_queue qstats; - struct rcu_head rcu_head; + struct rcu_head rcu_head; + spinlock_t busylock; }; static inline bool qdisc_is_running(struct Qdisc *qdisc) diff --git a/net/core/dev.c b/net/core/dev.c index 2733226d90b..ffca5c1066f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2040,8 +2040,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); + bool contended = qdisc_is_running(q); int rc; + /* + * Heuristic to force contended enqueues to serialize on a + * separate lock before trying to get qdisc main lock. + * This permits __QDISC_STATE_RUNNING owner to get the lock more often + * and dequeue packets faster. + */ + if (unlikely(contended)) + spin_lock(&q->busylock); + spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { kfree_skb(skb); @@ -2056,19 +2066,30 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) skb_dst_force(skb); __qdisc_update_bstats(q, skb->len); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } __qdisc_run(q); - else + } else qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { skb_dst_force(skb); rc = qdisc_enqueue_root(skb, q); - qdisc_run(q); + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } } spin_unlock(root_lock); - + if (unlikely(contended)) + spin_unlock(&q->busylock); return rc; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 37b86eab677..d20fcd2a551 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -561,6 +561,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); + spin_lock_init(&sch->busylock); sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; -- cgit v1.2.3-70-g09d2 From c2d9ba9bce8d7323ca96f239e1f505c14d6244fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Jun 2010 06:51:19 +0000 Subject: net: CONFIG_NET_NS reduction Use read_pnet() and write_pnet() to reduce number of ifdef CONFIG_NET_NS Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +----- include/net/cfg80211.h | 15 ++------------- include/net/genetlink.h | 15 ++------------- include/net/netfilter/nf_conntrack.h | 6 +----- include/net/sock.h | 10 ++-------- net/ipv6/addrlabel.c | 6 +----- net/netfilter/nf_conntrack_core.c | 8 ++------ 7 files changed, 11 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a24916156f4..bd6b75317d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1087,11 +1087,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, static inline struct net *dev_net(const struct net_device *dev) { -#ifdef CONFIG_NET_NS - return dev->nd_net; -#else - return &init_net; -#endif + return read_pnet(&dev->nd_net); } static inline diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b44a2e5321a..e7ebeb8bdf7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1330,26 +1330,15 @@ struct wiphy { char priv[0] __attribute__((__aligned__(NETDEV_ALIGN))); }; -#ifdef CONFIG_NET_NS -static inline struct net *wiphy_net(struct wiphy *wiphy) -{ - return wiphy->_net; -} - -static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net) -{ - wiphy->_net = net; -} -#else static inline struct net *wiphy_net(struct wiphy *wiphy) { - return &init_net; + return read_pnet(&wiphy->_net); } static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net) { + write_pnet(&wiphy->_net, net); } -#endif /** * wiphy_priv - return priv from wiphy diff --git a/include/net/genetlink.h b/include/net/genetlink.h index eb551baafc0..f7dcd2c7041 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -68,26 +68,15 @@ struct genl_info { #endif }; -#ifdef CONFIG_NET_NS static inline struct net *genl_info_net(struct genl_info *info) { - return info->_net; + return read_pnet(&info->_net); } static inline void genl_info_net_set(struct genl_info *info, struct net *net) { - info->_net = net; + write_pnet(&info->_net, net); } -#else -static inline struct net *genl_info_net(struct genl_info *info) -{ - return &init_net; -} - -static inline void genl_info_net_set(struct genl_info *info, struct net *net) -{ -} -#endif /** * struct genl_ops - generic netlink operations diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index bde095f7e84..bbfdd945308 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -152,11 +152,7 @@ extern struct net init_net; static inline struct net *nf_ct_net(const struct nf_conn *ct) { -#ifdef CONFIG_NET_NS - return ct->ct_net; -#else - return &init_net; -#endif + return read_pnet(&ct->ct_net); } /* Alter reply tuple (maybe alter helper). */ diff --git a/include/net/sock.h b/include/net/sock.h index ca241ea1487..3461e5d1e9a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1724,19 +1724,13 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_e static inline struct net *sock_net(const struct sock *sk) { -#ifdef CONFIG_NET_NS - return sk->sk_net; -#else - return &init_net; -#endif + return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { -#ifdef CONFIG_NET_NS - sk->sk_net = net; -#endif + write_pnet(&sk->sk_net, net); } /* diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 8c4348cb195..f0e774cea38 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -53,11 +53,7 @@ static struct ip6addrlbl_table static inline struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) { -#ifdef CONFIG_NET_NS - return lbl->lbl_net; -#else - return &init_net; -#endif + return read_pnet(&lbl->lbl_net); } /* diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index eeeb8bc7398..77288980fae 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -619,9 +619,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL; /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); -#ifdef CONFIG_NET_NS - ct->ct_net = net; -#endif + write_pnet(&ct->ct_net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES if (zone) { struct nf_conntrack_zone *nf_ct_zone; @@ -1363,9 +1361,7 @@ static int nf_conntrack_init_init_net(void) goto err_extend; #endif /* Set up fake conntrack: to never be deleted, not in any hashes */ -#ifdef CONFIG_NET_NS - nf_conntrack_untracked.ct_net = &init_net; -#endif + write_pnet(&nf_conntrack_untracked.ct_net, &init_net); atomic_set(&nf_conntrack_untracked.ct_general.use, 1); /* - and look it like as a confirmed connection */ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); -- cgit v1.2.3-70-g09d2 From 614f60fa9d73a9e8fdff3df83381907fea7c5649 Mon Sep 17 00:00:00 2001 From: Scott McMillan Date: Wed, 2 Jun 2010 05:53:56 -0700 Subject: packet_mmap: expose hw packet timestamps to network packet capture utilities This patch adds a setting, PACKET_TIMESTAMP, to specify the packet timestamp source that is exported to capture utilities like tcpdump by packet_mmap. PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE and SOF_TIMESTAMPING_RAW_HARDWARE values are currently recognized by PACKET_TIMESTAMP. SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set. If PACKET_TIMESTAMP is not set, a software timestamp generated inside the networking stack is used (the behavior before this setting was added). Signed-off-by: Scott McMillan Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 26 ++++++++++++++++++++++ include/linux/if_packet.h | 1 + net/packet/af_packet.c | 37 ++++++++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 98f71a5cef0..2546aa4dc23 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -493,6 +493,32 @@ The user can also use poll() to check if a buffer is available: pfd.events = POLLOUT; retval = poll(&pfd, 1, timeout); +------------------------------------------------------------------------------- ++ PACKET_TIMESTAMP +------------------------------------------------------------------------------- + +The PACKET_TIMESTAMP setting determines the source of the timestamp in +the packet meta information. If your NIC is capable of timestamping +packets in hardware, you can request those hardware timestamps to used. +Note: you may need to enable the generation of hardware timestamps with +SIOCSHWTSTAMP. + +PACKET_TIMESTAMP accepts the same integer bit field as +SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE +and SOF_TIMESTAMPING_RAW_HARDWARE values are recognized by +PACKET_TIMESTAMP. SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over +SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set. + + int req = 0; + req |= SOF_TIMESTAMPING_SYS_HARDWARE; + setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) + +If PACKET_TIMESTAMP is not set, a software timestamp generated inside +the networking stack is used (the behavior before this setting was added). + +See include/linux/net_tstamp.h and Documentation/networking/timestamping +for more information on hardware timestamps. + -------------------------------------------------------------------------------- + THANKS -------------------------------------------------------------------------------- diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h index 6ac23ef1801..72bfa5a034d 100644 --- a/include/linux/if_packet.h +++ b/include/linux/if_packet.h @@ -48,6 +48,7 @@ struct sockaddr_ll { #define PACKET_LOSS 14 #define PACKET_VNET_HDR 15 #define PACKET_TX_TIMESTAMP 16 +#define PACKET_TIMESTAMP 17 struct tpacket_stats { unsigned int tp_packets; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2078a277e06..9a17f28b125 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -83,6 +83,7 @@ #include #include #include +#include #ifdef CONFIG_INET #include @@ -202,6 +203,7 @@ struct packet_sock { unsigned int tp_hdrlen; unsigned int tp_reserve; unsigned int tp_loss:1; + unsigned int tp_tstamp; struct packet_type prot_hook ____cacheline_aligned_in_smp; }; @@ -656,6 +658,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct sk_buff *copy_skb = NULL; struct timeval tv; struct timespec ts; + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -737,7 +740,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; - if (skb->tstamp.tv64) + if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) + && shhwtstamps->syststamp.tv64) + tv = ktime_to_timeval(shhwtstamps->syststamp); + else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) + && shhwtstamps->hwtstamp.tv64) + tv = ktime_to_timeval(shhwtstamps->hwtstamp); + else if (skb->tstamp.tv64) tv = ktime_to_timeval(skb->tstamp); else do_gettimeofday(&tv); @@ -750,7 +759,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; - if (skb->tstamp.tv64) + if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) + && shhwtstamps->syststamp.tv64) + ts = ktime_to_timespec(shhwtstamps->syststamp); + else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) + && shhwtstamps->hwtstamp.tv64) + ts = ktime_to_timespec(shhwtstamps->hwtstamp); + else if (skb->tstamp.tv64) ts = ktime_to_timespec(skb->tstamp); else getnstimeofday(&ts); @@ -2027,6 +2042,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->has_vnet_hdr = !!val; return 0; } + case PACKET_TIMESTAMP: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->tp_tstamp = val; + return 0; + } default: return -ENOPROTOOPT; } @@ -2119,6 +2146,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = po->tp_loss; data = &val; break; + case PACKET_TIMESTAMP: + if (len > sizeof(int)) + len = sizeof(int); + val = po->tp_tstamp; + data = &val; + break; default: return -ENOPROTOOPT; } -- cgit v1.2.3-70-g09d2 From 20c59de2e6b6bc74bbf714dcd4e720afe8d516cf Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Tue, 1 Jun 2010 21:35:01 +0000 Subject: ipv6: Refactor update of IPv6 flowi destination address for srcrt (RH) option There are more than a dozen occurrences of following code in the IPv6 stack: if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; ipv6_addr_copy(&final, &fl.fl6_dst); ipv6_addr_copy(&fl.fl6_dst, rt0->addr); final_p = &final; } Replace those with a helper. Note that the helper overrides final_p in all cases. This is ok as final_p was previously initialized to NULL when declared. Signed-off-by: Arnaud Ebalard Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++++ net/dccp/ipv6.c | 30 ++++++------------------------ net/ipv6/af_inet6.c | 9 ++------- net/ipv6/datagram.c | 18 ++++-------------- net/ipv6/exthdrs.c | 24 ++++++++++++++++++++++++ net/ipv6/inet6_connection_sock.c | 9 ++------- net/ipv6/raw.c | 10 ++-------- net/ipv6/syncookies.c | 9 ++------- net/ipv6/tcp_ipv6.c | 27 ++++++--------------------- net/ipv6/udp.c | 11 +++-------- 10 files changed, 55 insertions(+), 96 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2600b69757b..f5808d596aa 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -551,6 +551,10 @@ extern int ipv6_ext_hdr(u8 nexthdr); extern int ipv6_find_tlv(struct sk_buff *skb, int offset, int type); +extern struct in6_addr *fl6_update_dst(struct flowi *fl, + const struct ipv6_txoptions *opt, + struct in6_addr *orig); + /* * socket options (ipv6_sockglue.c) */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 09169889959..6e3f32575df 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -248,7 +248,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ipv6_txoptions *opt = NULL; - struct in6_addr *final_p = NULL, final; + struct in6_addr *final_p, final; struct flowi fl; int err = -1; struct dst_entry *dst; @@ -265,13 +265,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, opt = np->opt; - if (opt != NULL && opt->srcrt != NULL) { - const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt; - - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, opt, &final); err = ip6_dst_lookup(sk, &dst, &fl); if (err) @@ -545,19 +539,13 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, goto out_overflow; if (dst == NULL) { - struct in6_addr *final_p = NULL, final; + struct in6_addr *final_p, final; struct flowi fl; memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_DCCP; ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); - if (opt != NULL && opt->srcrt != NULL) { - const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt; - - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, opt, &final); ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); fl.oif = sk->sk_bound_dev_if; fl.fl_ip_dport = inet_rsk(req)->rmt_port; @@ -885,7 +873,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct in6_addr *saddr = NULL, *final_p, final; struct flowi fl; struct dst_entry *dst; int addr_type; @@ -988,13 +976,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl.fl_ip_sport = inet->inet_sport; security_sk_classify_flow(sk, &fl); - if (np->opt != NULL && np->opt->srcrt != NULL) { - const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, np->opt, &final); err = ip6_dst_lookup(sk, &dst, &fl); if (err) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e733942dafe..94b1b9c954b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -651,7 +651,7 @@ int inet6_sk_rebuild_header(struct sock *sk) if (dst == NULL) { struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; + struct in6_addr *final_p, final; struct flowi fl; memset(&fl, 0, sizeof(fl)); @@ -665,12 +665,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl.fl_ip_sport = inet->inet_sport; security_sk_classify_flow(sk, &fl); - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, np->opt, &final); err = ip6_dst_lookup(sk, &dst, &fl); if (err) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 712684687c9..7d929a22cbc 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -38,10 +38,11 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr, *final_p = NULL, final; + struct in6_addr *daddr, *final_p, final; struct dst_entry *dst; struct flowi fl; struct ip6_flowlabel *flowlabel = NULL; + struct ipv6_txoptions *opt; int addr_type; int err; @@ -155,19 +156,8 @@ ipv4_connected: security_sk_classify_flow(sk, &fl); - if (flowlabel) { - if (flowlabel->opt && flowlabel->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - } else if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + opt = flowlabel ? flowlabel->opt : np->opt; + final_p = fl6_update_dst(&fl, opt, &final); err = ip6_dst_lookup(sk, &dst, &fl); if (err) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 8a659f92d17..853a633a94d 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -874,3 +874,27 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, return opt; } +/** + * fl6_update_dst - update flowi destination address with info given + * by srcrt option, if any. + * + * @fl: flowi for which fl6_dst is to be updated + * @opt: struct ipv6_txoptions in which to look for srcrt opt + * @orig: copy of original fl6_dst address if modified + * + * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * and initial value of fl->fl6_dst set in orig + */ +struct in6_addr *fl6_update_dst(struct flowi *fl, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) +{ + if (!opt || !opt->srcrt) + return NULL; + + ipv6_addr_copy(orig, &fl->fl6_dst); + ipv6_addr_copy(&fl->fl6_dst, ((struct rt0_hdr *)opt->srcrt)->addr); + return orig; +} + +EXPORT_SYMBOL_GPL(fl6_update_dst); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 0c5e3c3b7fd..8a1628023bd 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -185,7 +185,7 @@ int inet6_csk_xmit(struct sk_buff *skb) struct ipv6_pinfo *np = inet6_sk(sk); struct flowi fl; struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; + struct in6_addr *final_p, final; memset(&fl, 0, sizeof(fl)); fl.proto = sk->sk_protocol; @@ -199,12 +199,7 @@ int inet6_csk_xmit(struct sk_buff *skb) fl.fl_ip_dport = inet->inet_dport; security_sk_classify_flow(sk, &fl); - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, np->opt, &final); dst = __inet6_csk_dst_check(sk, np->dst_cookie); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4a4dcbe4f8b..864eb8e03b1 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -725,7 +725,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, { struct ipv6_txoptions opt_space; struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct in6_addr *daddr, *final_p = NULL, final; + struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); @@ -847,13 +847,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) ipv6_addr_copy(&fl.fl6_src, &np->saddr); - /* merge ip6_build_xmit from ip6_output */ - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, opt, &final); if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) fl.oif = np->mcast_oif; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 34d1f0690d7..12383705dba 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -240,17 +240,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) * me if there is a preferred way. */ { - struct in6_addr *final_p = NULL, final; + struct in6_addr *final_p, final; struct flowi fl; memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, np->opt, &final); ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); fl.oif = sk->sk_bound_dev_if; fl.mark = sk->sk_mark; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2b7c3a100e2..e487080d02d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -129,7 +129,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct in6_addr *saddr = NULL, *final_p, final; struct flowi fl; struct dst_entry *dst; int addr_type; @@ -250,12 +250,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl.fl_ip_dport = usin->sin6_port; fl.fl_ip_sport = inet->inet_sport; - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, np->opt, &final); security_sk_classify_flow(sk, &fl); @@ -477,7 +472,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; struct ipv6_txoptions *opt = NULL; - struct in6_addr * final_p = NULL, final; + struct in6_addr * final_p, final; struct flowi fl; struct dst_entry *dst; int err = -1; @@ -494,12 +489,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, security_req_classify_flow(req, &fl); opt = np->opt; - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, opt, &final); err = ip6_dst_lookup(sk, &dst, &fl); if (err) @@ -1392,18 +1382,13 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto out_overflow; if (dst == NULL) { - struct in6_addr *final_p = NULL, final; + struct in6_addr *final_p, final; struct flowi fl; memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl, opt, &final); ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr); fl.oif = sk->sk_bound_dev_if; fl.mark = sk->sk_mark; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3048f906c04..4aea57dec75 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -929,7 +929,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct in6_addr *daddr, *final_p = NULL, final; + struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi fl; @@ -1099,14 +1099,9 @@ do_udp_sendmsg: ipv6_addr_copy(&fl.fl6_src, &np->saddr); fl.fl_ip_sport = inet->inet_sport; - /* merge ip6_build_xmit from ip6_output */ - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; + final_p = fl6_update_dst(&fl, opt, &final); + if (final_p) connected = 0; - } if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) { fl.oif = np->mcast_oif; -- cgit v1.2.3-70-g09d2 From ab95bfe01f9872459c8678572ccadbf646badad0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 1 Jun 2010 21:52:08 +0000 Subject: net: replace hooks in __netif_receive_skb V5 What this patch does is it removes two receive frame hooks (for bridge and for macvlan) from __netif_receive_skb. These are replaced them with a single hook for both. It only supports one hook per device because it makes no sense to do bridging and macvlan on the same device. Then a network driver (of virtual netdev like macvlan or bridge) can register an rx_handler for needed net device. Signed-off-by: Jiri Pirko Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 19 +++++--- include/linux/if_bridge.h | 2 - include/linux/if_macvlan.h | 4 -- include/linux/netdevice.h | 7 +++ net/bridge/br.c | 2 - net/bridge/br_if.c | 8 +++ net/bridge/br_input.c | 12 +++-- net/bridge/br_private.h | 3 +- net/core/dev.c | 119 +++++++++++++++++++++------------------------ 9 files changed, 93 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 87e8d4cb405..53422ce26f7 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -145,15 +145,16 @@ static void macvlan_broadcast(struct sk_buff *skb, } /* called under rcu_read_lock() from netif_receive_skb */ -static struct sk_buff *macvlan_handle_frame(struct macvlan_port *port, - struct sk_buff *skb) +static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) { + struct macvlan_port *port; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len; + port = rcu_dereference(skb->dev->macvlan_port); if (is_multicast_ether_addr(eth->h_dest)) { src = macvlan_hash_lookup(port, eth->h_source); if (!src) @@ -515,6 +516,7 @@ static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; + int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; @@ -528,13 +530,21 @@ static int macvlan_port_create(struct net_device *dev) for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); rcu_assign_pointer(dev->macvlan_port, port); - return 0; + + err = netdev_rx_handler_register(dev, macvlan_handle_frame); + if (err) { + rcu_assign_pointer(dev->macvlan_port, NULL); + kfree(port); + } + + return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = dev->macvlan_port; + netdev_rx_handler_unregister(dev); rcu_assign_pointer(dev->macvlan_port, NULL); synchronize_rcu(); kfree(port); @@ -767,14 +777,12 @@ static int __init macvlan_init_module(void) int err; register_netdevice_notifier(&macvlan_notifier_block); - macvlan_handle_frame_hook = macvlan_handle_frame; err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; err1: - macvlan_handle_frame_hook = NULL; unregister_netdevice_notifier(&macvlan_notifier_block); return err; } @@ -782,7 +790,6 @@ err1: static void __exit macvlan_cleanup_module(void) { rtnl_link_unregister(&macvlan_link_ops); - macvlan_handle_frame_hook = NULL; unregister_netdevice_notifier(&macvlan_notifier_block); } diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 938b7e81df9..0d241a5c490 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -102,8 +102,6 @@ struct __fdb_entry { #include extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); -extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, - struct sk_buff *skb); extern int (*br_should_route_hook)(struct sk_buff *skb); #endif diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9ea047aca79..c26a0e4f0ce 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -84,8 +84,4 @@ extern int macvlan_link_register(struct rtnl_link_ops *ops); extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev); - -extern struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *, - struct sk_buff *); - #endif /* _LINUX_IF_MACVLAN_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bd6b75317d5..5156b806924 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -381,6 +381,8 @@ enum gro_result { }; typedef enum gro_result gro_result_t; +typedef struct sk_buff *rx_handler_func_t(struct sk_buff *skb); + extern void __napi_schedule(struct napi_struct *n); static inline int napi_disable_pending(struct napi_struct *n) @@ -957,6 +959,7 @@ struct net_device { #endif struct netdev_queue rx_queue; + rx_handler_func_t *rx_handler; struct netdev_queue *_tx ____cacheline_aligned_in_smp; @@ -1689,6 +1692,10 @@ static inline void napi_free_frags(struct napi_struct *napi) napi->skb = NULL; } +extern int netdev_rx_handler_register(struct net_device *dev, + rx_handler_func_t *rx_handler); +extern void netdev_rx_handler_unregister(struct net_device *dev); + extern void netif_nit_deliver(struct sk_buff *skb); extern int dev_valid_name(const char *name); extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *); diff --git a/net/bridge/br.c b/net/bridge/br.c index 76357b54775..c8436fa3134 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -63,7 +63,6 @@ static int __init br_init(void) goto err_out4; brioctl_set(br_ioctl_deviceless_stub); - br_handle_frame_hook = br_handle_frame; #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) br_fdb_test_addr_hook = br_fdb_test_addr; @@ -100,7 +99,6 @@ static void __exit br_deinit(void) br_fdb_test_addr_hook = NULL; #endif - br_handle_frame_hook = NULL; br_fdb_fini(); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 18b245e2c00..d9242342837 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -147,6 +147,7 @@ static void del_nbp(struct net_bridge_port *p) list_del_rcu(&p->list); + netdev_rx_handler_unregister(dev); rcu_assign_pointer(dev->br_port, NULL); br_multicast_del_port(p); @@ -429,6 +430,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) goto err2; rcu_assign_pointer(dev->br_port, p); + + err = netdev_rx_handler_register(dev, br_handle_frame); + if (err) + goto err3; + dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -451,6 +457,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) br_netpoll_enable(br, dev); return 0; +err3: + rcu_assign_pointer(dev->br_port, NULL); err2: br_fdb_delete_by_port(br, p, 1); err1: diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index d36e700f7a2..99647d8f95c 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -131,15 +131,19 @@ static inline int is_link_local(const unsigned char *dest) } /* - * Called via br_handle_frame_hook. * Return NULL if skb is handled - * note: already called with rcu_read_lock (preempt_disabled) + * note: already called with rcu_read_lock (preempt_disabled) from + * netif_receive_skb */ -struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) +struct sk_buff *br_handle_frame(struct sk_buff *skb) { + struct net_bridge_port *p; const unsigned char *dest = eth_hdr(skb)->h_dest; int (*rhook)(struct sk_buff *skb); + if (skb->pkt_type == PACKET_LOOPBACK) + return skb; + if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto drop; @@ -147,6 +151,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) if (!skb) return NULL; + p = rcu_dereference(skb->dev->br_port); + if (unlikely(is_link_local(dest))) { /* Pause frames shouldn't be passed up by driver anyway */ if (skb->protocol == htons(ETH_P_PAUSE)) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0f4a74bc6a9..c83519b555b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -331,8 +331,7 @@ extern void br_features_recompute(struct net_bridge *br); /* br_input.c */ extern int br_handle_frame_finish(struct sk_buff *skb); -extern struct sk_buff *br_handle_frame(struct net_bridge_port *p, - struct sk_buff *skb); +extern struct sk_buff *br_handle_frame(struct sk_buff *skb); /* br_ioctl.c */ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); diff --git a/net/core/dev.c b/net/core/dev.c index ffca5c1066f..ec01a5998d7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2604,70 +2604,14 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } -#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) - -#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ + (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -/* - * If bridge module is loaded call bridging hook. - * returns NULL if packet was consumed. - */ -struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, - struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL_GPL(br_handle_frame_hook); - -static inline struct sk_buff *handle_bridge(struct sk_buff *skb, - struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) -{ - struct net_bridge_port *port; - - if (skb->pkt_type == PACKET_LOOPBACK || - (port = rcu_dereference(skb->dev->br_port)) == NULL) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - return br_handle_frame_hook(port, skb); -} -#else -#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) -#endif - -#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) -struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p, - struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); - -static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, - struct net_device *orig_dev) -{ - struct macvlan_port *port; - - port = rcu_dereference(skb->dev->macvlan_port); - if (!port) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - return macvlan_handle_frame_hook(port, skb); -} -#else -#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) -#endif - #ifdef CONFIG_NET_CLS_ACT /* TODO: Maybe we should just force sch_ingress to be compiled in * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions @@ -2763,6 +2707,47 @@ void netif_nit_deliver(struct sk_buff *skb) rcu_read_unlock(); } +/** + * netdev_rx_handler_register - register receive handler + * @dev: device to register a handler for + * @rx_handler: receive handler to register + * + * Register a receive hander for a device. This handler will then be + * called from __netif_receive_skb. A negative errno code is returned + * on a failure. + * + * The caller must hold the rtnl_mutex. + */ +int netdev_rx_handler_register(struct net_device *dev, + rx_handler_func_t *rx_handler) +{ + ASSERT_RTNL(); + + if (dev->rx_handler) + return -EBUSY; + + rcu_assign_pointer(dev->rx_handler, rx_handler); + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_register); + +/** + * netdev_rx_handler_unregister - unregister receive handler + * @dev: device to unregister a handler from + * + * Unregister a receive hander from a device. + * + * The caller must hold the rtnl_mutex. + */ +void netdev_rx_handler_unregister(struct net_device *dev) +{ + + ASSERT_RTNL(); + rcu_assign_pointer(dev->rx_handler, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, struct net_device *master) { @@ -2815,6 +2800,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop); static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; + rx_handler_func_t *rx_handler; struct net_device *orig_dev; struct net_device *master; struct net_device *null_or_orig; @@ -2877,12 +2863,17 @@ static int __netif_receive_skb(struct sk_buff *skb) ncls: #endif - skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; - skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; + /* Handle special case of bridge or macvlan */ + rx_handler = rcu_dereference(skb->dev->rx_handler); + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + skb = rx_handler(skb); + if (!skb) + goto out; + } /* * Make sure frames received on VLAN interfaces stacked on -- cgit v1.2.3-70-g09d2 From 77c2061d10a408d0220c2b0e7faefe52d9c41008 Mon Sep 17 00:00:00 2001 From: Walter Goldens Date: Tue, 18 May 2010 04:44:54 -0700 Subject: wireless: fix several minor description typos Signed-off-by: Walter Goldens Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath5k/eeprom.c | 2 +- drivers/net/wireless/ath/ath9k/hw.h | 2 +- drivers/net/wireless/iwmc3200wifi/hal.c | 2 +- drivers/net/wireless/libertas/scan.c | 2 +- drivers/net/wireless/orinoco/hermes_dld.c | 2 +- drivers/net/wireless/rt2x00/rt2x00dev.c | 2 +- drivers/net/wireless/zd1211rw/zd_mac.c | 2 +- drivers/net/wireless/zd1211rw/zd_usb.c | 2 +- include/linux/nl80211.h | 2 +- include/net/cfg80211.h | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/status.c | 2 +- net/mac80211/work.c | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath5k/eeprom.c b/drivers/net/wireless/ath/ath5k/eeprom.c index ed0263672d6..8490348379a 100644 --- a/drivers/net/wireless/ath/ath5k/eeprom.c +++ b/drivers/net/wireless/ath/ath5k/eeprom.c @@ -715,7 +715,7 @@ ath5k_eeprom_convert_pcal_info_5111(struct ath5k_hw *ah, int mode, /* Only one curve for RF5111 * find out which one and place - * in in pd_curves. + * in pd_curves. * Note: ee_x_gain is reversed here */ for (idx = 0; idx < AR5K_EEPROM_N_PD_CURVES; idx++) { diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h index 5cf0714f069..ffc9249b02c 100644 --- a/drivers/net/wireless/ath/ath9k/hw.h +++ b/drivers/net/wireless/ath/ath9k/hw.h @@ -461,7 +461,7 @@ struct ath9k_hw_version { #define AR_GENTMR_BIT(_index) (1 << (_index)) /* - * Using de Bruijin sequence to to look up 1's index in a 32 bit number + * Using de Bruijin sequence to look up 1's index in a 32 bit number * debruijn32 = 0000 0111 0111 1100 1011 0101 0011 0001 */ #define debruijn32 0x077CB531U diff --git a/drivers/net/wireless/iwmc3200wifi/hal.c b/drivers/net/wireless/iwmc3200wifi/hal.c index 9531b18cf72..907ac890997 100644 --- a/drivers/net/wireless/iwmc3200wifi/hal.c +++ b/drivers/net/wireless/iwmc3200wifi/hal.c @@ -54,7 +54,7 @@ * LMAC. If you look at LMAC commands you'll se that they * are actually regular iwlwifi target commands encapsulated * into a special UMAC command called UMAC passthrough. - * This is due to the fact the the host talks exclusively + * This is due to the fact the host talks exclusively * to the UMAC and so there needs to be a special UMAC * command for talking to the LMAC. * This is how a wifi command is layed out: diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c index 24cd54b3a80..7d82f13bdf1 100644 --- a/drivers/net/wireless/libertas/scan.c +++ b/drivers/net/wireless/libertas/scan.c @@ -666,7 +666,7 @@ void lbs_scan_worker(struct work_struct *work) /** * @brief Interpret a BSS scan response returned from the firmware * - * Parse the various fixed fields and IEs passed back for a a BSS probe + * Parse the various fixed fields and IEs passed back for a BSS probe * response or beacon from the scan command. Record information as needed * in the scan table struct bss_descriptor for that entry. * diff --git a/drivers/net/wireless/orinoco/hermes_dld.c b/drivers/net/wireless/orinoco/hermes_dld.c index 6da85e75fce..f750f49bbd4 100644 --- a/drivers/net/wireless/orinoco/hermes_dld.c +++ b/drivers/net/wireless/orinoco/hermes_dld.c @@ -68,7 +68,7 @@ struct dblock { } __attribute__ ((packed)); /* - * Plug Data References are located in in the image after the last data + * Plug Data References are located in the image after the last data * block. They refer to areas in the adapter memory where the plug data * items with matching ID should be written. */ diff --git a/drivers/net/wireless/rt2x00/rt2x00dev.c b/drivers/net/wireless/rt2x00/rt2x00dev.c index 3ae468c4d76..2ed32e02a06 100644 --- a/drivers/net/wireless/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/rt2x00/rt2x00dev.c @@ -224,7 +224,7 @@ void rt2x00lib_txdone(struct queue_entry *entry, /* * If the IV/EIV data was stripped from the frame before it was * passed to the hardware, we should now reinsert it again because - * mac80211 will expect the the same data to be present it the + * mac80211 will expect the same data to be present it the * frame as it was passed to us. */ if (test_bit(CONFIG_SUPPORT_HW_CRYPTO, &rt2x00dev->flags)) diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c index b0b666019a9..163a8a06b22 100644 --- a/drivers/net/wireless/zd1211rw/zd_mac.c +++ b/drivers/net/wireless/zd1211rw/zd_mac.c @@ -855,7 +855,7 @@ int zd_mac_rx(struct ieee80211_hw *hw, const u8 *buffer, unsigned int length) if (skb == NULL) return -ENOMEM; if (need_padding) { - /* Make sure the the payload data is 4 byte aligned. */ + /* Make sure the payload data is 4 byte aligned. */ skb_reserve(skb, 2); } diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c index c257940b71b..818e1480ca9 100644 --- a/drivers/net/wireless/zd1211rw/zd_usb.c +++ b/drivers/net/wireless/zd1211rw/zd_usb.c @@ -844,7 +844,7 @@ out: * @usb: a &struct zd_usb pointer * @urb: URB to be freed * - * Frees the the transmission URB, which means to put it on the free URB + * Frees the transmission URB, which means to put it on the free URB * list. */ static void free_tx_urb(struct zd_usb *usb, struct urb *urb) diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index b7c77f9712f..64fb32b93a2 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -132,7 +132,7 @@ * %NL80211_ATTR_REG_RULE_POWER_MAX_ANT_GAIN and * %NL80211_ATTR_REG_RULE_POWER_MAX_EIRP. * @NL80211_CMD_REQ_SET_REG: ask the wireless core to set the regulatory domain - * to the the specified ISO/IEC 3166-1 alpha2 country code. The core will + * to the specified ISO/IEC 3166-1 alpha2 country code. The core will * store this as a valid request and then query userspace for it. * * @NL80211_CMD_GET_MESH_PARAMS: Get mesh networking properties for the diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b44a2e5321a..049e507d2f8 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -89,7 +89,7 @@ enum ieee80211_channel_flags { * @max_power: maximum transmission power (in dBm) * @beacon_found: helper to regulatory code to indicate when a beacon * has been found on this channel. Use regulatory_hint_found_beacon() - * to enable this, this is is useful only on 5 GHz band. + * to enable this, this is useful only on 5 GHz band. * @orig_mag: internal use * @orig_mpwr: internal use */ diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0839c4e8fd2..31e3386b8d4 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1763,7 +1763,7 @@ static void ieee80211_sta_work(struct work_struct *work) /* * ieee80211_queue_work() should have picked up most cases, - * here we'll pick the the rest. + * here we'll pick the rest. */ if (WARN(local->suspended, "STA MLME work scheduled while " "going to suspend\n")) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 94613af009f..34da67995d9 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -47,7 +47,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, - * we can definitely not rely on the the control information + * we can definitely not rely on the control information * being correct. Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. diff --git a/net/mac80211/work.c b/net/mac80211/work.c index be3d4a69869..4157717ed78 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -840,7 +840,7 @@ static void ieee80211_work_work(struct work_struct *work) /* * ieee80211_queue_work() should have picked up most cases, - * here we'll pick the the rest. + * here we'll pick the rest. */ if (WARN(local->suspended, "work scheduled while going to suspend\n")) return; -- cgit v1.2.3-70-g09d2 From 252aa631f88080920a7083ac5a5844ffc5463629 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 19 May 2010 12:17:12 +0200 Subject: cfg80211: make action channel type optional When sending action frames, we want to verify that we do that on the correct channel. However, checking the channel type in addition can get in the way, since the channel type could change on the fly during an association, and it's not useful to have the channel type anyway since it has no effect on the transmission. Therefore, make it optional to specify so that if wanted, it can still be checked, but is not required. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/cfg80211.h | 1 + net/mac80211/cfg.c | 4 +++- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 9 ++++++--- net/wireless/core.h | 1 + net/wireless/mlme.c | 3 ++- net/wireless/nl80211.c | 3 +++ 7 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 049e507d2f8..0c3c214772e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1168,6 +1168,7 @@ struct cfg80211_ops { int (*action)(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type, + bool channel_type_valid, const u8 *buf, size_t len, u64 *cookie); int (*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c7000a6ca37..f8c49c5ad8a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1554,10 +1554,12 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type, + bool channel_type_valid, const u8 *buf, size_t len, u64 *cookie) { return ieee80211_mgd_action(IEEE80211_DEV_TO_SUB_IF(dev), chan, - channel_type, buf, len, cookie); + channel_type, channel_type_valid, + buf, len, cookie); } struct cfg80211_ops mac80211_config_ops = { diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 1a9e2da37a9..d4677efd3a3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -988,6 +988,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, int ieee80211_mgd_action(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type, + bool channel_type_valid, const u8 *buf, size_t len, u64 *cookie); ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 31e3386b8d4..29c3a75a7ad 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2308,6 +2308,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, int ieee80211_mgd_action(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type, + bool channel_type_valid, const u8 *buf, size_t len, u64 *cookie) { struct ieee80211_local *local = sdata->local; @@ -2315,9 +2316,11 @@ int ieee80211_mgd_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; /* Check that we are on the requested channel for transmission */ - if ((chan != local->tmp_channel || - channel_type != local->tmp_channel_type) && - (chan != local->oper_channel || + if (chan != local->tmp_channel && + chan != local->oper_channel) + return -EBUSY; + if (channel_type_valid && + (channel_type != local->tmp_channel_type && channel_type != local->_oper_channel_type)) return -EBUSY; diff --git a/net/wireless/core.h b/net/wireless/core.h index ae930acf75e..63d57ae399c 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -339,6 +339,7 @@ int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type, + bool channel_type_valid, const u8 *buf, size_t len, u64 *cookie); /* SME */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 48ead6f0426..f69ae19f497 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -827,6 +827,7 @@ int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type, + bool channel_type_valid, const u8 *buf, size_t len, u64 *cookie) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -855,7 +856,7 @@ int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, /* Transmit the Action frame as requested by user space */ return rdev->ops->action(&rdev->wiphy, dev, chan, channel_type, - buf, len, cookie); + channel_type_valid, buf, len, cookie); } bool cfg80211_rx_action(struct net_device *dev, int freq, const u8 *buf, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index db71150b804..90ab3c8519b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4681,6 +4681,7 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) struct net_device *dev; struct ieee80211_channel *chan; enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + bool channel_type_valid = false; u32 freq; int err; void *hdr; @@ -4722,6 +4723,7 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) err = -EINVAL; goto out; } + channel_type_valid = true; } freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); @@ -4745,6 +4747,7 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) goto free_msg; } err = cfg80211_mlme_action(rdev, dev, chan, channel_type, + channel_type_valid, nla_data(info->attrs[NL80211_ATTR_FRAME]), nla_len(info->attrs[NL80211_ATTR_FRAME]), &cookie); -- cgit v1.2.3-70-g09d2 From b5f7e7554753e2cc3ef3bef0271fdb32027df2ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jun 2010 12:05:27 +0000 Subject: ipv4: add LINUX_MIB_IPRPFILTER snmp counter Christoph Lameter mentioned that packets could be dropped in input path because of rp_filter settings, without any SNMP counter being incremented. System administrator can have a hard time to track the problem. This patch introduces a new counter, LINUX_MIB_IPRPFILTER, incremented each time we drop a packet because Reverse Path Filter triggers. (We receive an IPv4 datagram on a given interface, and find the route to send an answer would use another interface) netstat -s | grep IPReversePathFilter IPReversePathFilter: 21714 Reported-by: Christoph Lameter Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/snmp.h | 1 + net/ipv4/fib_frontend.c | 6 ++++-- net/ipv4/ip_input.c | 3 +++ net/ipv4/proc.c | 1 + net/ipv4/route.c | 31 ++++++++++++++++++------------- 5 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 52797714ade..ebb0c80ffd6 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -229,6 +229,7 @@ enum LINUX_MIB_TCPBACKLOGDROP, LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */ LINUX_MIB_TCPDEFERACCEPTDROP, + LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4f0ed458c88..e830f7a123b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -284,7 +284,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (no_addr) goto last_resort; if (rpf == 1) - goto e_inval; + goto e_rpf; fl.oif = dev->ifindex; ret = 0; @@ -299,7 +299,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, last_resort: if (rpf) - goto e_inval; + goto e_rpf; *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; @@ -308,6 +308,8 @@ e_inval_res: fib_res_put(&res); e_inval: return -EINVAL; +e_rpf: + return -EXDEV; } static inline __be32 sk_extract_addr(struct sockaddr *addr) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d930dc5e4d8..d52c9da644c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,6 +340,9 @@ static int ip_rcv_finish(struct sk_buff *skb) else if (err == -ENETUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); goto drop; } } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3dc9914c1dc..e320ca6b3ef 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), + SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8495bceec76..d377b45005f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1851,6 +1851,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, __be32 spec_dst; struct in_device *in_dev = in_dev_get(dev); u32 itag = 0; + int err; /* Primary sanity checks. */ @@ -1865,10 +1866,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (!ipv4_is_local_multicast(daddr)) goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag, 0) < 0) - goto e_inval; - + } else { + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, + &itag, 0); + if (err < 0) + goto e_err; + } rth = dst_alloc(&ipv4_dst_ops); if (!rth) goto e_nobufs; @@ -1920,8 +1923,10 @@ e_nobufs: return -ENOBUFS; e_inval: + err = -EINVAL; +e_err: in_dev_put(in_dev); - return -EINVAL; + return err; } @@ -1985,7 +1990,6 @@ static int __mkroute_input(struct sk_buff *skb, ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); - err = -EINVAL; goto cleanup; } @@ -2157,13 +2161,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; if (res.type == RTN_LOCAL) { - int result; - result = fib_validate_source(saddr, daddr, tos, + err = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, dev, &spec_dst, &itag, skb->mark); - if (result < 0) - goto martian_source; - if (result) + if (err < 0) + goto martian_source_keep_err; + if (err) flags |= RTCF_DIRECTSRC; spec_dst = daddr; goto local_input; @@ -2191,7 +2194,7 @@ brd_input: err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag, skb->mark); if (err < 0) - goto martian_source; + goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; } @@ -2272,8 +2275,10 @@ e_nobufs: goto done; martian_source: + err = -EINVAL; +martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto e_inval; + goto done; } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- cgit v1.2.3-70-g09d2 From bc10502dba37d3b210efd9f3867212298f13b78e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Jun 2010 03:21:52 -0700 Subject: net: use __packed annotation cleanup patch. Use new __packed annotation in net/ and include/ (except netfilter) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/if_ether.h | 2 +- include/linux/if_fddi.h | 8 ++--- include/linux/if_frad.h | 2 +- include/linux/if_hippi.h | 8 ++--- include/linux/if_pppox.h | 8 ++--- include/linux/ipv6.h | 4 +-- include/linux/isdnif.h | 2 +- include/linux/mISDNif.h | 2 +- include/linux/nbd.h | 2 +- include/linux/ncp.h | 10 +++--- include/linux/ncp_fs_sb.h | 14 ++++---- include/linux/phonet.h | 4 +-- include/linux/rds.h | 12 +++---- include/linux/sctp.h | 80 +++++++++++++++++++++++----------------------- include/linux/wlp.h | 22 ++++++------- include/net/dn_dev.h | 8 ++--- include/net/dn_nsp.h | 16 +++++----- include/net/ip6_tunnel.h | 2 +- include/net/ipx.h | 8 ++--- include/net/mip6.h | 2 +- include/net/ndisc.h | 2 +- include/net/sctp/structs.h | 4 +-- include/rxrpc/packet.h | 8 ++--- net/bluetooth/bnep/bnep.h | 8 ++--- net/compat.c | 6 ++-- net/iucv/iucv.c | 14 ++++---- net/mac80211/cfg.c | 2 +- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/rx.c | 2 +- net/sctp/sm_make_chunk.c | 2 +- 30 files changed, 133 insertions(+), 133 deletions(-) (limited to 'include') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index bed7a4682b9..c831467774d 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -119,7 +119,7 @@ struct ethhdr { unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ unsigned char h_source[ETH_ALEN]; /* source ether addr */ __be16 h_proto; /* packet type ID field */ -} __attribute__((packed)); +} __packed; #ifdef __KERNEL__ #include diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h index 5459c5c0993..9947c39e62f 100644 --- a/include/linux/if_fddi.h +++ b/include/linux/if_fddi.h @@ -67,7 +67,7 @@ struct fddi_8022_1_hdr { __u8 dsap; /* destination service access point */ __u8 ssap; /* source service access point */ __u8 ctrl; /* control byte #1 */ -} __attribute__ ((packed)); +} __packed; /* Define 802.2 Type 2 header */ struct fddi_8022_2_hdr { @@ -75,7 +75,7 @@ struct fddi_8022_2_hdr { __u8 ssap; /* source service access point */ __u8 ctrl_1; /* control byte #1 */ __u8 ctrl_2; /* control byte #2 */ -} __attribute__ ((packed)); +} __packed; /* Define 802.2 SNAP header */ #define FDDI_K_OUI_LEN 3 @@ -85,7 +85,7 @@ struct fddi_snap_hdr { __u8 ctrl; /* always 0x03 */ __u8 oui[FDDI_K_OUI_LEN]; /* organizational universal id */ __be16 ethertype; /* packet type ID field */ -} __attribute__ ((packed)); +} __packed; /* Define FDDI LLC frame header */ struct fddihdr { @@ -98,7 +98,7 @@ struct fddihdr { struct fddi_8022_2_hdr llc_8022_2; struct fddi_snap_hdr llc_snap; } hdr; -} __attribute__ ((packed)); +} __packed; #ifdef __KERNEL__ #include diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 80b3a1056a5..191ee0869bc 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -135,7 +135,7 @@ struct frhdr __be16 PID; #define IP_NLPID pad -} __attribute__((packed)); +} __packed; /* see RFC 1490 for the definition of the following */ #define FRAD_I_UI 0x03 diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h index 8d038eb8db5..5fe5f307c6f 100644 --- a/include/linux/if_hippi.h +++ b/include/linux/if_hippi.h @@ -104,7 +104,7 @@ struct hippi_fp_hdr { __be32 fixed; #endif __be32 d2_size; -} __attribute__ ((packed)); +} __packed; struct hippi_le_hdr { #if defined (__BIG_ENDIAN_BITFIELD) @@ -129,7 +129,7 @@ struct hippi_le_hdr { __u8 daddr[HIPPI_ALEN]; __u16 locally_administered; __u8 saddr[HIPPI_ALEN]; -} __attribute__ ((packed)); +} __packed; #define HIPPI_OUI_LEN 3 /* @@ -142,12 +142,12 @@ struct hippi_snap_hdr { __u8 ctrl; /* always 0x03 */ __u8 oui[HIPPI_OUI_LEN]; /* organizational universal id (zero)*/ __be16 ethertype; /* packet type ID field */ -} __attribute__ ((packed)); +} __packed; struct hippi_hdr { struct hippi_fp_hdr fp; struct hippi_le_hdr le; struct hippi_snap_hdr snap; -} __attribute__ ((packed)); +} __packed; #endif /* _LINUX_IF_HIPPI_H */ diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index a6577af0c4e..1925e0c3f16 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -59,7 +59,7 @@ struct sockaddr_pppox { union{ struct pppoe_addr pppoe; }sa_addr; -}__attribute__ ((packed)); +} __packed; /* The use of the above union isn't viable because the size of this * struct must stay fixed over time -- applications use sizeof(struct @@ -70,7 +70,7 @@ struct sockaddr_pppol2tp { sa_family_t sa_family; /* address family, AF_PPPOX */ unsigned int sa_protocol; /* protocol identifier */ struct pppol2tp_addr pppol2tp; -}__attribute__ ((packed)); +} __packed; /* The L2TPv3 protocol changes tunnel and session ids from 16 to 32 * bits. So we need a different sockaddr structure. @@ -79,7 +79,7 @@ struct sockaddr_pppol2tpv3 { sa_family_t sa_family; /* address family, AF_PPPOX */ unsigned int sa_protocol; /* protocol identifier */ struct pppol2tpv3_addr pppol2tp; -} __attribute__ ((packed)); +} __packed; /********************************************************************* * @@ -129,7 +129,7 @@ struct pppoe_hdr { __be16 sid; __be16 length; struct pppoe_tag tag[0]; -} __attribute__ ((packed)); +} __packed; /* Length of entire PPPoE + PPP header */ #define PPPOE_SES_HLEN 8 diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 99e1ab7e3ee..940e2159535 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -58,7 +58,7 @@ struct ipv6_opt_hdr { /* * TLV encoded option data follows. */ -} __attribute__ ((packed)); /* required for some archs */ +} __packed; /* required for some archs */ #define ipv6_destopt_hdr ipv6_opt_hdr #define ipv6_hopopt_hdr ipv6_opt_hdr @@ -99,7 +99,7 @@ struct ipv6_destopt_hao { __u8 type; __u8 length; struct in6_addr addr; -} __attribute__ ((__packed__)); +} __packed; /* * IPv6 fixed header diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h index b9b5a684ed6..b8c23f88dd5 100644 --- a/include/linux/isdnif.h +++ b/include/linux/isdnif.h @@ -317,7 +317,7 @@ typedef struct T30_s { __u8 r_scantime; __u8 r_id[FAXIDLEN]; __u8 r_code; -} __attribute__((packed)) T30_s; +} __packed T30_s; #define ISDN_TTY_FAX_CONN_IN 0 #define ISDN_TTY_FAX_CONN_OUT 1 diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 78c3bed1c3f..b5e7f220248 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -251,7 +251,7 @@ struct mISDNhead { unsigned int prim; unsigned int id; -} __attribute__((packed)); +} __packed; #define MISDN_HEADER_LEN sizeof(struct mISDNhead) #define MAX_DATA_SIZE 2048 diff --git a/include/linux/nbd.h b/include/linux/nbd.h index 155719dab81..bb58854a806 100644 --- a/include/linux/nbd.h +++ b/include/linux/nbd.h @@ -88,7 +88,7 @@ struct nbd_request { char handle[8]; __be64 from; __be32 len; -} __attribute__ ((packed)); +} __packed; /* * This is the reply packet that nbd-server sends back to the client after diff --git a/include/linux/ncp.h b/include/linux/ncp.h index 99f0adeeb3f..3ace8370e61 100644 --- a/include/linux/ncp.h +++ b/include/linux/ncp.h @@ -27,7 +27,7 @@ struct ncp_request_header { __u8 conn_high; __u8 function; __u8 data[0]; -} __attribute__((packed)); +} __packed; #define NCP_REPLY (0x3333) #define NCP_WATCHDOG (0x3E3E) @@ -42,7 +42,7 @@ struct ncp_reply_header { __u8 completion_code; __u8 connection_state; __u8 data[0]; -} __attribute__((packed)); +} __packed; #define NCP_VOLNAME_LEN (16) #define NCP_NUMBER_OF_VOLUMES (256) @@ -158,7 +158,7 @@ struct nw_info_struct { #ifdef __KERNEL__ struct nw_nfs_info nfs; #endif -} __attribute__((packed)); +} __packed; /* modify mask - use with MODIFY_DOS_INFO structure */ #define DM_ATTRIBUTES (cpu_to_le32(0x02)) @@ -190,12 +190,12 @@ struct nw_modify_dos_info { __u16 inheritanceGrantMask; __u16 inheritanceRevokeMask; __u32 maximumSpace; -} __attribute__((packed)); +} __packed; struct nw_search_sequence { __u8 volNumber; __u32 dirBase; __u32 sequence; -} __attribute__((packed)); +} __packed; #endif /* _LINUX_NCP_H */ diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h index 5ec9ca67168..8da05bc098c 100644 --- a/include/linux/ncp_fs_sb.h +++ b/include/linux/ncp_fs_sb.h @@ -104,13 +104,13 @@ struct ncp_server { unsigned int state; /* STREAM only: receiver state */ struct { - __u32 magic __attribute__((packed)); - __u32 len __attribute__((packed)); - __u16 type __attribute__((packed)); - __u16 p1 __attribute__((packed)); - __u16 p2 __attribute__((packed)); - __u16 p3 __attribute__((packed)); - __u16 type2 __attribute__((packed)); + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; } buf; /* STREAM only: temporary buffer */ unsigned char* ptr; /* STREAM only: pointer to data */ size_t len; /* STREAM only: length of data to receive */ diff --git a/include/linux/phonet.h b/include/linux/phonet.h index e5126cff9b2..24426c3d6b5 100644 --- a/include/linux/phonet.h +++ b/include/linux/phonet.h @@ -56,7 +56,7 @@ struct phonethdr { __be16 pn_length; __u8 pn_robj; __u8 pn_sobj; -} __attribute__((packed)); +} __packed; /* Common Phonet payload header */ struct phonetmsg { @@ -98,7 +98,7 @@ struct sockaddr_pn { __u8 spn_dev; __u8 spn_resource; __u8 spn_zero[sizeof(struct sockaddr) - sizeof(sa_family_t) - 3]; -} __attribute__ ((packed)); +} __packed; /* Well known address */ #define PN_DEV_PC 0x10 diff --git a/include/linux/rds.h b/include/linux/rds.h index cab4994c2f6..24bce3ded9e 100644 --- a/include/linux/rds.h +++ b/include/linux/rds.h @@ -100,7 +100,7 @@ struct rds_info_counter { u_int8_t name[32]; u_int64_t value; -} __attribute__((packed)); +} __packed; #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 @@ -115,7 +115,7 @@ struct rds_info_connection { __be32 faddr; u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */ u_int8_t flags; -} __attribute__((packed)); +} __packed; struct rds_info_flow { __be32 laddr; @@ -123,7 +123,7 @@ struct rds_info_flow { u_int32_t bytes; __be16 lport; __be16 fport; -} __attribute__((packed)); +} __packed; #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 @@ -136,7 +136,7 @@ struct rds_info_message { __be16 lport; __be16 fport; u_int8_t flags; -} __attribute__((packed)); +} __packed; struct rds_info_socket { u_int32_t sndbuf; @@ -146,7 +146,7 @@ struct rds_info_socket { __be16 connected_port; u_int32_t rcvbuf; u_int64_t inum; -} __attribute__((packed)); +} __packed; struct rds_info_tcp_socket { __be32 local_addr; @@ -158,7 +158,7 @@ struct rds_info_tcp_socket { u_int32_t last_sent_nxt; u_int32_t last_expected_una; u_int32_t last_seen_una; -} __attribute__((packed)); +} __packed; #define RDS_IB_GID_LEN 16 struct rds_info_rdma_connection { diff --git a/include/linux/sctp.h b/include/linux/sctp.h index c20d3ce673c..c11a28706fa 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -61,7 +61,7 @@ typedef struct sctphdr { __be16 dest; __be32 vtag; __le32 checksum; -} __attribute__((packed)) sctp_sctphdr_t; +} __packed sctp_sctphdr_t; #ifdef __KERNEL__ #include @@ -77,7 +77,7 @@ typedef struct sctp_chunkhdr { __u8 type; __u8 flags; __be16 length; -} __attribute__((packed)) sctp_chunkhdr_t; +} __packed sctp_chunkhdr_t; /* Section 3.2. Chunk Type Values. @@ -167,7 +167,7 @@ enum { SCTP_CHUNK_FLAG_T = 0x01 }; typedef struct sctp_paramhdr { __be16 type; __be16 length; -} __attribute__((packed)) sctp_paramhdr_t; +} __packed sctp_paramhdr_t; typedef enum { @@ -228,12 +228,12 @@ typedef struct sctp_datahdr { __be16 ssn; __be32 ppid; __u8 payload[0]; -} __attribute__((packed)) sctp_datahdr_t; +} __packed sctp_datahdr_t; typedef struct sctp_data_chunk { sctp_chunkhdr_t chunk_hdr; sctp_datahdr_t data_hdr; -} __attribute__((packed)) sctp_data_chunk_t; +} __packed sctp_data_chunk_t; /* DATA Chuck Specific Flags */ enum { @@ -259,78 +259,78 @@ typedef struct sctp_inithdr { __be16 num_inbound_streams; __be32 initial_tsn; __u8 params[0]; -} __attribute__((packed)) sctp_inithdr_t; +} __packed sctp_inithdr_t; typedef struct sctp_init_chunk { sctp_chunkhdr_t chunk_hdr; sctp_inithdr_t init_hdr; -} __attribute__((packed)) sctp_init_chunk_t; +} __packed sctp_init_chunk_t; /* Section 3.3.2.1. IPv4 Address Parameter (5) */ typedef struct sctp_ipv4addr_param { sctp_paramhdr_t param_hdr; struct in_addr addr; -} __attribute__((packed)) sctp_ipv4addr_param_t; +} __packed sctp_ipv4addr_param_t; /* Section 3.3.2.1. IPv6 Address Parameter (6) */ typedef struct sctp_ipv6addr_param { sctp_paramhdr_t param_hdr; struct in6_addr addr; -} __attribute__((packed)) sctp_ipv6addr_param_t; +} __packed sctp_ipv6addr_param_t; /* Section 3.3.2.1 Cookie Preservative (9) */ typedef struct sctp_cookie_preserve_param { sctp_paramhdr_t param_hdr; __be32 lifespan_increment; -} __attribute__((packed)) sctp_cookie_preserve_param_t; +} __packed sctp_cookie_preserve_param_t; /* Section 3.3.2.1 Host Name Address (11) */ typedef struct sctp_hostname_param { sctp_paramhdr_t param_hdr; uint8_t hostname[0]; -} __attribute__((packed)) sctp_hostname_param_t; +} __packed sctp_hostname_param_t; /* Section 3.3.2.1 Supported Address Types (12) */ typedef struct sctp_supported_addrs_param { sctp_paramhdr_t param_hdr; __be16 types[0]; -} __attribute__((packed)) sctp_supported_addrs_param_t; +} __packed sctp_supported_addrs_param_t; /* Appendix A. ECN Capable (32768) */ typedef struct sctp_ecn_capable_param { sctp_paramhdr_t param_hdr; -} __attribute__((packed)) sctp_ecn_capable_param_t; +} __packed sctp_ecn_capable_param_t; /* ADDIP Section 3.2.6 Adaptation Layer Indication */ typedef struct sctp_adaptation_ind_param { struct sctp_paramhdr param_hdr; __be32 adaptation_ind; -} __attribute__((packed)) sctp_adaptation_ind_param_t; +} __packed sctp_adaptation_ind_param_t; /* ADDIP Section 4.2.7 Supported Extensions Parameter */ typedef struct sctp_supported_ext_param { struct sctp_paramhdr param_hdr; __u8 chunks[0]; -} __attribute__((packed)) sctp_supported_ext_param_t; +} __packed sctp_supported_ext_param_t; /* AUTH Section 3.1 Random */ typedef struct sctp_random_param { sctp_paramhdr_t param_hdr; __u8 random_val[0]; -} __attribute__((packed)) sctp_random_param_t; +} __packed sctp_random_param_t; /* AUTH Section 3.2 Chunk List */ typedef struct sctp_chunks_param { sctp_paramhdr_t param_hdr; __u8 chunks[0]; -} __attribute__((packed)) sctp_chunks_param_t; +} __packed sctp_chunks_param_t; /* AUTH Section 3.3 HMAC Algorithm */ typedef struct sctp_hmac_algo_param { sctp_paramhdr_t param_hdr; __be16 hmac_ids[0]; -} __attribute__((packed)) sctp_hmac_algo_param_t; +} __packed sctp_hmac_algo_param_t; /* RFC 2960. Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2): * The INIT ACK chunk is used to acknowledge the initiation of an SCTP @@ -342,13 +342,13 @@ typedef sctp_init_chunk_t sctp_initack_chunk_t; typedef struct sctp_cookie_param { sctp_paramhdr_t p; __u8 body[0]; -} __attribute__((packed)) sctp_cookie_param_t; +} __packed sctp_cookie_param_t; /* Section 3.3.3.1 Unrecognized Parameters (8) */ typedef struct sctp_unrecognized_param { sctp_paramhdr_t param_hdr; sctp_paramhdr_t unrecognized; -} __attribute__((packed)) sctp_unrecognized_param_t; +} __packed sctp_unrecognized_param_t; @@ -363,7 +363,7 @@ typedef struct sctp_unrecognized_param { typedef struct sctp_gap_ack_block { __be16 start; __be16 end; -} __attribute__((packed)) sctp_gap_ack_block_t; +} __packed sctp_gap_ack_block_t; typedef __be32 sctp_dup_tsn_t; @@ -378,12 +378,12 @@ typedef struct sctp_sackhdr { __be16 num_gap_ack_blocks; __be16 num_dup_tsns; sctp_sack_variable_t variable[0]; -} __attribute__((packed)) sctp_sackhdr_t; +} __packed sctp_sackhdr_t; typedef struct sctp_sack_chunk { sctp_chunkhdr_t chunk_hdr; sctp_sackhdr_t sack_hdr; -} __attribute__((packed)) sctp_sack_chunk_t; +} __packed sctp_sack_chunk_t; /* RFC 2960. Section 3.3.5 Heartbeat Request (HEARTBEAT) (4): @@ -395,12 +395,12 @@ typedef struct sctp_sack_chunk { typedef struct sctp_heartbeathdr { sctp_paramhdr_t info; -} __attribute__((packed)) sctp_heartbeathdr_t; +} __packed sctp_heartbeathdr_t; typedef struct sctp_heartbeat_chunk { sctp_chunkhdr_t chunk_hdr; sctp_heartbeathdr_t hb_hdr; -} __attribute__((packed)) sctp_heartbeat_chunk_t; +} __packed sctp_heartbeat_chunk_t; /* For the abort and shutdown ACK we must carry the init tag in the @@ -409,7 +409,7 @@ typedef struct sctp_heartbeat_chunk { */ typedef struct sctp_abort_chunk { sctp_chunkhdr_t uh; -} __attribute__((packed)) sctp_abort_chunk_t; +} __packed sctp_abort_chunk_t; /* For the graceful shutdown we must carry the tag (in common header) @@ -417,12 +417,12 @@ typedef struct sctp_abort_chunk { */ typedef struct sctp_shutdownhdr { __be32 cum_tsn_ack; -} __attribute__((packed)) sctp_shutdownhdr_t; +} __packed sctp_shutdownhdr_t; struct sctp_shutdown_chunk_t { sctp_chunkhdr_t chunk_hdr; sctp_shutdownhdr_t shutdown_hdr; -} __attribute__ ((packed)); +} __packed; /* RFC 2960. Section 3.3.10 Operation Error (ERROR) (9) */ @@ -430,12 +430,12 @@ typedef struct sctp_errhdr { __be16 cause; __be16 length; __u8 variable[0]; -} __attribute__((packed)) sctp_errhdr_t; +} __packed sctp_errhdr_t; typedef struct sctp_operr_chunk { sctp_chunkhdr_t chunk_hdr; sctp_errhdr_t err_hdr; -} __attribute__((packed)) sctp_operr_chunk_t; +} __packed sctp_operr_chunk_t; /* RFC 2960 3.3.10 - Operation Error * @@ -525,7 +525,7 @@ typedef struct sctp_ecnehdr { typedef struct sctp_ecne_chunk { sctp_chunkhdr_t chunk_hdr; sctp_ecnehdr_t ence_hdr; -} __attribute__((packed)) sctp_ecne_chunk_t; +} __packed sctp_ecne_chunk_t; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Congestion Window Reduced (CWR) (13) @@ -537,7 +537,7 @@ typedef struct sctp_cwrhdr { typedef struct sctp_cwr_chunk { sctp_chunkhdr_t chunk_hdr; sctp_cwrhdr_t cwr_hdr; -} __attribute__((packed)) sctp_cwr_chunk_t; +} __packed sctp_cwr_chunk_t; /* PR-SCTP * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN) @@ -588,17 +588,17 @@ typedef struct sctp_cwr_chunk { struct sctp_fwdtsn_skip { __be16 stream; __be16 ssn; -} __attribute__((packed)); +} __packed; struct sctp_fwdtsn_hdr { __be32 new_cum_tsn; struct sctp_fwdtsn_skip skip[0]; -} __attribute((packed)); +} __packed; struct sctp_fwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_fwdtsn_hdr fwdtsn_hdr; -} __attribute((packed)); +} __packed; /* ADDIP @@ -636,17 +636,17 @@ struct sctp_fwdtsn_chunk { typedef struct sctp_addip_param { sctp_paramhdr_t param_hdr; __be32 crr_id; -} __attribute__((packed)) sctp_addip_param_t; +} __packed sctp_addip_param_t; typedef struct sctp_addiphdr { __be32 serial; __u8 params[0]; -} __attribute__((packed)) sctp_addiphdr_t; +} __packed sctp_addiphdr_t; typedef struct sctp_addip_chunk { sctp_chunkhdr_t chunk_hdr; sctp_addiphdr_t addip_hdr; -} __attribute__((packed)) sctp_addip_chunk_t; +} __packed sctp_addip_chunk_t; /* AUTH * Section 4.1 Authentication Chunk (AUTH) @@ -701,11 +701,11 @@ typedef struct sctp_authhdr { __be16 shkey_id; __be16 hmac_id; __u8 hmac[0]; -} __attribute__((packed)) sctp_authhdr_t; +} __packed sctp_authhdr_t; typedef struct sctp_auth_chunk { sctp_chunkhdr_t chunk_hdr; sctp_authhdr_t auth_hdr; -} __attribute__((packed)) sctp_auth_chunk_t; +} __packed sctp_auth_chunk_t; #endif /* __LINUX_SCTP_H__ */ diff --git a/include/linux/wlp.h b/include/linux/wlp.h index ac95ce6606a..c76fe239250 100644 --- a/include/linux/wlp.h +++ b/include/linux/wlp.h @@ -300,7 +300,7 @@ struct wlp_ie { __le16 cycle_param; __le16 acw_anchor_addr; u8 wssid_hash_list[]; -} __attribute__((packed)); +} __packed; static inline int wlp_ie_hash_length(struct wlp_ie *ie) { @@ -324,7 +324,7 @@ static inline void wlp_ie_set_hash_length(struct wlp_ie *ie, int hash_length) */ struct wlp_nonce { u8 data[16]; -} __attribute__((packed)); +} __packed; /** * WLP UUID @@ -336,7 +336,7 @@ struct wlp_nonce { */ struct wlp_uuid { u8 data[16]; -} __attribute__((packed)); +} __packed; /** @@ -348,7 +348,7 @@ struct wlp_dev_type { u8 OUI[3]; u8 OUIsubdiv; __le16 subID; -} __attribute__((packed)); +} __packed; /** * WLP frame header @@ -357,7 +357,7 @@ struct wlp_dev_type { struct wlp_frame_hdr { __le16 mux_hdr; /* WLP_PROTOCOL_ID */ enum wlp_frame_type type:8; -} __attribute__((packed)); +} __packed; /** * WLP attribute field header @@ -368,7 +368,7 @@ struct wlp_frame_hdr { struct wlp_attr_hdr { __le16 type; __le16 length; -} __attribute__((packed)); +} __packed; /** * Device information commonly used together @@ -401,13 +401,13 @@ struct wlp_device_info { struct wlp_attr_##name { \ struct wlp_attr_hdr hdr; \ type name; \ -} __attribute__((packed)); +} __packed; #define wlp_attr_array(type, name) \ struct wlp_attr_##name { \ struct wlp_attr_hdr hdr; \ type name[]; \ -} __attribute__((packed)); +} __packed; /** * WLP association attribute fields @@ -483,7 +483,7 @@ struct wlp_wss_info { struct wlp_attr_accept_enrl accept; struct wlp_attr_wss_sec_status sec_stat; struct wlp_attr_wss_bcast bcast; -} __attribute__((packed)); +} __packed; /* WLP WSS Information */ wlp_attr_array(struct wlp_wss_info, wss_info) @@ -520,7 +520,7 @@ wlp_attr(u8, wlp_assc_err) struct wlp_frame_std_abbrv_hdr { struct wlp_frame_hdr hdr; u8 tag; -} __attribute__((packed)); +} __packed; /** * WLP association frames @@ -533,7 +533,7 @@ struct wlp_frame_assoc { struct wlp_attr_version version; struct wlp_attr_msg_type msg_type; u8 attr[]; -} __attribute__((packed)); +} __packed; /* Ethernet to dev address mapping */ struct wlp_eda { diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h index 511a459ec10..0916bbf3bdf 100644 --- a/include/net/dn_dev.h +++ b/include/net/dn_dev.h @@ -101,7 +101,7 @@ struct dn_short_packet { __le16 dstnode; __le16 srcnode; __u8 forward; -} __attribute__((packed)); +} __packed; struct dn_long_packet { __u8 msgflg; @@ -115,7 +115,7 @@ struct dn_long_packet { __u8 visit_ct; __u8 s_class; __u8 pt; -} __attribute__((packed)); +} __packed; /*------------------------- DRP - Routing messages ---------------------*/ @@ -132,7 +132,7 @@ struct endnode_hello_message { __u8 mpd; __u8 datalen; __u8 data[2]; -} __attribute__((packed)); +} __packed; struct rtnode_hello_message { __u8 msgflg; @@ -144,7 +144,7 @@ struct rtnode_hello_message { __u8 area; __le16 timer; __u8 mpd; -} __attribute__((packed)); +} __packed; extern void dn_dev_init(void); diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h index 17d43d2db5e..e43a2893f13 100644 --- a/include/net/dn_nsp.h +++ b/include/net/dn_nsp.h @@ -74,18 +74,18 @@ struct nsp_data_seg_msg { __u8 msgflg; __le16 dstaddr; __le16 srcaddr; -} __attribute__((packed)); +} __packed; struct nsp_data_opt_msg { __le16 acknum; __le16 segnum; __le16 lsflgs; -} __attribute__((packed)); +} __packed; struct nsp_data_opt_msg1 { __le16 acknum; __le16 segnum; -} __attribute__((packed)); +} __packed; /* Acknowledgment Message (data/other data) */ @@ -94,13 +94,13 @@ struct nsp_data_ack_msg { __le16 dstaddr; __le16 srcaddr; __le16 acknum; -} __attribute__((packed)); +} __packed; /* Connect Acknowledgment Message */ struct nsp_conn_ack_msg { __u8 msgflg; __le16 dstaddr; -} __attribute__((packed)); +} __packed; /* Connect Initiate/Retransmit Initiate/Connect Confirm */ @@ -117,7 +117,7 @@ struct nsp_conn_init_msg { #define NSP_FC_MASK 0x0c /* FC type mask */ __u8 info; __le16 segsize; -} __attribute__((packed)); +} __packed; /* Disconnect Initiate/Disconnect Confirm */ struct nsp_disconn_init_msg { @@ -125,7 +125,7 @@ struct nsp_disconn_init_msg { __le16 dstaddr; __le16 srcaddr; __le16 reason; -} __attribute__((packed)); +} __packed; @@ -135,7 +135,7 @@ struct srcobj_fmt { __le16 grpcode; __le16 usrcode; __u8 dlen; -} __attribute__((packed)); +} __packed; /* * A collection of functions for manipulating the sequence diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index fbf9d1cda27..fc94ec568a5 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -27,6 +27,6 @@ struct ipv6_tlv_tnl_enc_lim { __u8 type; /* type-code for option */ __u8 length; /* option length */ __u8 encap_limit; /* tunnel encapsulation limit */ -} __attribute__ ((packed)); +} __packed; #endif diff --git a/include/net/ipx.h b/include/net/ipx.h index ef51a668ba1..05d7e4a88b4 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -27,9 +27,9 @@ struct ipx_address { #define IPX_MAX_PPROP_HOPS 8 struct ipxhdr { - __be16 ipx_checksum __attribute__ ((packed)); + __be16 ipx_checksum __packed; #define IPX_NO_CHECKSUM cpu_to_be16(0xFFFF) - __be16 ipx_pktsize __attribute__ ((packed)); + __be16 ipx_pktsize __packed; __u8 ipx_tctrl; __u8 ipx_type; #define IPX_TYPE_UNKNOWN 0x00 @@ -38,8 +38,8 @@ struct ipxhdr { #define IPX_TYPE_SPX 0x05 /* SPX protocol */ #define IPX_TYPE_NCP 0x11 /* $lots for docs on this (SPIT) */ #define IPX_TYPE_PPROP 0x14 /* complicated flood fill brdcast */ - struct ipx_address ipx_dest __attribute__ ((packed)); - struct ipx_address ipx_source __attribute__ ((packed)); + struct ipx_address ipx_dest __packed; + struct ipx_address ipx_source __packed; }; static __inline__ struct ipxhdr *ipx_hdr(struct sk_buff *skb) diff --git a/include/net/mip6.h b/include/net/mip6.h index a83ad1982a9..26ba99b5a4b 100644 --- a/include/net/mip6.h +++ b/include/net/mip6.h @@ -39,7 +39,7 @@ struct ip6_mh { __u16 ip6mh_cksum; /* Followed by type specific messages */ __u8 data[0]; -} __attribute__ ((__packed__)); +} __packed; #define IP6_MH_TYPE_BRR 0 /* Binding Refresh Request */ #define IP6_MH_TYPE_HOTI 1 /* HOTI Message */ diff --git a/include/net/ndisc.h b/include/net/ndisc.h index f76f22d0572..895997bc2ea 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -82,7 +82,7 @@ struct ra_msg { struct nd_opt_hdr { __u8 nd_opt_type; __u8 nd_opt_len; -} __attribute__((__packed__)); +} __packed; extern int ndisc_init(void); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4b860116e09..f9e7473613b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -443,7 +443,7 @@ struct sctp_signed_cookie { __u8 signature[SCTP_SECRET_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; -} __attribute__((packed)); +} __packed; /* This is another convenience type to allocate memory for address * params for the maximum size and pass such structures around @@ -488,7 +488,7 @@ typedef struct sctp_sender_hb_info { union sctp_addr daddr; unsigned long sent_at; __u64 hb_nonce; -} __attribute__((packed)) sctp_sender_hb_info_t; +} __packed sctp_sender_hb_info_t; /* * RFC 2960 1.3.2 Sequenced Delivery within Streams diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index b69e6e173ea..9b2c30897e5 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -65,7 +65,7 @@ struct rxrpc_header { }; __be16 serviceId; /* service ID */ -} __attribute__((packed)); +} __packed; #define __rxrpc_header_off(X) offsetof(struct rxrpc_header,X) @@ -120,7 +120,7 @@ struct rxrpc_ackpacket { #define RXRPC_ACK_TYPE_NACK 0 #define RXRPC_ACK_TYPE_ACK 1 -} __attribute__((packed)); +} __packed; /* * ACK packets can have a further piece of information tagged on the end @@ -141,7 +141,7 @@ struct rxkad_challenge { __be32 nonce; /* encrypted random number */ __be32 min_level; /* minimum security level */ __be32 __padding; /* padding to 8-byte boundary */ -} __attribute__((packed)); +} __packed; /*****************************************************************************/ /* @@ -164,7 +164,7 @@ struct rxkad_response { __be32 kvno; /* Kerberos key version number */ __be32 ticket_len; /* Kerberos ticket length */ -} __attribute__((packed)); +} __packed; /*****************************************************************************/ /* diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h index 0d9e506f5d5..70672544db8 100644 --- a/net/bluetooth/bnep/bnep.h +++ b/net/bluetooth/bnep/bnep.h @@ -86,26 +86,26 @@ struct bnep_setup_conn_req { __u8 ctrl; __u8 uuid_size; __u8 service[0]; -} __attribute__((packed)); +} __packed; struct bnep_set_filter_req { __u8 type; __u8 ctrl; __be16 len; __u8 list[0]; -} __attribute__((packed)); +} __packed; struct bnep_control_rsp { __u8 type; __u8 ctrl; __be16 resp; -} __attribute__((packed)); +} __packed; struct bnep_ext_hdr { __u8 type; __u8 len; __u8 data[0]; -} __attribute__((packed)); +} __packed; /* BNEP ioctl defines */ #define BNEPCONNADD _IOW('B', 200, int) diff --git a/net/compat.c b/net/compat.c index ec24d9edb02..1cf75905f13 100644 --- a/net/compat.c +++ b/net/compat.c @@ -531,7 +531,7 @@ struct compat_group_req { __u32 gr_interface; struct __kernel_sockaddr_storage gr_group __attribute__ ((aligned(4))); -} __attribute__ ((packed)); +} __packed; struct compat_group_source_req { __u32 gsr_interface; @@ -539,7 +539,7 @@ struct compat_group_source_req { __attribute__ ((aligned(4))); struct __kernel_sockaddr_storage gsr_source __attribute__ ((aligned(4))); -} __attribute__ ((packed)); +} __packed; struct compat_group_filter { __u32 gf_interface; @@ -549,7 +549,7 @@ struct compat_group_filter { __u32 gf_numsrc; struct __kernel_sockaddr_storage gf_slist[1] __attribute__ ((aligned(4))); -} __attribute__ ((packed)); +} __packed; #define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ sizeof(struct __kernel_sockaddr_storage)) diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index f28ad2cc842..499c045d691 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -1463,7 +1463,7 @@ struct iucv_path_pending { u32 res3; u8 ippollfg; u8 res4[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_path_pending(struct iucv_irq_data *data) { @@ -1524,7 +1524,7 @@ struct iucv_path_complete { u32 res3; u8 ippollfg; u8 res4[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_path_complete(struct iucv_irq_data *data) { @@ -1554,7 +1554,7 @@ struct iucv_path_severed { u32 res4; u8 ippollfg; u8 res5[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_path_severed(struct iucv_irq_data *data) { @@ -1590,7 +1590,7 @@ struct iucv_path_quiesced { u32 res4; u8 ippollfg; u8 res5[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_path_quiesced(struct iucv_irq_data *data) { @@ -1618,7 +1618,7 @@ struct iucv_path_resumed { u32 res4; u8 ippollfg; u8 res5[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_path_resumed(struct iucv_irq_data *data) { @@ -1649,7 +1649,7 @@ struct iucv_message_complete { u32 ipbfln2f; u8 ippollfg; u8 res2[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_message_complete(struct iucv_irq_data *data) { @@ -1694,7 +1694,7 @@ struct iucv_message_pending { u32 ipbfln2f; u8 ippollfg; u8 res2[3]; -} __attribute__ ((packed)); +} __packed; static void iucv_message_pending(struct iucv_irq_data *data) { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c7000a6ca37..a2ed0f7b556 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -600,7 +600,7 @@ struct iapp_layer2_update { u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; -} __attribute__ ((packed)); +} __packed; static void ieee80211_send_layer2_update(struct sta_info *sta) { diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 1a9e2da37a9..ec3e5c3e27b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1084,7 +1084,7 @@ struct ieee80211_tx_status_rtap_hdr { u8 padding_for_rate; __le16 tx_flags; u8 data_retries; -} __attribute__ ((packed)); +} __packed; /* HT */ diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6e2a7bcd8cb..2d9a2ee94e1 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2139,7 +2139,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, u8 rate_or_pad; __le16 chan_freq; __le16 chan_flags; - } __attribute__ ((packed)) *rthdr; + } __packed *rthdr; struct sk_buff *skb = rx->skb, *skb2; struct net_device *prev_dev = NULL; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index bd2a50b482a..246f9292465 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1817,7 +1817,7 @@ malformed: struct __sctp_missing { __be32 num_missing; __be16 type; -} __attribute__((packed)); +} __packed; /* * Report a missing mandatory parameter. -- cgit v1.2.3-70-g09d2 From 095dfdb0c479661f437b24b85e31f0d0b841eab6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 26 May 2010 17:19:25 +0200 Subject: mac80211: remove tx status ampdu_ack_map There's a single use of this struct member, but as it is write-only it clearly not necessary. Thus we can free up some space here, even if we don't need it right now it seems pointless to carry around the variable. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn-tx.c | 1 - include/net/mac80211.h | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-tx.c b/drivers/net/wireless/iwlwifi/iwl-agn-tx.c index 52bec104046..bde342b5df8 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-tx.c @@ -1252,7 +1252,6 @@ static int iwlagn_tx_status_reply_compressed_ba(struct iwl_priv *priv, info->flags |= IEEE80211_TX_STAT_ACK; info->flags |= IEEE80211_TX_STAT_AMPDU; info->status.ampdu_ack_len = successes; - info->status.ampdu_ack_map = bitmap; info->status.ampdu_len = agg->frame_count; iwlagn_hwrate_to_tx_control(priv, agg->rate_n_flags, info); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index de22cbfef23..f26440a46df 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -412,8 +412,6 @@ struct ieee80211_tx_rate { * @driver_data: array of driver_data pointers * @ampdu_ack_len: number of acked aggregated frames. * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ampdu_ack_map: block ack bit map for the aggregation. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. * @ampdu_len: number of aggregated frames. * relevant only if IEEE80211_TX_STAT_AMPDU was set. * @ack_signal: signal strength of the ACK frame @@ -448,10 +446,9 @@ struct ieee80211_tx_info { struct { struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES]; u8 ampdu_ack_len; - u64 ampdu_ack_map; int ack_signal; u8 ampdu_len; - /* 7 bytes free */ + /* 15 bytes free */ } status; struct { struct ieee80211_tx_rate driver_rates[ -- cgit v1.2.3-70-g09d2 From 2b2c009ecf71f4c66ff8420b63dddbc9737e04e3 Mon Sep 17 00:00:00 2001 From: Juuso Oikarinen Date: Thu, 27 May 2010 15:32:13 +0300 Subject: mac80211: Add support for hardware ARP query filtering Some hardware allow extended filtering of ARP frames not intended for the host. To perform such filtering, the hardware needs to know the current IP address(es) of the host, bound to its interface. Add support for ARP filtering to mac80211 by adding a new op to the driver interface, allowing to configure the current IP addresses. This op is called upon association with the currently configured address(es), and when associated whenever the IP address(es) change. This patch adds configuration of IPv4 addresses only, as IPv6 addresses don't need ARP filtering. Signed-off-by: Juuso Oikarinen Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/mac80211.h | 14 ++++++++++ net/mac80211/driver-ops.h | 17 ++++++++++++ net/mac80211/driver-trace.h | 25 ++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/main.c | 63 ++++++++++++++++++++++++++++++++++++++++++++- net/mac80211/mlme.c | 11 +++++++- 6 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f26440a46df..74b9b49ddfa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -19,6 +19,7 @@ #include #include #include +#include #include /** @@ -1532,6 +1533,16 @@ enum ieee80211_ampdu_mlme_action { * of the bss parameters has changed when a call is made. The callback * can sleep. * + * @configure_arp_filter: Configuration function for hardware ARP query filter. + * This function is called with all the IP addresses configured to the + * interface as argument - all ARP queries targeted to any of these + * addresses must pass through. If the hardware filter does not support + * enought addresses, hardware filtering must be disabled. The ifa_list + * argument may be NULL, indicating that filtering must be disabled. + * This function is called upon association complete with current + * address(es), and while associated whenever the IP address(es) change. + * The callback can sleep. + * * @prepare_multicast: Prepare for multicast filter configuration. * This callback is optional, and its return value is passed * to configure_filter(). This callback must be atomic. @@ -1671,6 +1682,9 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u32 changed); + int (*configure_arp_filter)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct in_ifaddr *ifa_list); u64 (*prepare_multicast)(struct ieee80211_hw *hw, struct netdev_hw_addr_list *mc_list); void (*configure_filter)(struct ieee80211_hw *hw, diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 4f227131665..978850ee3a5 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -83,6 +83,23 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, trace_drv_bss_info_changed(local, sdata, info, changed); } +struct in_ifaddr; +static inline int drv_configure_arp_filter(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct in_ifaddr *ifa_list) +{ + int ret = 0; + + might_sleep(); + + if (local->ops->configure_arp_filter) + ret = local->ops->configure_arp_filter(&local->hw, vif, + ifa_list); + + trace_drv_configure_arp_filter(local, vif_to_sdata(vif), ifa_list, ret); + return ret; +} + static inline u64 drv_prepare_multicast(struct ieee80211_local *local, struct netdev_hw_addr_list *mc_list) { diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index 6a9b2342a9c..577460da2ea 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -219,6 +219,31 @@ TRACE_EVENT(drv_bss_info_changed, ) ); +TRACE_EVENT(drv_configure_arp_filter, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct in_ifaddr *ifa_list, int ret), + + TP_ARGS(local, sdata, ifa_list, ret), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(int, ret) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->ret = ret; + ), + + TP_printk( + VIF_PR_FMT LOCAL_PR_FMT " ret:%d", + VIF_PR_ARG, LOCAL_PR_ARG, __entry->ret + ) +); + TRACE_EVENT(drv_prepare_multicast, TP_PROTO(struct ieee80211_local *local, int mc_count, u64 ret), diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d4677efd3a3..47d67537f17 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -851,6 +851,7 @@ struct ieee80211_local { struct work_struct dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; struct notifier_block network_latency_notifier; + struct notifier_block ifa_notifier; int user_power_level; /* in dBm */ int power_constr_level; /* in dBm */ @@ -997,6 +998,7 @@ void ieee80211_send_pspoll(struct ieee80211_local *local, void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency); int ieee80211_max_network_latency(struct notifier_block *nb, unsigned long data, void *dummy); +int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_sw_ie *sw_elem, struct ieee80211_bss *bss, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index c8548e61f86..4051b232c6e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -329,6 +329,58 @@ static void ieee80211_recalc_smps_work(struct work_struct *work) mutex_unlock(&local->iflist_mtx); } +int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata) +{ + struct in_device *idev; + int ret = 0; + + BUG_ON(!sdata); + ASSERT_RTNL(); + + idev = sdata->dev->ip_ptr; + if (!idev) + return 0; + + ret = drv_configure_arp_filter(sdata->local, &sdata->vif, + idev->ifa_list); + return ret; +} + +static int ieee80211_ifa_changed(struct notifier_block *nb, + unsigned long data, void *arg) +{ + struct in_ifaddr *ifa = arg; + struct ieee80211_local *local = + container_of(nb, struct ieee80211_local, + ifa_notifier); + struct net_device *ndev = ifa->ifa_dev->dev; + struct wireless_dev *wdev = ndev->ieee80211_ptr; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + + /* Make sure it's our interface that got changed */ + if (!wdev) + return NOTIFY_DONE; + + if (wdev->wiphy != local->hw.wiphy) + return NOTIFY_DONE; + + /* We are concerned about IP addresses only when associated */ + sdata = IEEE80211_DEV_TO_SUB_IF(ndev); + + /* ARP filtering is only supported in managed mode */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return NOTIFY_DONE; + + ifmgd = &sdata->u.mgd; + mutex_lock(&ifmgd->mtx); + if (ifmgd->associated) + ieee80211_set_arp_filter(sdata); + mutex_unlock(&ifmgd->mtx); + + return NOTIFY_DONE; +} + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -612,14 +664,22 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_max_network_latency; result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, &local->network_latency_notifier); - if (result) { rtnl_lock(); goto fail_pm_qos; } + local->ifa_notifier.notifier_call = ieee80211_ifa_changed; + result = register_inetaddr_notifier(&local->ifa_notifier); + if (result) + goto fail_ifa; + return 0; + fail_ifa: + pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, + &local->network_latency_notifier); + rtnl_lock(); fail_pm_qos: ieee80211_led_exit(local); ieee80211_remove_interfaces(local); @@ -647,6 +707,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, &local->network_latency_notifier); + unregister_inetaddr_notifier(&local->ifa_notifier); rtnl_lock(); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 29c3a75a7ad..7e720133358 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2078,8 +2078,17 @@ static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk, cfg80211_send_assoc_timeout(wk->sdata->dev, wk->filter_ta); return WORK_DONE_DESTROY; + } else { + mutex_unlock(&wk->sdata->u.mgd.mtx); + + /* + * configure ARP filter IP addresses to the driver, + * intentionally outside the mgd mutex. + */ + rtnl_lock(); + ieee80211_set_arp_filter(wk->sdata); + rtnl_unlock(); } - mutex_unlock(&wk->sdata->u.mgd.mtx); } cfg80211_send_rx_assoc(wk->sdata->dev, skb->data, skb->len); -- cgit v1.2.3-70-g09d2 From 6a8579d0e62c0eac428184ce45e86bc46677724a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 27 May 2010 14:41:07 +0200 Subject: mac80211: clean up ieee80211_stop_tx_ba_session There's no sense in letting anything but internal mac80211 functions set the initiator to anything but WLAN_BACK_INITIATOR, since WLAN_BACK_RECIPIENT is only valid when we have received a frame from the peer, which we react to directly in mac80211. The debugfs code I recently added got this wrong as well. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn-rs.c | 3 +-- include/net/mac80211.h | 6 ++---- net/mac80211/agg-tx.c | 7 +++---- net/mac80211/debugfs_sta.c | 3 +-- net/mac80211/driver-trace.h | 10 ++++------ 5 files changed, 11 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c index cf4a95bae4f..40933a5de02 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c @@ -313,8 +313,7 @@ static int rs_tl_turn_on_agg_for_tid(struct iwl_priv *priv, */ IWL_DEBUG_HT(priv, "Fail start Tx agg on tid: %d\n", tid); - ieee80211_stop_tx_ba_session(sta, tid, - WLAN_BACK_INITIATOR); + ieee80211_stop_tx_ba_session(sta, tid); } } else IWL_ERR(priv, "Fail finding valid aggregation tid: %d\n", tid); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 74b9b49ddfa..2e728611c57 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2345,16 +2345,14 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, * ieee80211_stop_tx_ba_session - Stop a Block Ack session. * @sta: the station whose BA session to stop * @tid: the TID to stop BA. - * @initiator: if indicates initiator DELBA frame will be sent. * - * Return: error if no sta with matching da found, success otherwise + * Return: negative error if the TID is invalid, or no aggregation active * * Although mac80211/low level driver/user space application can estimate * the need to stop aggregation on a certain RA/TID, the session level * will be managed by the mac80211. */ -int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid, - enum ieee80211_back_parties initiator); +int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid); /** * ieee80211_stop_tx_ba_cb - low level driver ready to stop aggregate. diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c163d0a149f..feb15c4a1fa 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -538,14 +538,13 @@ int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, return ret; } -int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, - enum ieee80211_back_parties initiator) +int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - trace_api_stop_tx_ba_session(pubsta, tid, initiator); + trace_api_stop_tx_ba_session(pubsta, tid); if (!local->ops->ampdu_action) return -EINVAL; @@ -553,7 +552,7 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, if (tid >= STA_TID_NUM) return -EINVAL; - return __ieee80211_stop_tx_ba_session(sta, tid, initiator); + return __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR); } EXPORT_SYMBOL(ieee80211_stop_tx_ba_session); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index e763f1529dd..9f140612224 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -210,8 +210,7 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu if (start) ret = ieee80211_start_tx_ba_session(&sta->sta, tid); else - ret = ieee80211_stop_tx_ba_session(&sta->sta, tid, - WLAN_BACK_RECIPIENT); + ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); } else { __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3); ret = 0; diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index 577460da2ea..6b90630151a 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -876,25 +876,23 @@ TRACE_EVENT(api_start_tx_ba_cb, ); TRACE_EVENT(api_stop_tx_ba_session, - TP_PROTO(struct ieee80211_sta *sta, u16 tid, u16 initiator), + TP_PROTO(struct ieee80211_sta *sta, u16 tid), - TP_ARGS(sta, tid, initiator), + TP_ARGS(sta, tid), TP_STRUCT__entry( STA_ENTRY __field(u16, tid) - __field(u16, initiator) ), TP_fast_assign( STA_ASSIGN; __entry->tid = tid; - __entry->initiator = initiator; ), TP_printk( - STA_PR_FMT " tid:%d initiator:%d", - STA_PR_ARG, __entry->tid, __entry->initiator + STA_PR_FMT " tid:%d", + STA_PR_ARG, __entry->tid ) ); -- cgit v1.2.3-70-g09d2 From 724df615928b7050d33b6243f60b12bd87484fc7 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Wed, 26 May 2010 09:22:40 -0700 Subject: fix comment typo in netdevice.h Fix missing "of" in comment. Signed-off-by: Justin P. Mattock Signed-off-by: Jiri Kosina --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a1bff651816..c761c903772 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -775,7 +775,7 @@ struct net_device { /* * This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name - * the interface. + * of the interface. */ char name[IFNAMSIZ]; -- cgit v1.2.3-70-g09d2 From 38a6cc7538d3c44b76f9dcea607a171adcc0208e Mon Sep 17 00:00:00 2001 From: Sujith Date: Wed, 19 May 2010 11:32:30 +0530 Subject: mac80211: Remove deprecated sta_notify commands STA_NOTIFY_ADD and STA_NOTIFY_REMOVE have no users anymore, and station addition/removal are indicated to drivers using sta_add() and sta_remove(), which can sleep. Signed-off-by: Sujith Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/mac80211.h | 6 +----- net/mac80211/driver-ops.h | 6 ------ 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2e728611c57..e3c1d479400 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -879,16 +879,12 @@ struct ieee80211_sta { * enum sta_notify_cmd - sta notify command * * Used with the sta_notify() callback in &struct ieee80211_ops, this - * indicates addition and removal of a station to station table, - * or if a associated station made a power state transition. + * indicates if an associated station made a power state transition. * - * @STA_NOTIFY_ADD: (DEPRECATED) a station was added to the station table - * @STA_NOTIFY_REMOVE: (DEPRECATED) a station being removed from the station table * @STA_NOTIFY_SLEEP: a station is now sleeping * @STA_NOTIFY_AWAKE: a sleeping station woke up */ enum sta_notify_cmd { - STA_NOTIFY_ADD, STA_NOTIFY_REMOVE, STA_NOTIFY_SLEEP, STA_NOTIFY_AWAKE, }; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 978850ee3a5..d1139e4f88a 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -269,9 +269,6 @@ static inline int drv_sta_add(struct ieee80211_local *local, if (local->ops->sta_add) ret = local->ops->sta_add(&local->hw, &sdata->vif, sta); - else if (local->ops->sta_notify) - local->ops->sta_notify(&local->hw, &sdata->vif, - STA_NOTIFY_ADD, sta); trace_drv_sta_add(local, sdata, sta, ret); @@ -286,9 +283,6 @@ static inline void drv_sta_remove(struct ieee80211_local *local, if (local->ops->sta_remove) local->ops->sta_remove(&local->hw, &sdata->vif, sta); - else if (local->ops->sta_notify) - local->ops->sta_notify(&local->hw, &sdata->vif, - STA_NOTIFY_REMOVE, sta); trace_drv_sta_remove(local, sdata, sta); } -- cgit v1.2.3-70-g09d2 From 14f92952bf74a365ca7f9dfbec158e7c933ea723 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 3 Jun 2010 19:37:27 -0700 Subject: ssb: add dma_dev to ssb_device structure Add dma_dev, a pointer to struct device, to struct ssb_device. We pass it to the generic DMA API with SSB_BUSTYPE_PCI and SSB_BUSTYPE_SSB. ssb_devices_register() sets up it properly. This is preparation for replacing the ssb bus specific DMA API (ssb_dma_*) with the generic DMA API. Signed-off-by: FUJITA Tomonori Acked-by: Michael Buesch Cc: Gary Zambrano Cc: Stefano Brivio Cc: Larry Finger Cc: John W. Linville Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: John W. Linville --- drivers/ssb/main.c | 2 ++ include/linux/ssb/ssb.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c index 51275aac5b3..a732b396e60 100644 --- a/drivers/ssb/main.c +++ b/drivers/ssb/main.c @@ -486,6 +486,7 @@ static int ssb_devices_register(struct ssb_bus *bus) #ifdef CONFIG_SSB_PCIHOST sdev->irq = bus->host_pci->irq; dev->parent = &bus->host_pci->dev; + sdev->dma_dev = dev->parent; #endif break; case SSB_BUSTYPE_PCMCIA: @@ -501,6 +502,7 @@ static int ssb_devices_register(struct ssb_bus *bus) break; case SSB_BUSTYPE_SSB: dev->dma_mask = &dev->coherent_dma_mask; + sdev->dma_dev = dev; break; } diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index a2608bff9c7..0d5f04316b3 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -167,7 +167,7 @@ struct ssb_device { * is an optimization. */ const struct ssb_bus_ops *ops; - struct device *dev; + struct device *dev, *dma_dev; struct ssb_bus *bus; struct ssb_device_id id; -- cgit v1.2.3-70-g09d2 From 467429b475e56f154f93b3b14fd75f238d14597a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 3 Jun 2010 19:37:44 -0700 Subject: ssb: remove the ssb DMA API Now they are unnecessary. We can use the generic DMA API with any bus. Signed-off-by: FUJITA Tomonori Cc: Michael Buesch Cc: Gary Zambrano Cc: Stefano Brivio Cc: Larry Finger Cc: John W. Linville Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: John W. Linville --- drivers/ssb/main.c | 74 ----------------------- include/linux/ssb/ssb.h | 157 ------------------------------------------------ 2 files changed, 231 deletions(-) (limited to 'include') diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c index a732b396e60..7cee7f4eb60 100644 --- a/drivers/ssb/main.c +++ b/drivers/ssb/main.c @@ -1228,80 +1228,6 @@ u32 ssb_dma_translation(struct ssb_device *dev) } EXPORT_SYMBOL(ssb_dma_translation); -int ssb_dma_set_mask(struct ssb_device *dev, u64 mask) -{ -#ifdef CONFIG_SSB_PCIHOST - int err; -#endif - - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - err = pci_set_dma_mask(dev->bus->host_pci, mask); - if (err) - return err; - err = pci_set_consistent_dma_mask(dev->bus->host_pci, mask); - return err; -#endif - case SSB_BUSTYPE_SSB: - return dma_set_mask(dev->dev, mask); - default: - __ssb_dma_not_implemented(dev); - } - return -ENOSYS; -} -EXPORT_SYMBOL(ssb_dma_set_mask); - -void * ssb_dma_alloc_consistent(struct ssb_device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp_flags) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - if (gfp_flags & GFP_DMA) { - /* Workaround: The PCI API does not support passing - * a GFP flag. */ - return dma_alloc_coherent(&dev->bus->host_pci->dev, - size, dma_handle, gfp_flags); - } - return pci_alloc_consistent(dev->bus->host_pci, size, dma_handle); -#endif - case SSB_BUSTYPE_SSB: - return dma_alloc_coherent(dev->dev, size, dma_handle, gfp_flags); - default: - __ssb_dma_not_implemented(dev); - } - return NULL; -} -EXPORT_SYMBOL(ssb_dma_alloc_consistent); - -void ssb_dma_free_consistent(struct ssb_device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - gfp_t gfp_flags) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - if (gfp_flags & GFP_DMA) { - /* Workaround: The PCI API does not support passing - * a GFP flag. */ - dma_free_coherent(&dev->bus->host_pci->dev, - size, vaddr, dma_handle); - return; - } - pci_free_consistent(dev->bus->host_pci, size, - vaddr, dma_handle); - return; -#endif - case SSB_BUSTYPE_SSB: - dma_free_coherent(dev->dev, size, vaddr, dma_handle); - return; - default: - __ssb_dma_not_implemented(dev); - } -} -EXPORT_SYMBOL(ssb_dma_free_consistent); - int ssb_bus_may_powerdown(struct ssb_bus *bus) { struct ssb_chipcommon *cc; diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 0d5f04316b3..623b704fdc4 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -470,14 +470,6 @@ extern u32 ssb_dma_translation(struct ssb_device *dev); #define SSB_DMA_TRANSLATION_MASK 0xC0000000 #define SSB_DMA_TRANSLATION_SHIFT 30 -extern int ssb_dma_set_mask(struct ssb_device *dev, u64 mask); - -extern void * ssb_dma_alloc_consistent(struct ssb_device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp_flags); -extern void ssb_dma_free_consistent(struct ssb_device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - gfp_t gfp_flags); - static inline void __cold __ssb_dma_not_implemented(struct ssb_device *dev) { #ifdef CONFIG_SSB_DEBUG @@ -486,155 +478,6 @@ static inline void __cold __ssb_dma_not_implemented(struct ssb_device *dev) #endif /* DEBUG */ } -static inline int ssb_dma_mapping_error(struct ssb_device *dev, dma_addr_t addr) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - return pci_dma_mapping_error(dev->bus->host_pci, addr); -#endif - break; - case SSB_BUSTYPE_SSB: - return dma_mapping_error(dev->dev, addr); - default: - break; - } - __ssb_dma_not_implemented(dev); - return -ENOSYS; -} - -static inline dma_addr_t ssb_dma_map_single(struct ssb_device *dev, void *p, - size_t size, enum dma_data_direction dir) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - return pci_map_single(dev->bus->host_pci, p, size, dir); -#endif - break; - case SSB_BUSTYPE_SSB: - return dma_map_single(dev->dev, p, size, dir); - default: - break; - } - __ssb_dma_not_implemented(dev); - return 0; -} - -static inline void ssb_dma_unmap_single(struct ssb_device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - pci_unmap_single(dev->bus->host_pci, dma_addr, size, dir); - return; -#endif - break; - case SSB_BUSTYPE_SSB: - dma_unmap_single(dev->dev, dma_addr, size, dir); - return; - default: - break; - } - __ssb_dma_not_implemented(dev); -} - -static inline void ssb_dma_sync_single_for_cpu(struct ssb_device *dev, - dma_addr_t dma_addr, - size_t size, - enum dma_data_direction dir) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - pci_dma_sync_single_for_cpu(dev->bus->host_pci, dma_addr, - size, dir); - return; -#endif - break; - case SSB_BUSTYPE_SSB: - dma_sync_single_for_cpu(dev->dev, dma_addr, size, dir); - return; - default: - break; - } - __ssb_dma_not_implemented(dev); -} - -static inline void ssb_dma_sync_single_for_device(struct ssb_device *dev, - dma_addr_t dma_addr, - size_t size, - enum dma_data_direction dir) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - pci_dma_sync_single_for_device(dev->bus->host_pci, dma_addr, - size, dir); - return; -#endif - break; - case SSB_BUSTYPE_SSB: - dma_sync_single_for_device(dev->dev, dma_addr, size, dir); - return; - default: - break; - } - __ssb_dma_not_implemented(dev); -} - -static inline void ssb_dma_sync_single_range_for_cpu(struct ssb_device *dev, - dma_addr_t dma_addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - /* Just sync everything. That's all the PCI API can do. */ - pci_dma_sync_single_for_cpu(dev->bus->host_pci, dma_addr, - offset + size, dir); - return; -#endif - break; - case SSB_BUSTYPE_SSB: - dma_sync_single_range_for_cpu(dev->dev, dma_addr, offset, - size, dir); - return; - default: - break; - } - __ssb_dma_not_implemented(dev); -} - -static inline void ssb_dma_sync_single_range_for_device(struct ssb_device *dev, - dma_addr_t dma_addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - switch (dev->bus->bustype) { - case SSB_BUSTYPE_PCI: -#ifdef CONFIG_SSB_PCIHOST - /* Just sync everything. That's all the PCI API can do. */ - pci_dma_sync_single_for_device(dev->bus->host_pci, dma_addr, - offset + size, dir); - return; -#endif - break; - case SSB_BUSTYPE_SSB: - dma_sync_single_range_for_device(dev->dev, dma_addr, offset, - size, dir); - return; - default: - break; - } - __ssb_dma_not_implemented(dev); -} - - #ifdef CONFIG_SSB_PCIHOST /* PCI-host wrapper driver */ extern int ssb_pcihost_register(struct pci_driver *driver); -- cgit v1.2.3-70-g09d2 From 5360bd776f73d0a7da571d72a09a03f237e99900 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 28 May 2010 23:01:00 -0400 Subject: Fix up the "generic" unistd.h ABI to be more useful. Reserve 16 "architecture-specific" syscall numbers starting at 244. Allow use of the sys_sync_file_range2() API with the generic unistd.h by specifying __ARCH_WANT_SYNC_FILE_RANGE2 before including it. Allow using the generic unistd.h to create the "compat" syscall table by specifying __SYSCALL_COMPAT before including it. Use sys_fadvise64_64 for __NR3264_fadvise64 in both 32- and 64-bit mode. Request the appropriate __ARCH_WANT_COMPAT_SYS_xxx values when some deprecated syscall modes are selected. As part of this change to fix up the syscalls, also provide a couple of missing signal-related syscall prototypes in . Signed-off-by: Chris Metcalf Acked-by: Arnd Bergmann --- include/asm-generic/unistd.h | 26 ++++++++++++++++++++------ include/linux/syscalls.h | 4 ++++ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 6a0b30f78a6..30218b4fa4e 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -18,7 +18,7 @@ #define __SYSCALL(x, y) #endif -#if __BITS_PER_LONG == 32 +#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT) #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32) #else #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64) @@ -241,8 +241,13 @@ __SYSCALL(__NR_sync, sys_sync) __SYSCALL(__NR_fsync, sys_fsync) #define __NR_fdatasync 83 __SYSCALL(__NR_fdatasync, sys_fdatasync) +#ifdef __ARCH_WANT_SYNC_FILE_RANGE2 +#define __NR_sync_file_range2 84 +__SYSCALL(__NR_sync_file_range2, sys_sync_file_range2) +#else #define __NR_sync_file_range 84 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) /* .long sys_sync_file_range2, */ +__SYSCALL(__NR_sync_file_range, sys_sync_file_range) +#endif /* fs/timerfd.c */ #define __NR_timerfd_create 85 @@ -580,7 +585,7 @@ __SYSCALL(__NR_execve, sys_execve) /* .long sys_execve_wrapper */ __SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap) /* mm/fadvise.c */ #define __NR3264_fadvise64 223 -__SC_3264(__NR3264_fadvise64, sys_fadvise64_64, sys_fadvise64) +__SYSCALL(__NR3264_fadvise64, sys_fadvise64_64) /* mm/, CONFIG_MMU only */ #ifndef __ARCH_NOMMU @@ -627,8 +632,14 @@ __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +/* + * Architectures may provide up to 16 syscalls of their own + * starting with this value. + */ +#define __NR_arch_specific_syscall 244 + #undef __NR_syscalls -#define __NR_syscalls 244 +#define __NR_syscalls 260 /* * All syscalls below here should go away really, @@ -694,7 +705,8 @@ __SYSCALL(__NR_signalfd, sys_signalfd) #define __NR_syscalls (__NR_signalfd+1) #endif /* __ARCH_WANT_SYSCALL_NO_FLAGS */ -#if __BITS_PER_LONG == 32 && defined(__ARCH_WANT_SYSCALL_OFF_T) +#if (__BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)) && \ + defined(__ARCH_WANT_SYSCALL_OFF_T) #define __NR_sendfile 1046 __SYSCALL(__NR_sendfile, sys_sendfile) #define __NR_ftruncate 1047 @@ -740,6 +752,7 @@ __SYSCALL(__NR_getpgrp, sys_getpgrp) __SYSCALL(__NR_pause, sys_pause) #define __NR_time 1062 #define __ARCH_WANT_SYS_TIME +#define __ARCH_WANT_COMPAT_SYS_TIME __SYSCALL(__NR_time, sys_time) #define __NR_utime 1063 #define __ARCH_WANT_SYS_UTIME @@ -801,7 +814,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall) * Here we map the numbers so that both versions * use the same syscall table layout. */ -#if __BITS_PER_LONG == 64 +#if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT) #define __NR_fcntl __NR3264_fcntl #define __NR_statfs __NR3264_statfs #define __NR_fstatfs __NR3264_fstatfs @@ -848,6 +861,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall) #endif #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND +#define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND /* * "Conditional" syscalls diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a1a86a53bc7..4a19d9bb836 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -364,9 +364,13 @@ asmlinkage long sys_init_module(void __user *umod, unsigned long len, asmlinkage long sys_delete_module(const char __user *name_user, unsigned int flags); +asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, + struct sigaction __user *oact, + size_t sigsetsize); asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize); asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); +asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize); asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, const struct timespec __user *uts, -- cgit v1.2.3-70-g09d2 From b78462ebc6a4ef9074aa80abebcdd470dc5f0ce0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 Jun 2010 12:24:37 +0000 Subject: skbuff: add check for non-linear to warn_if_lro and needs_linearize We can avoid an unecessary cache miss by checking if the skb is non-linear before accessing gso_size/gso_type in skb_warn_if_lro, the same can also be done to avoid a cache miss on nr_frags if data_len is 0. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- net/core/dev.c | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf243fc5495..645e78d395f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2129,7 +2129,8 @@ static inline bool skb_warn_if_lro(const struct sk_buff *skb) /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ struct skb_shared_info *shinfo = skb_shinfo(skb); - if (shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { + if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 && + unlikely(shinfo->gso_type == 0)) { __skb_warn_lro_forwarding(skb); return true; } diff --git a/net/core/dev.c b/net/core/dev.c index ec01a5998d7..3abb3a6058b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2103,9 +2103,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { - return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || - illegal_highdma(dev, skb))); + return skb_is_nonlinear(skb) && + ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || + illegal_highdma(dev, skb)))); } /** -- cgit v1.2.3-70-g09d2 From bb1d912323d5dd50e1079e389f4e964be14f0ae3 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Wed, 2 Jun 2010 08:40:18 +0000 Subject: bonding: allow user-controlled output slave selection v2: changed bonding module version, modified to apply on top of changes from previous patch in series, and updated documentation to elaborate on multiqueue awareness that now exists in bonding driver. This patch give the user the ability to control the output slave for round-robin and active-backup bonding. Similar functionality was discussed in the past, but Jay Vosburgh indicated he would rather see a feature like this added to existing modes rather than creating a completely new mode. Jay's thoughts as well as Neil's input surrounding some of the issues with the first implementation pushed us toward a design that relied on the queue_mapping rather than skb marks. Round-robin and active-backup modes were chosen as the first users of this slave selection as they seemed like the most logical choices when considering a multi-switch environment. Round-robin mode works without any modification, but active-backup does require inclusion of the first patch in this series and setting the 'all_slaves_active' flag. This will allow reception of unicast traffic on any of the backup interfaces. This was tested with IPv4-based filters as well as VLAN-based filters with good results. More information as well as a configuration example is available in the patch to Documentation/networking/bonding.txt. Signed-off-by: Andy Gospodarek Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- Documentation/networking/bonding.txt | 84 ++++++++++++++++++++++++- drivers/net/bonding/bond_main.c | 75 +++++++++++++++++++++- drivers/net/bonding/bond_sysfs.c | 116 +++++++++++++++++++++++++++++++++++ drivers/net/bonding/bonding.h | 9 ++- include/linux/if_bonding.h | 1 + 5 files changed, 278 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 61f516b135b..d0914781830 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -49,6 +49,7 @@ Table of Contents 3.3 Configuring Bonding Manually with Ifenslave 3.3.1 Configuring Multiple Bonds Manually 3.4 Configuring Bonding Manually via Sysfs +3.5 Overriding Configuration for Special Cases 4. Querying Bonding Configuration 4.1 Bonding Configuration @@ -1318,8 +1319,87 @@ echo 2000 > /sys/class/net/bond1/bonding/arp_interval echo +eth2 > /sys/class/net/bond1/bonding/slaves echo +eth3 > /sys/class/net/bond1/bonding/slaves - -4. Querying Bonding Configuration +3.5 Overriding Configuration for Special Cases +---------------------------------------------- +When using the bonding driver, the physical port which transmits a frame is +typically selected by the bonding driver, and is not relevant to the user or +system administrator. The output port is simply selected using the policies of +the selected bonding mode. On occasion however, it is helpful to direct certain +classes of traffic to certain physical interfaces on output to implement +slightly more complex policies. For example, to reach a web server over a +bonded interface in which eth0 connects to a private network, while eth1 +connects via a public network, it may be desirous to bias the bond to send said +traffic over eth0 first, using eth1 only as a fall back, while all other traffic +can safely be sent over either interface. Such configurations may be achieved +using the traffic control utilities inherent in linux. + +By default the bonding driver is multiqueue aware and 16 queues are created +when the driver initializes (see Documentation/networking/multiqueue.txt +for details). If more or less queues are desired the module parameter +tx_queues can be used to change this value. There is no sysfs parameter +available as the allocation is done at module init time. + +The output of the file /proc/net/bonding/bondX has changed so the output Queue +ID is now printed for each slave: + +Bonding Mode: fault-tolerance (active-backup) +Primary Slave: None +Currently Active Slave: eth0 +MII Status: up +MII Polling Interval (ms): 0 +Up Delay (ms): 0 +Down Delay (ms): 0 + +Slave Interface: eth0 +MII Status: up +Link Failure Count: 0 +Permanent HW addr: 00:1a:a0:12:8f:cb +Slave queue ID: 0 + +Slave Interface: eth1 +MII Status: up +Link Failure Count: 0 +Permanent HW addr: 00:1a:a0:12:8f:cc +Slave queue ID: 2 + +The queue_id for a slave can be set using the command: + +# echo "eth1:2" > /sys/class/net/bond0/bonding/queue_id + +Any interface that needs a queue_id set should set it with multiple calls +like the one above until proper priorities are set for all interfaces. On +distributions that allow configuration via initscripts, multiple 'queue_id' +arguments can be added to BONDING_OPTS to set all needed slave queues. + +These queue id's can be used in conjunction with the tc utility to configure +a multiqueue qdisc and filters to bias certain traffic to transmit on certain +slave devices. For instance, say we wanted, in the above configuration to +force all traffic bound to 192.168.1.100 to use eth1 in the bond as its output +device. The following commands would accomplish this: + +# tc qdisc add dev bond0 handle 1 root multiq + +# tc filter add dev bond0 protocol ip parent 1: prio 1 u32 match ip dst \ + 192.168.1.100 action skbedit queue_mapping 2 + +These commands tell the kernel to attach a multiqueue queue discipline to the +bond0 interface and filter traffic enqueued to it, such that packets with a dst +ip of 192.168.1.100 have their output queue mapping value overwritten to 2. +This value is then passed into the driver, causing the normal output path +selection policy to be overridden, selecting instead qid 2, which maps to eth1. + +Note that qid values begin at 1. Qid 0 is reserved to initiate to the driver +that normal output policy selection should take place. One benefit to simply +leaving the qid for a slave to 0 is the multiqueue awareness in the bonding +driver that is now present. This awareness allows tc filters to be placed on +slave devices as well as bond devices and the bonding driver will simply act as +a pass-through for selecting output queues on the slave device rather than +output port selection. + +This feature first appeared in bonding driver version 3.7.0 and support for +output slave selection was limited to round-robin and active-backup modes. + +4 Querying Bonding Configuration ================================= 4.1 Bonding Configuration diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f22f6bf4385..1b19276cff1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -90,6 +90,7 @@ #define BOND_LINK_ARP_INTERV 0 static int max_bonds = BOND_DEFAULT_MAX_BONDS; +static int tx_queues = BOND_DEFAULT_TX_QUEUES; static int num_grat_arp = 1; static int num_unsol_na = 1; static int miimon = BOND_LINK_MON_INTERV; @@ -111,6 +112,8 @@ static struct bond_params bonding_defaults; module_param(max_bonds, int, 0); MODULE_PARM_DESC(max_bonds, "Max number of bonded devices"); +module_param(tx_queues, int, 0); +MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)"); module_param(num_grat_arp, int, 0644); MODULE_PARM_DESC(num_grat_arp, "Number of gratuitous ARP packets to send on failover event"); module_param(num_unsol_na, int, 0644); @@ -1540,6 +1543,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) goto err_undo_flags; } + /* + * Set the new_slave's queue_id to be zero. Queue ID mapping + * is set via sysfs or module option if desired. + */ + new_slave->queue_id = 0; + /* Save slave's original mtu and then set it to match the bond */ new_slave->original_mtu = slave_dev->mtu; res = dev_set_mtu(slave_dev, bond->dev->mtu); @@ -3285,6 +3294,7 @@ static void bond_info_show_slave(struct seq_file *seq, else seq_puts(seq, "Aggregator ID: N/A\n"); } + seq_printf(seq, "Slave queue ID: %d\n", slave->queue_id); } static int bond_info_seq_show(struct seq_file *seq, void *v) @@ -4421,9 +4431,59 @@ static void bond_set_xmit_hash_policy(struct bonding *bond) } } +/* + * Lookup the slave that corresponds to a qid + */ +static inline int bond_slave_override(struct bonding *bond, + struct sk_buff *skb) +{ + int i, res = 1; + struct slave *slave = NULL; + struct slave *check_slave; + + read_lock(&bond->lock); + + if (!BOND_IS_OK(bond) || !skb->queue_mapping) + goto out; + + /* Find out if any slaves have the same mapping as this skb. */ + bond_for_each_slave(bond, check_slave, i) { + if (check_slave->queue_id == skb->queue_mapping) { + slave = check_slave; + break; + } + } + + /* If the slave isn't UP, use default transmit policy. */ + if (slave && slave->queue_id && IS_UP(slave->dev) && + (slave->link == BOND_LINK_UP)) { + res = bond_dev_queue_xmit(bond, skb, slave->dev); + } + +out: + read_unlock(&bond->lock); + return res; +} + +static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb) +{ + /* + * This helper function exists to help dev_pick_tx get the correct + * destination queue. Using a helper function skips the a call to + * skb_tx_hash and will put the skbs in the queue we expect on their + * way down to the bonding driver. + */ + return skb->queue_mapping; +} + static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { - const struct bonding *bond = netdev_priv(dev); + struct bonding *bond = netdev_priv(dev); + + if (TX_QUEUE_OVERRIDE(bond->params.mode)) { + if (!bond_slave_override(bond, skb)) + return NETDEV_TX_OK; + } switch (bond->params.mode) { case BOND_MODE_ROUNDROBIN: @@ -4508,6 +4568,7 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_open = bond_open, .ndo_stop = bond_close, .ndo_start_xmit = bond_start_xmit, + .ndo_select_queue = bond_select_queue, .ndo_get_stats = bond_get_stats, .ndo_do_ioctl = bond_do_ioctl, .ndo_set_multicast_list = bond_set_multicast_list, @@ -4776,6 +4837,13 @@ static int bond_check_params(struct bond_params *params) } } + if (tx_queues < 1 || tx_queues > 255) { + pr_warning("Warning: tx_queues (%d) should be between " + "1 and 255, resetting to %d\n", + tx_queues, BOND_DEFAULT_TX_QUEUES); + tx_queues = BOND_DEFAULT_TX_QUEUES; + } + if ((all_slaves_active != 0) && (all_slaves_active != 1)) { pr_warning("Warning: all_slaves_active module parameter (%d), " "not of valid value (0/1), so it was set to " @@ -4953,6 +5021,7 @@ static int bond_check_params(struct bond_params *params) params->primary[0] = 0; params->primary_reselect = primary_reselect_value; params->fail_over_mac = fail_over_mac_value; + params->tx_queues = tx_queues; params->all_slaves_active = all_slaves_active; if (primary) { @@ -5040,8 +5109,8 @@ int bond_create(struct net *net, const char *name) rtnl_lock(); - bond_dev = alloc_netdev(sizeof(struct bonding), name ? name : "", - bond_setup); + bond_dev = alloc_netdev_mq(sizeof(struct bonding), name ? name : "", + bond_setup, tx_queues); if (!bond_dev) { pr_err("%s: eek! can't alloc netdev!\n", name); rtnl_unlock(); diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 066311a5e08..f9a034361a8 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -1411,6 +1411,121 @@ static ssize_t bonding_show_ad_partner_mac(struct device *d, } static DEVICE_ATTR(ad_partner_mac, S_IRUGO, bonding_show_ad_partner_mac, NULL); +/* + * Show the queue_ids of the slaves in the current bond. + */ +static ssize_t bonding_show_queue_id(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct slave *slave; + int i, res = 0; + struct bonding *bond = to_bond(d); + + if (!rtnl_trylock()) + return restart_syscall(); + + read_lock(&bond->lock); + bond_for_each_slave(bond, slave, i) { + if (res > (PAGE_SIZE - 6)) { + /* not enough space for another interface name */ + if ((PAGE_SIZE - res) > 10) + res = PAGE_SIZE - 10; + res += sprintf(buf + res, "++more++ "); + break; + } + res += sprintf(buf + res, "%s:%d ", + slave->dev->name, slave->queue_id); + } + read_unlock(&bond->lock); + if (res) + buf[res-1] = '\n'; /* eat the leftover space */ + rtnl_unlock(); + return res; +} + +/* + * Set the queue_ids of the slaves in the current bond. The bond + * interface must be enslaved for this to work. + */ +static ssize_t bonding_store_queue_id(struct device *d, + struct device_attribute *attr, + const char *buffer, size_t count) +{ + struct slave *slave, *update_slave; + struct bonding *bond = to_bond(d); + u16 qid; + int i, ret = count; + char *delim; + struct net_device *sdev = NULL; + + if (!rtnl_trylock()) + return restart_syscall(); + + /* delim will point to queue id if successful */ + delim = strchr(buffer, ':'); + if (!delim) + goto err_no_cmd; + + /* + * Terminate string that points to device name and bump it + * up one, so we can read the queue id there. + */ + *delim = '\0'; + if (sscanf(++delim, "%hd\n", &qid) != 1) + goto err_no_cmd; + + /* Check buffer length, valid ifname and queue id */ + if (strlen(buffer) > IFNAMSIZ || + !dev_valid_name(buffer) || + qid > bond->params.tx_queues) + goto err_no_cmd; + + /* Get the pointer to that interface if it exists */ + sdev = __dev_get_by_name(dev_net(bond->dev), buffer); + if (!sdev) + goto err_no_cmd; + + read_lock(&bond->lock); + + /* Search for thes slave and check for duplicate qids */ + update_slave = NULL; + bond_for_each_slave(bond, slave, i) { + if (sdev == slave->dev) + /* + * We don't need to check the matching + * slave for dups, since we're overwriting it + */ + update_slave = slave; + else if (qid && qid == slave->queue_id) { + goto err_no_cmd_unlock; + } + } + + if (!update_slave) + goto err_no_cmd_unlock; + + /* Actually set the qids for the slave */ + update_slave->queue_id = qid; + + read_unlock(&bond->lock); +out: + rtnl_unlock(); + return ret; + +err_no_cmd_unlock: + read_unlock(&bond->lock); +err_no_cmd: + pr_info("invalid input for queue_id set for %s.\n", + bond->dev->name); + ret = -EPERM; + goto out; +} + +static DEVICE_ATTR(queue_id, S_IRUGO | S_IWUSR, bonding_show_queue_id, + bonding_store_queue_id); + + /* * Show and set the all_slaves_active flag. */ @@ -1489,6 +1604,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_ad_actor_key.attr, &dev_attr_ad_partner_key.attr, &dev_attr_ad_partner_mac.attr, + &dev_attr_queue_id.attr, &dev_attr_all_slaves_active.attr, NULL, }; diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index cecdea2a629..c6fdd851579 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -23,8 +23,8 @@ #include "bond_3ad.h" #include "bond_alb.h" -#define DRV_VERSION "3.6.0" -#define DRV_RELDATE "September 26, 2009" +#define DRV_VERSION "3.7.0" +#define DRV_RELDATE "June 2, 2010" #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" @@ -60,6 +60,9 @@ ((mode) == BOND_MODE_TLB) || \ ((mode) == BOND_MODE_ALB)) +#define TX_QUEUE_OVERRIDE(mode) \ + (((mode) == BOND_MODE_ACTIVEBACKUP) || \ + ((mode) == BOND_MODE_ROUNDROBIN)) /* * Less bad way to call ioctl from within the kernel; this needs to be * done some other way to get the call out of interrupt context. @@ -131,6 +134,7 @@ struct bond_params { char primary[IFNAMSIZ]; int primary_reselect; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; + int tx_queues; int all_slaves_active; }; @@ -165,6 +169,7 @@ struct slave { u8 perm_hwaddr[ETH_ALEN]; u16 speed; u8 duplex; + u16 queue_id; struct ad_slave_info ad_info; /* HUGE - better to dynamically alloc */ struct tlb_slave_info tlb_info; }; diff --git a/include/linux/if_bonding.h b/include/linux/if_bonding.h index cd525fae3c9..2c7994372bd 100644 --- a/include/linux/if_bonding.h +++ b/include/linux/if_bonding.h @@ -83,6 +83,7 @@ #define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ +#define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ /* hashing types */ #define BOND_XMIT_POLICY_LAYER2 0 /* layer 2 (MAC only), default */ #define BOND_XMIT_POLICY_LAYER34 1 /* layer 3+4 (IP ^ (TCP || UDP)) */ -- cgit v1.2.3-70-g09d2 From a8b690f98baf9fb1902b8eeab801351ea603fa3a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Jun 2010 00:43:42 -0700 Subject: tcp: Fix slowness in read /proc/net/tcp This patch address a serious performance issue in reading the TCP sockets table (/proc/net/tcp). Reading the full table is done by a number of sequential read operations. At each read operation, a seek is done to find the last socket that was previously read. This seek operation requires that the sockets in the table need to be counted up to the current file position, and to count each of these requires taking a lock for each non-empty bucket. The whole algorithm is O(n^2). The fix is to cache the last bucket value, offset within the bucket, and the file position returned by the last read operation. On the next sequential read, the bucket and offset are used to find the last read socket immediately without needing ot scan the previous buckets the table. This algorithm t read the whole table is O(n). The improvement offered by this patch is easily show by performing cat'ing /proc/net/tcp on a machine with a lot of connections. With about 182K connections in the table, I see the following: - Without patch time cat /proc/net/tcp > /dev/null real 1m56.729s user 0m0.214s sys 1m56.344s - With patch time cat /proc/net/tcp > /dev/null real 0m0.894s user 0m0.290s sys 0m0.594s Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +- net/ipv4/tcp_ipv4.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 86 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a1449144848..57316648441 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1413,7 +1413,8 @@ struct tcp_iter_state { sa_family_t family; enum tcp_seq_states state; struct sock *syn_wait_sk; - int bucket, sbucket, num, uid; + int bucket, offset, sbucket, num, uid; + loff_t last_pos; }; extern int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index acdc4c98985..7f976af27bf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1980,6 +1980,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; } +/* + * Get next listener socket follow cur. If cur is NULL, get first socket + * starting from bucket given in st->bucket; when st->bucket is zero the + * very first socket in the hash table is returned. + */ static void *listening_get_next(struct seq_file *seq, void *cur) { struct inet_connection_sock *icsk; @@ -1990,14 +1995,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct net *net = seq_file_net(seq); if (!sk) { - st->bucket = 0; - ilb = &tcp_hashinfo.listening_hash[0]; + ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); sk = sk_nulls_head(&ilb->head); + st->offset = 0; goto get_sk; } ilb = &tcp_hashinfo.listening_hash[st->bucket]; ++st->num; + ++st->offset; if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; @@ -2012,6 +2018,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) } req = req->dl_next; } + st->offset = 0; if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) break; get_req: @@ -2047,6 +2054,7 @@ start_req: read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); + st->offset = 0; if (++st->bucket < INET_LHTABLE_SIZE) { ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); @@ -2060,7 +2068,12 @@ out: static void *listening_get_idx(struct seq_file *seq, loff_t *pos) { - void *rc = listening_get_next(seq, NULL); + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + st->offset = 0; + rc = listening_get_next(seq, NULL); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2075,13 +2088,18 @@ static inline int empty_bucket(struct tcp_iter_state *st) hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); } +/* + * Get first established socket starting from bucket given in st->bucket. + * If st->bucket is zero, the very first socket in the hash is returned. + */ static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; - for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; @@ -2126,6 +2144,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct net *net = seq_file_net(seq); ++st->num; + ++st->offset; if (st->state == TCP_SEQ_STATE_TIME_WAIT) { tw = cur; @@ -2142,6 +2161,7 @@ get_tw: st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ + st->offset = 0; while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; @@ -2169,7 +2189,11 @@ out: static void *established_get_idx(struct seq_file *seq, loff_t pos) { - void *rc = established_get_first(seq); + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + rc = established_get_first(seq); while (rc && pos) { rc = established_get_next(seq, rc); @@ -2194,24 +2218,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) return rc; } +static void *tcp_seek_last_pos(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + int offset = st->offset; + int orig_num = st->num; + void *rc = NULL; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + if (st->bucket >= INET_LHTABLE_SIZE) + break; + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_next(seq, NULL); + while (offset-- && rc) + rc = listening_get_next(seq, rc); + if (rc) + break; + st->bucket = 0; + /* Fallthrough */ + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + st->state = TCP_SEQ_STATE_ESTABLISHED; + if (st->bucket > tcp_hashinfo.ehash_mask) + break; + rc = established_get_first(seq); + while (offset-- && rc) + rc = established_get_next(seq, rc); + } + + st->num = orig_num; + + return rc; +} + static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; + void *rc; + + if (*pos && *pos == st->last_pos) { + rc = tcp_seek_last_pos(seq); + if (rc) + goto out; + } + st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; - return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + st->bucket = 0; + st->offset = 0; + rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + st->last_pos = *pos; + return rc; } static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct tcp_iter_state *st = seq->private; void *rc = NULL; - struct tcp_iter_state *st; if (v == SEQ_START_TOKEN) { rc = tcp_get_idx(seq, 0); goto out; } - st = seq->private; switch (st->state) { case TCP_SEQ_STATE_OPENREQ: @@ -2219,6 +2291,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) rc = listening_get_next(seq, v); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + st->offset = 0; rc = established_get_first(seq); } break; @@ -2229,6 +2303,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } out: ++*pos; + st->last_pos = *pos; return rc; } @@ -2267,6 +2342,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file) s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; + s->last_pos = 0; return 0; } -- cgit v1.2.3-70-g09d2 From 139ef32b0e6b88b00b5e3e74d052d938f178dc9b Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 7 Jun 2010 08:48:13 -0400 Subject: Revert adding some arch-specific signal syscalls to . It turns out there is some variance on the calling conventions for these syscalls, and is already the mechanism used to handle this. Switch arch/tile over to using that mechanism and tweak the calling conventions for a couple of tile syscalls to match . Acked-by: Arnd Bergmann Signed-off-by: Chris Metcalf --- arch/tile/include/asm/syscalls.h | 22 +--------------------- arch/tile/kernel/process.c | 2 +- arch/tile/kernel/signal.c | 4 ++-- arch/tile/kernel/sys.c | 2 +- include/linux/syscalls.h | 4 ---- 5 files changed, 5 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h index e1be54d1a7d..9f2b8e2f69d 100644 --- a/arch/tile/include/asm/syscalls.h +++ b/arch/tile/include/asm/syscalls.h @@ -22,21 +22,7 @@ #include #include #include - -/* kernel/process.c */ -int sys_fork(struct pt_regs *); -int sys_vfork(struct pt_regs *); -int sys_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tidptr, int __user *child_tidptr, - struct pt_regs *); -int sys_execve(char __user *path, char __user *__user *argv, - char __user *__user *envp, struct pt_regs *); - -/* kernel/signal.c */ -int sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); -int sys_raise_fpe(int code, unsigned long addr, struct pt_regs*); +#include /* kernel/sys.c */ ssize_t sys32_readahead(int fd, u32 offset_lo, u32 offset_hi, u32 count); @@ -45,12 +31,6 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi, int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi, u32 len_lo, u32 len_hi, int advice); long sys_flush_cache(void); -long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset); -long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset); #ifndef __tilegx__ /* mm/fault.c */ diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 824f230e6d1..c70ff14a48e 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -502,7 +502,7 @@ int _sys_fork(struct pt_regs *regs) } int _sys_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tidptr, int __user *child_tidptr, + void __user *parent_tidptr, void __user *child_tidptr, struct pt_regs *regs) { if (!newsp) diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 7ea85eb8524..45835cfad40 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -43,8 +43,8 @@ /* Caller before callee in this file; other callee is in assembler */ void do_signal(struct pt_regs *regs); -int _sys_sigaltstack(const stack_t __user *uss, - stack_t __user *uoss, struct pt_regs *regs) +long _sys_sigaltstack(const stack_t __user *uss, + stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c index a3d982b212b..0427978cea0 100644 --- a/arch/tile/kernel/sys.c +++ b/arch/tile/kernel/sys.c @@ -95,7 +95,7 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, */ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, offset) + unsigned long, fd, off_t, offset) { if (offset & ((1 << PAGE_SHIFT) - 1)) return -EINVAL; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1e3cd5fec7e..7f614ce274a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -364,13 +364,9 @@ asmlinkage long sys_init_module(void __user *umod, unsigned long len, asmlinkage long sys_delete_module(const char __user *name_user, unsigned int flags); -asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize); asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize); asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, const struct timespec __user *uts, -- cgit v1.2.3-70-g09d2 From abbceff7d7a884968e876e52578da1db4a4f6b54 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 May 2010 15:15:12 -0400 Subject: swiotlb: add the swiotlb initialization function with iotlb memory This enables the caller to initialize swiotlb with its own iotlb memory. See "swiotlb: swiotlb: add swiotlb_tbl_map_single library function" for full description of patchset. [v2: changed ..with_tlb to ..with_tbl] Signed-off-by: FUJITA Tomonori Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Albert Herranz --- include/linux/swiotlb.h | 1 + lib/swiotlb.c | 48 ++++++++++++++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 81a4e213c6c..b406261d888 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -23,6 +23,7 @@ extern int swiotlb_force; #define IO_TLB_SHIFT 11 extern void swiotlb_init(int verbose); +extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 783aff00024..ec61e1507d0 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -140,28 +140,14 @@ void swiotlb_print_info(void) (unsigned long long)pend); } -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init_with_default_size(size_t default_size, int verbose) +void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } + bytes = nslabs << IO_TLB_SHIFT; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - /* - * Get IO TLB memory from the low pages - */ - io_tlb_start = alloc_bootmem_low_pages(bytes); - if (!io_tlb_start) - panic("Cannot allocate SWIOTLB buffer"); + io_tlb_nslabs = nslabs; + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; /* @@ -185,6 +171,32 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) swiotlb_print_info(); } +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init +swiotlb_init_with_default_size(size_t default_size, int verbose) +{ + unsigned long bytes; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from the low pages + */ + io_tlb_start = alloc_bootmem_low_pages(bytes); + if (!io_tlb_start) + panic("Cannot allocate SWIOTLB buffer"); + + swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose); +} + void __init swiotlb_init(int verbose) { -- cgit v1.2.3-70-g09d2 From 22d48269984fc93a71f65a54aa422aacf5fdb926 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 10 May 2010 15:01:15 -0500 Subject: swiotlb: search and replace "int dir" with "enum dma_data_direction dir" .. to catch anybody doing something funky. See "swiotlb: swiotlb: add swiotlb_tbl_map_single library function" for full description of patchset. [v2: swiotlb_sync_single_range_* no more - removed usage] [v3: enum dma_data_direction direction -> enum dma_data_direction dir] Signed-off-by: Konrad Rzeszutek Wilk Acked-by: FUJITA Tomonori Tested-by: Albert Herranz --- include/linux/swiotlb.h | 4 ++-- lib/swiotlb.c | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b406261d888..250d766f17f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -43,11 +43,11 @@ extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, - int direction); + enum dma_data_direction dir); extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, - int direction); + enum dma_data_direction dir); extern int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 1fc15bf6394..5f60157f31d 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -374,7 +374,8 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, } void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, - phys_addr_t phys, size_t size, int dir) + phys_addr_t phys, size_t size, + enum dma_data_direction dir) { unsigned long flags; char *dma_addr; @@ -481,7 +482,8 @@ found: */ static void * -map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir) { dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); @@ -493,7 +495,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) */ static void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, - int dir) + enum dma_data_direction dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -534,7 +536,7 @@ swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, static void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, - int dir, int target) + enum dma_data_direction dir, int target) { int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t phys = io_tlb_orig_addr[index]; @@ -624,7 +626,8 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, EXPORT_SYMBOL(swiotlb_free_coherent); static void -swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, + int do_panic) { /* * Ran out of IOMMU space for this operation. This is very bad. @@ -702,7 +705,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * whatever the device wrote there. */ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir) + size_t size, enum dma_data_direction dir) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); @@ -745,7 +748,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); */ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir, int target) + size_t size, enum dma_data_direction dir, int target) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); @@ -832,7 +835,7 @@ EXPORT_SYMBOL(swiotlb_map_sg_attrs); int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir) + enum dma_data_direction dir) { return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); } @@ -859,7 +862,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir) + enum dma_data_direction dir) { return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); } @@ -874,7 +877,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg); */ static void swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, int dir, int target) + int nelems, enum dma_data_direction dir, int target) { struct scatterlist *sg; int i; -- cgit v1.2.3-70-g09d2 From d7ef1533a90f432615d25729c2477bac9e72051d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 28 May 2010 11:37:10 -0400 Subject: swiotlb: Make swiotlb bookkeeping functions visible in the header file. We put the functions dealing with the operations on the SWIOTLB buffer in the header and make those functions non-static. And also make the functions exported via EXPORT_SYMBOL_GPL. See "swiotlb: swiotlb: add swiotlb_tbl_map_single library function" for full description of patchset. [v2: swiotlb_sync_single_range_for_* no more. Remove usage.] Signed-off-by: Konrad Rzeszutek Wilk Acked-by: FUJITA Tomonori Tested-by: Albert Herranz --- include/linux/swiotlb.h | 22 ++++++++++++++++++++++ lib/swiotlb.c | 29 ++++++++++++++--------------- 2 files changed, 36 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 250d766f17f..8c0e349f4a6 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -25,6 +25,28 @@ extern int swiotlb_force; extern void swiotlb_init(int verbose); extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); +/* + * Enumeration for sync targets + */ +enum dma_sync_target { + SYNC_FOR_CPU = 0, + SYNC_FOR_DEVICE = 1, +}; +extern void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir); + +extern void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, + size_t size, enum dma_data_direction dir); + +extern void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target); + +/* Accessory functions. */ +extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 5f60157f31d..34e3082632d 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -50,14 +50,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - int swiotlb_force; /* @@ -335,8 +327,8 @@ static int is_swiotlb_buffer(phys_addr_t paddr) /* * Bounce: copy the swiotlb buffer back to the original dma location */ -static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, - enum dma_data_direction dir) +void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long pfn = PFN_DOWN(phys); @@ -372,6 +364,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, memcpy(phys_to_virt(phys), dma_addr, size); } } +EXPORT_SYMBOL_GPL(swiotlb_bounce); void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, phys_addr_t phys, size_t size, @@ -476,6 +469,7 @@ found: return dma_addr; } +EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); /* * Allocates bounce buffer and returns its kernel virtual address. @@ -493,7 +487,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ -static void +void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, enum dma_data_direction dir) { @@ -533,10 +527,12 @@ swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, } spin_unlock_irqrestore(&io_tlb_lock, flags); } +EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single); -static void +void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, - enum dma_data_direction dir, int target) + enum dma_data_direction dir, + enum dma_sync_target target) { int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t phys = io_tlb_orig_addr[index]; @@ -560,6 +556,7 @@ swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, BUG(); } } +EXPORT_SYMBOL_GPL(swiotlb_tbl_sync_single); void * swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -748,7 +745,8 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); */ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, int target) + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); @@ -877,7 +875,8 @@ EXPORT_SYMBOL(swiotlb_unmap_sg); */ static void swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, int target) + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) { struct scatterlist *sg; int i; -- cgit v1.2.3-70-g09d2 From 66018506e15bea62de4eefc3298f170b4bfcf5ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Jun 2010 03:12:08 +0000 Subject: ip: Router Alert RCU conversion Straightforward conversion to RCU. One rwlock becomes a spinlock, and is static. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/ip_input.c | 11 +++-------- net/ipv4/ip_sockglue.c | 23 ++++++++++++++--------- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 452f229c380..9982c97f0bd 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -62,10 +62,10 @@ struct ip_ra_chain { struct ip_ra_chain *next; struct sock *sk; void (*destructor)(struct sock *); + struct rcu_head rcu; }; extern struct ip_ra_chain *ip_ra_chain; -extern rwlock_t ip_ra_lock; /* IP flags. */ #define IP_CE 0x8000 /* Flag: "Congestion" */ diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d52c9da644c..d274078b166 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,7 +146,7 @@ #include /* - * Process Router Attention IP option + * Process Router Attention IP option (RFC 2113) */ int ip_call_ra_chain(struct sk_buff *skb) { @@ -155,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb) struct sock *last = NULL; struct net_device *dev = skb->dev; - read_lock(&ip_ra_lock); - for (ra = ip_ra_chain; ra; ra = ra->next) { + for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb) sk->sk_bound_dev_if == dev->ifindex) && net_eq(sock_net(sk), dev_net(dev))) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { - read_unlock(&ip_ra_lock); + if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) return 1; - } } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -183,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb) if (last) { raw_rcv(last, skb); - read_unlock(&ip_ra_lock); return 1; } - read_unlock(&ip_ra_lock); return 0; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ce231780a2b..08b9519a24f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -239,7 +239,12 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) sent to multicast group to reach destination designated router. */ struct ip_ra_chain *ip_ra_chain; -DEFINE_RWLOCK(ip_ra_lock); +static DEFINE_SPINLOCK(ip_ra_lock); + +static void ip_ra_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_ra_chain, rcu)); +} int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) @@ -251,35 +256,35 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - write_lock_bh(&ip_ra_lock); + spin_lock_bh(&ip_ra_lock); for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } - *rap = ra->next; - write_unlock_bh(&ip_ra_lock); + rcu_assign_pointer(*rap, ra->next); + spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); sock_put(sk); - kfree(ra); + call_rcu(&ra->rcu, ip_ra_free_rcu); return 0; } } if (new_ra == NULL) { - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->destructor = destructor; new_ra->next = ra; - *rap = new_ra; + rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); return 0; } -- cgit v1.2.3-70-g09d2 From bb69ae049fcc986fcd742eb90ca0d44a7a49c9f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Jun 2010 11:42:13 +0000 Subject: anycast: Some RCU conversions - dev_get_by_flags() changed to dev_get_by_flags_rcu() - ipv6_sock_ac_join() dont touch dev & idev refcounts - ipv6_sock_ac_drop() dont touch dev & idev refcounts - ipv6_sock_ac_close() dont touch dev & idev refcounts - ipv6_dev_ac_dec() dount touch idev refcount - ipv6_chk_acast_addr() dont touch idev refcount Signed-off-by: Eric Dumazet CC: Hideaki YOSHIFUJI Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +-- net/core/dev.c | 14 +++----- net/ipv6/anycast.c | 90 ++++++++++++++++++++++------------------------- 3 files changed, 49 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5156b806924..c319f28d699 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1271,8 +1271,8 @@ extern void dev_add_pack(struct packet_type *pt); extern void dev_remove_pack(struct packet_type *pt); extern void __dev_remove_pack(struct packet_type *pt); -extern struct net_device *dev_get_by_flags(struct net *net, unsigned short flags, - unsigned short mask); +extern struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, + unsigned short mask); extern struct net_device *dev_get_by_name(struct net *net, const char *name); extern struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); extern struct net_device *__dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index c8d127718ff..6f330cee79a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -803,35 +803,31 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags - find any device with given flags + * dev_get_by_flags_rcu - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * is not found or a pointer to the device. Must be called inside + * rcu_read_lock(), and result refcount is unchanged. */ -struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, +struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ret = NULL; - rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { - dev_hold(dev); ret = dev; break; } } - rcu_read_unlock(); return ret; } -EXPORT_SYMBOL(dev_get_by_flags); +EXPORT_SYMBOL(dev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index b5b07054508..f058fbd808c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -77,41 +77,40 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) pac->acl_next = NULL; ipv6_addr_copy(&pac->acl_addr, addr); + rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { dev = rt->rt6i_dev; - dev_hold(dev); dst_release(&rt->u.dst); } else if (ishost) { err = -EADDRNOTAVAIL; - goto out_free_pac; + goto error; } else { /* router, no matching interface: just pick one */ - - dev = dev_get_by_flags(net, IFF_UP, IFF_UP|IFF_LOOPBACK); + dev = dev_get_by_flags_rcu(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index(net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (dev == NULL) { err = -ENODEV; - goto out_free_pac; + goto error; } - idev = in6_dev_get(dev); + idev = __in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; else err = -EADDRNOTAVAIL; - goto out_dev_put; + goto error; } /* reset ishost, now that we have a specific device */ ishost = !idev->cnf.forwarding; - in6_dev_put(idev); pac->acl_ifindex = dev->ifindex; @@ -124,26 +123,22 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (ishost) err = -EADDRNOTAVAIL; if (err) - goto out_dev_put; + goto error; } err = ipv6_dev_ac_inc(dev, addr); - if (err) - goto out_dev_put; - - write_lock_bh(&ipv6_sk_ac_lock); - pac->acl_next = np->ipv6_ac_list; - np->ipv6_ac_list = pac; - write_unlock_bh(&ipv6_sk_ac_lock); - - dev_put(dev); - - return 0; + if (!err) { + write_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + write_unlock_bh(&ipv6_sk_ac_lock); + pac = NULL; + } -out_dev_put: - dev_put(dev); -out_free_pac: - sock_kfree_s(sk, pac, sizeof(*pac)); +error: + rcu_read_unlock(); + if (pac) + sock_kfree_s(sk, pac, sizeof(*pac)); return err; } @@ -176,11 +171,12 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) write_unlock_bh(&ipv6_sk_ac_lock); - dev = dev_get_by_index(net, pac->acl_ifindex); - if (dev) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - dev_put(dev); - } + rcu_read_unlock(); + sock_kfree_s(sk, pac, sizeof(*pac)); return 0; } @@ -199,13 +195,12 @@ void ipv6_sock_ac_close(struct sock *sk) write_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; + rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - if (dev) - dev_put(dev); - dev = dev_get_by_index(net, pac->acl_ifindex); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -213,8 +208,7 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - if (dev) - dev_put(dev); + rcu_read_unlock(); } #if 0 @@ -363,33 +357,32 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) return 0; } +/* called with rcu_read_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) { - int ret; - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get(dev); + if (idev == NULL) return -ENODEV; - ret = __ipv6_dev_ac_dec(idev, addr); - in6_dev_put(idev); - return ret; + return __ipv6_dev_ac_dec(idev, addr); } /* * check if the interface has this anycast address + * called with rcu_read_lock() */ static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr) { struct inet6_dev *idev; struct ifacaddr6 *aca; - idev = in6_dev_get(dev); + idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); for (aca = idev->ac_list; aca; aca = aca->aca_next) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; read_unlock_bh(&idev->lock); - in6_dev_put(idev); return aca != NULL; } return 0; @@ -403,14 +396,15 @@ int ipv6_chk_acast_addr(struct net *net, struct net_device *dev, { int found = 0; - if (dev) - return ipv6_chk_acast_dev(dev, addr); rcu_read_lock(); - for_each_netdev_rcu(net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { - found = 1; - break; - } + if (dev) + found = ipv6_chk_acast_dev(dev, addr); + else + for_each_netdev_rcu(net, dev) + if (ipv6_chk_acast_dev(dev, addr)) { + found = 1; + break; + } rcu_read_unlock(); return found; } -- cgit v1.2.3-70-g09d2 From 339bb99e4a8ba1f8960eed21d50be808b35ad22a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Jun 2010 14:11:19 +0200 Subject: netfilter: xt_rateest: Better struct xt_rateest layout We currently dirty two cache lines in struct xt_rateest, this hurts SMP performance. This patch moves lock/bstats/rstats at beginning of structure so that they share a single cache line. Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/net/netfilter/xt_rateest.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index ddbf37e1961..b1d780e21ce 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -2,13 +2,17 @@ #define _XT_RATEEST_H struct xt_rateest { + /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ + struct gnet_stats_basic_packed bstats; + spinlock_t lock; + /* keep rstats and lock on same cache line to speedup xt_rateest_mt() */ + struct gnet_stats_rate_est rstats; + + /* following fields not accessed in hot path */ struct hlist_node list; char name[IFNAMSIZ]; unsigned int refcnt; - spinlock_t lock; struct gnet_estimator params; - struct gnet_stats_rate_est rstats; - struct gnet_stats_basic_packed bstats; }; extern struct xt_rateest *xt_rateest_lookup(const char *name); -- cgit v1.2.3-70-g09d2 From abe37c4b84502d6931e04e94c9c2c45b4da8c889 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Jun 2010 11:12:27 +0200 Subject: wireless: fix kernel-doc Fix a whole bunch of kernel-doc warnings and errors that crop up when running it on mac80211 and cfg80211; the latter isn't normally done so lots of bit-rot happened. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/cfg80211.h | 117 +++++++++++++++++++++++++++++++++++++------------ include/net/mac80211.h | 4 +- 2 files changed, 90 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0c3c214772e..22ab9d88cf4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -37,6 +37,7 @@ * * @IEEE80211_BAND_2GHZ: 2.4GHz ISM band * @IEEE80211_BAND_5GHZ: around 5GHz band (4.9-5.7) + * @IEEE80211_NUM_BANDS: number of defined bands */ enum ieee80211_band { IEEE80211_BAND_2GHZ = NL80211_BAND_2GHZ, @@ -188,6 +189,7 @@ struct ieee80211_sta_ht_cap { * in this band. Must be sorted to give a valid "supported * rates" IE, i.e. CCK rates first, then OFDM. * @n_bitrates: Number of bitrates in @bitrates + * @ht_cap: HT capabilities in this band */ struct ieee80211_supported_band { struct ieee80211_channel *channels; @@ -225,6 +227,7 @@ struct vif_params { * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used * with the get_key() callback, must be in little endian, * length given by @seq_len. + * @seq_len: length of @seq. */ struct key_params { u8 *key; @@ -237,6 +240,8 @@ struct key_params { /** * enum survey_info_flags - survey information flags * + * @SURVEY_INFO_NOISE_DBM: noise (in dBm) was filled in + * * Used by the driver to indicate which info in &struct survey_info * it has filled in during the get_survey(). */ @@ -247,13 +252,13 @@ enum survey_info_flags { /** * struct survey_info - channel survey response * - * Used by dump_survey() to report back per-channel survey information. - * * @channel: the channel this survey record reports, mandatory * @filled: bitflag of flags from &enum survey_info_flags * @noise: channel noise in dBm. This and all following fields are * optional * + * Used by dump_survey() to report back per-channel survey information. + * * This structure can later be expanded with things like * channel duty cycle etc. */ @@ -288,7 +293,7 @@ struct beacon_parameters { * * @PLINK_ACTION_INVALID: action 0 is reserved * @PLINK_ACTION_OPEN: start mesh peer link establishment - * @PLINK_ACTION_BLOCL: block traffic from this mesh peer + * @PLINK_ACTION_BLOCK: block traffic from this mesh peer */ enum plink_actions { PLINK_ACTION_INVALID, @@ -311,6 +316,8 @@ enum plink_actions { * (bitmask of BIT(NL80211_STA_FLAG_...)) * @listen_interval: listen interval or -1 for no change * @aid: AID or zero for no change + * @plink_action: plink action to take + * @ht_capa: HT capabilities of station */ struct station_parameters { u8 *supported_rates; @@ -448,13 +455,13 @@ enum monitor_flags { * Used by the driver to indicate which info in &struct mpath_info it has filled * in during get_station() or dump_station(). * - * MPATH_INFO_FRAME_QLEN: @frame_qlen filled - * MPATH_INFO_SN: @sn filled - * MPATH_INFO_METRIC: @metric filled - * MPATH_INFO_EXPTIME: @exptime filled - * MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled - * MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled - * MPATH_INFO_FLAGS: @flags filled + * @MPATH_INFO_FRAME_QLEN: @frame_qlen filled + * @MPATH_INFO_SN: @sn filled + * @MPATH_INFO_METRIC: @metric filled + * @MPATH_INFO_EXPTIME: @exptime filled + * @MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled + * @MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled + * @MPATH_INFO_FLAGS: @flags filled */ enum mpath_info_flags { MPATH_INFO_FRAME_QLEN = BIT(0), @@ -587,6 +594,7 @@ struct cfg80211_ssid { * @ie_len: length of ie in octets * @wiphy: the wiphy this was for * @dev: the interface + * @aborted: (internal) scan request was notified as aborted */ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; @@ -623,6 +631,7 @@ enum cfg80211_signal_type { * This structure describes a BSS (which may also be a mesh network) * for use in scan results and similar. * + * @channel: channel this BSS is on * @bssid: BSSID of the BSS * @tsf: timestamp of last received update * @beacon_interval: the beacon interval as from the frame @@ -826,8 +835,8 @@ struct cfg80211_ibss_params { * @ssid: SSID * @ssid_len: Length of ssid in octets * @auth_type: Authentication type (algorithm) - * @assoc_ie: IEs for association request - * @assoc_ie_len: Length of assoc_ie in octets + * @ie: IEs for association request + * @ie_len: Length of assoc_ie in octets * @privacy: indicates whether privacy-enabled APs should be used * @crypto: crypto settings * @key_len: length of WEP key for shared key authentication @@ -850,10 +859,11 @@ struct cfg80211_connect_params { /** * enum wiphy_params_flags - set_wiphy_params bitfield values - * WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed - * WIPHY_PARAM_RETRY_LONG: wiphy->retry_long has changed - * WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed - * WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed + * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed + * @WIPHY_PARAM_RETRY_LONG: wiphy->retry_long has changed + * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed + * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed + * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed */ enum wiphy_params_flags { WIPHY_PARAM_RETRY_SHORT = 1 << 0, @@ -949,10 +959,16 @@ struct cfg80211_pmksa { * @del_beacon: Remove beacon configuration and stop sending the beacon. * * @add_station: Add a new station. - * * @del_station: Remove a station; @mac may be NULL to remove all stations. - * * @change_station: Modify a given station. + * @get_station: get station information for the station identified by @mac + * @dump_station: dump station callback -- resume dump at index @idx + * + * @add_mpath: add a fixed mesh path + * @del_mpath: delete a given mesh path + * @change_mpath: change a given mesh path + * @get_mpath: get a mesh path for the given parameters + * @dump_mpath: dump mesh path callback -- resume dump at index @idx * * @get_mesh_params: Put the current mesh parameters into *params * @@ -960,8 +976,6 @@ struct cfg80211_pmksa { * The mask is a bitfield which tells us which parameters to * set, and which to leave alone. * - * @set_mesh_cfg: set mesh parameters (by now, just mesh id) - * * @change_bss: Modify parameters for a given BSS. * * @set_txq_params: Set TX queue parameters @@ -1002,6 +1016,8 @@ struct cfg80211_pmksa { * @get_tx_power: store the current TX power into the dbm variable; * return 0 if successful * + * @set_wds_peer: set the WDS peer for a WDS interface + * * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting * functions to adjust rfkill hw state * @@ -1019,6 +1035,8 @@ struct cfg80211_pmksa { * * @testmode_cmd: run a test mode command * + * @set_bitrate_mask: set the bitrate mask configuration + * * @set_pmksa: Cache a PMKID for a BSSID. This is mostly useful for fullmac * devices running firmwares capable of generating the (re) association * RSN IE. It allows for faster roaming between WPA2 BSSIDs. @@ -1231,8 +1249,6 @@ struct mac_address { /** * struct wiphy - wireless hardware description - * @idx: the wiphy index assigned to this item - * @class_dev: the class device representing /sys/class/ieee80211/ * @reg_notifier: the driver's regulatory notification callback * @regd: the driver's regulatory domain, if one was requested via * the regulatory_hint() API. This can be used by the driver @@ -1246,7 +1262,7 @@ struct mac_address { * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold); * -1 = fragmentation disabled, only odd values >= 256 used * @rts_threshold: RTS threshold (dot11RTSThreshold); -1 = RTS/CTS disabled - * @net: the network namespace this wiphy currently lives in + * @_net: the network namespace this wiphy currently lives in * @perm_addr: permanent MAC address of this device * @addr_mask: If the device supports multiple MAC addresses by masking, * set this to a mask with variable bits set to 1, e.g. if the last @@ -1259,6 +1275,28 @@ struct mac_address { * by default for perm_addr. In this case, the mask should be set to * all-zeroes. In this case it is assumed that the device can handle * the same number of arbitrary MAC addresses. + * @debugfsdir: debugfs directory used for this wiphy, will be renamed + * automatically on wiphy renames + * @dev: (virtual) struct device for this wiphy + * @wext: wireless extension handlers + * @priv: driver private data (sized according to wiphy_new() parameter) + * @interface_modes: bitmask of interfaces types valid for this wiphy, + * must be set by driver + * @flags: wiphy flags, see &enum wiphy_flags + * @bss_priv_size: each BSS struct has private data allocated with it, + * this variable determines its size + * @max_scan_ssids: maximum number of SSIDs the device can scan for in + * any given scan + * @max_scan_ie_len: maximum length of user-controlled IEs device can + * add to probe request frames transmitted during a scan, must not + * include fixed IEs like supported rates + * @coverage_class: current coverage class + * @fw_version: firmware version for ethtool reporting + * @hw_version: hardware version for ethtool reporting + * @max_num_pmkids: maximum number of PMKIDs supported by device + * @privid: a pointer that drivers can use to identify if an arbitrary + * wiphy is theirs, e.g. in global notifiers + * @bands: information about bands/channels supported by this device */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -1472,13 +1510,14 @@ struct cfg80211_cached_keys; * @ssid: (private) Used by the internal configuration code * @ssid_len: (private) Used by the internal configuration code * @wext: (private) Used by the internal wireless extensions compat code - * @wext_bssid: (private) Used by the internal wireless extensions compat code * @use_4addr: indicates 4addr mode is used on this interface, must be * set by driver (if supported) on add_interface BEFORE registering the * netdev and may otherwise be used by driver read-only, will be update * by cfg80211 on change_interface * @action_registrations: list of registrations for action frames * @action_registrations_lock: lock for the list + * @mtx: mutex used to lock data in this struct + * @cleanup_work: work struct used for cleanup that can't be done directly */ struct wireless_dev { struct wiphy *wiphy; @@ -1552,11 +1591,13 @@ static inline void *wdev_priv(struct wireless_dev *wdev) /** * ieee80211_channel_to_frequency - convert channel number to frequency + * @chan: channel number */ extern int ieee80211_channel_to_frequency(int chan); /** * ieee80211_frequency_to_channel - convert frequency to channel number + * @freq: center frequency */ extern int ieee80211_frequency_to_channel(int freq); @@ -1571,6 +1612,8 @@ extern struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, int freq); /** * ieee80211_get_channel - get channel struct from wiphy for specified frequency + * @wiphy: the struct wiphy to get the channel for + * @freq: the center frequency of the channel */ static inline struct ieee80211_channel * ieee80211_get_channel(struct wiphy *wiphy, int freq) @@ -1631,9 +1674,6 @@ struct ieee80211_radiotap_vendor_namespaces { * @is_radiotap_ns: indicates whether the current namespace is the default * radiotap namespace or not * - * @overrides: override standard radiotap fields - * @n_overrides: number of overrides - * * @_rtheader: pointer to the radiotap header we are walking through * @_max_length: length of radiotap header in cpu byte ordering * @_arg_index: next argument index @@ -1949,10 +1989,12 @@ int cfg80211_wext_giwap(struct net_device *dev, void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted); /** - * cfg80211_inform_bss - inform cfg80211 of a new BSS + * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame * * @wiphy: the wiphy reporting the BSS - * @bss: the found BSS + * @channel: The channel the frame was received on + * @mgmt: the management frame (probe response or beacon) + * @len: length of the management frame * @signal: the signal strength, type depends on the wiphy's signal_type * @gfp: context flags * @@ -1965,6 +2007,23 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_mgmt *mgmt, size_t len, s32 signal, gfp_t gfp); +/** + * cfg80211_inform_bss - inform cfg80211 of a new BSS + * + * @wiphy: the wiphy reporting the BSS + * @channel: The channel the frame was received on + * @bssid: the BSSID of the BSS + * @timestamp: the TSF timestamp sent by the peer + * @capability: the capability field sent by the peer + * @beacon_interval: the beacon interval announced by the peer + * @ie: additional IEs sent by the peer + * @ielen: length of the additional IEs + * @signal: the signal strength, type depends on the wiphy's signal_type + * @gfp: context flags + * + * This informs cfg80211 that BSS information was found and + * the BSS should be updated/added. + */ struct cfg80211_bss* cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e3c1d479400..abb3b1a9ddc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -313,9 +313,10 @@ enum mac80211_tx_control_flags { IEEE80211_TX_INTFL_NL80211_FRAME_TX = BIT(21), IEEE80211_TX_CTL_LDPC = BIT(22), IEEE80211_TX_CTL_STBC = BIT(23) | BIT(24), -#define IEEE80211_TX_CTL_STBC_SHIFT 23 }; +#define IEEE80211_TX_CTL_STBC_SHIFT 23 + /** * enum mac80211_rate_control_flags - per-rate flags set by the * Rate Control algorithm. @@ -813,7 +814,6 @@ enum ieee80211_key_flags { * encrypted in hardware. * @alg: The key algorithm. * @flags: key flags, see &enum ieee80211_key_flags. - * @ap_addr: AP's MAC address * @keyidx: the key index (0-3) * @keylen: key material length * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte) -- cgit v1.2.3-70-g09d2 From 5bfddbd46a95c978f4d3c992339cbdf4f4b790a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Jun 2010 16:09:52 +0200 Subject: netfilter: nf_conntrack: IPS_UNTRACKED bit NOTRACK makes all cpus share a cache line on nf_conntrack_untracked twice per packet. This is bad for performance. __read_mostly annotation is also a bad choice. This patch introduces IPS_UNTRACKED bit so that we can use later a per_cpu untrack structure more easily. A new helper, nf_ct_untracked_get() returns a pointer to nf_conntrack_untracked. Another one, nf_ct_untracked_status_or() is used by nf_nat_init() to add IPS_NAT_DONE_MASK bits to untracked status. nf_ct_is_untracked() prototype is changed to work on a nf_conn pointer. Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 4 ++++ include/net/netfilter/nf_conntrack.h | 12 +++++++++--- include/net/netfilter/nf_conntrack_core.h | 2 +- net/ipv4/netfilter/nf_nat_core.c | 2 +- net/ipv4/netfilter/nf_nat_standalone.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 11 ++++++++--- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/xt_CT.c | 4 ++-- net/netfilter/xt_NOTRACK.c | 2 +- net/netfilter/xt_TEE.c | 4 ++-- net/netfilter/xt_cluster.c | 2 +- net/netfilter/xt_conntrack.c | 11 ++++++----- net/netfilter/xt_socket.c | 2 +- net/netfilter/xt_state.c | 14 ++++++++------ 15 files changed, 47 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 14e6d32002c..1afd18c855e 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -76,6 +76,10 @@ enum ip_conntrack_status { /* Conntrack is a template */ IPS_TEMPLATE_BIT = 11, IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT), + + /* Conntrack is a fake untracked entry */ + IPS_UNTRACKED_BIT = 12, + IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), }; /* Connection tracking event types */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index bde095f7e84..3bc38c70bbb 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -261,7 +261,13 @@ extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Fake conntrack entry for untracked connections */ -extern struct nf_conn nf_conntrack_untracked; +static inline struct nf_conn *nf_ct_untracked_get(void) +{ + extern struct nf_conn nf_conntrack_untracked; + + return &nf_conntrack_untracked; +} +extern void nf_ct_untracked_status_or(unsigned long bits); /* Iterate over all conntracks: if iter returns true, it's deleted. */ extern void @@ -289,9 +295,9 @@ static inline int nf_ct_is_dying(struct nf_conn *ct) return test_bit(IPS_DYING_BIT, &ct->status); } -static inline int nf_ct_is_untracked(const struct sk_buff *skb) +static inline int nf_ct_is_untracked(const struct nf_conn *ct) { - return (skb->nfct == &nf_conntrack_untracked.ct_general); + return test_bit(IPS_UNTRACKED_BIT, &ct->status); } extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3d7524fba19..aced085132e 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) struct nf_conn *ct = (struct nf_conn *)skb->nfct; int ret = NF_ACCEPT; - if (ct && ct != &nf_conntrack_untracked) { + if (ct && !nf_ct_is_untracked(ct)) { if (!nf_ct_is_confirmed(ct)) ret = __nf_conntrack_confirm(skb); if (likely(ret == NF_ACCEPT)) diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 4f8bddb760c..c7719b283ad 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -742,7 +742,7 @@ static int __init nf_nat_init(void) spin_unlock_bh(&nf_nat_lock); /* Initialize fake conntrack so that NAT will skip it */ - nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index beb25819c9c..6723c682250 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum, return NF_ACCEPT; /* Don't try to NAT if this packet is not conntracked */ - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return NF_ACCEPT; nat = nfct_nat(ct); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 9be81776415..1df3c8b6bf4 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -208,7 +208,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index eeeb8bc7398..6c1da212380 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -struct nf_conn nf_conntrack_untracked __read_mostly; +struct nf_conn nf_conntrack_untracked; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); static int nf_conntrack_hash_rnd_initted; @@ -1321,6 +1321,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); +void nf_ct_untracked_status_or(unsigned long bits) +{ + nf_conntrack_untracked.status |= bits; +} +EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); + static int nf_conntrack_init_init_net(void) { int max_factor = 8; @@ -1368,8 +1374,7 @@ static int nf_conntrack_init_init_net(void) #endif atomic_set(&nf_conntrack_untracked.ct_general.use, 1); /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); - + nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); return 0; #ifdef CONFIG_NF_CONNTRACK_ZONES diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c42ff6aa441..5bae1cd15ee 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -480,7 +480,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) int err; /* ignore our fake conntrack entry */ - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return 0; if (events & (1 << IPCT_DESTROY)) { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 562bf3266e0..0cb6053f02f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -67,7 +67,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->flags & XT_CT_NOTRACK) { - ct = &nf_conntrack_untracked; + ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); goto out; } @@ -132,7 +132,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) struct nf_conn *ct = info->ct; struct nf_conn_help *help; - if (ct != &nf_conntrack_untracked) { + if (!nf_ct_is_untracked(ct)) { help = nfct_help(ct); if (help) module_put(help->helper->me); diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index 512b9123252..9d782181b6c 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -23,7 +23,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 859d9fd429c..7a118267c4c 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -104,7 +104,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) #ifdef WITH_CONNTRACK /* Avoid counting cloned packets towards the original connection. */ nf_conntrack_put(skb->nfct); - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); #endif @@ -177,7 +177,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) #ifdef WITH_CONNTRACK nf_conntrack_put(skb->nfct); - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); #endif diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index 30b95a1c1c8..f4af1bfafb1 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -120,7 +120,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return false; if (ct->master) diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 39681f10291..e536710ad91 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -123,11 +123,12 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, ct = nf_ct_get(skb, &ctinfo); - if (ct == &nf_conntrack_untracked) - statebit = XT_CONNTRACK_STATE_UNTRACKED; - else if (ct != NULL) - statebit = XT_CONNTRACK_STATE_BIT(ctinfo); - else + if (ct) { + if (nf_ct_is_untracked(ct)) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + } else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 3d54c236a1b..1ca89908cba 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -127,7 +127,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, * reply packet of an established SNAT-ted connection. */ ct = nf_ct_get(skb, &ctinfo); - if (ct && (ct != &nf_conntrack_untracked) && + if (ct && !nf_ct_is_untracked(ct) && ((iph->protocol != IPPROTO_ICMP && ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) || (iph->protocol == IPPROTO_ICMP && diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index e12e053d378..a507922d80c 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -26,14 +26,16 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_state_info *sinfo = par->matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_untracked(skb)) - statebit = XT_STATE_UNTRACKED; - else if (!nf_ct_get(skb, &ctinfo)) + if (!ct) statebit = XT_STATE_INVALID; - else - statebit = XT_STATE_BIT(ctinfo); - + else { + if (nf_ct_is_untracked(ct)) + statebit = XT_STATE_UNTRACKED; + else + statebit = XT_STATE_BIT(ctinfo); + } return (sinfo->statemask & statebit); } -- cgit v1.2.3-70-g09d2 From 50a323b73069b169385a8ac65633dee837a7d13f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:36 +0200 Subject: sched: define and use CPU_PRI_* enums for cpu notifier priorities Instead of hardcoding priority 10 and 20 in sched and perf, collect them into CPU_PRI_* enums. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/cpu.h | 9 +++++++++ include/linux/perf_event.h | 2 +- kernel/sched.c | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e287863ac05..2d9073883ea 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -48,6 +48,15 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; +/* + * CPU notifier priorities. + */ +enum { + /* migration should happen before other stuff but after perf */ + CPU_PRI_PERF = 20, + CPU_PRI_MIGRATION = 10, +}; + #ifdef CONFIG_SMP /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5d0266d9498..469e03e96fe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1068,7 +1068,7 @@ static inline void perf_event_disable(struct perf_event *event) { } #define perf_cpu_notifier(fn) \ do { \ static struct notifier_block fn##_nb __cpuinitdata = \ - { .notifier_call = fn, .priority = 20 }; \ + { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ (void *)(unsigned long)smp_processor_id()); \ fn(&fn##_nb, (unsigned long)CPU_STARTING, \ diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996228d..552faf8d358 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5801,7 +5801,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; static int __init migration_init(void) -- cgit v1.2.3-70-g09d2 From 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:36 +0200 Subject: sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining Currently, when a cpu goes down, cpu_active is cleared before CPU_DOWN_PREPARE starts and cpuset configuration is updated from a default priority cpu notifier. When a cpu is coming up, it's set before CPU_ONLINE but cpuset configuration again is updated from the same cpu notifier. For cpu notifiers, this presents an inconsistent state. Threads which a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be migrated to other cpus because the cpu is no more inactive. Fix it by updating cpu_active in the highest priority cpu notifier and cpuset configuration in the second highest when a cpu is coming up. Down path is updated similarly. This guarantees that all other cpu notifiers see consistent cpu_active and cpuset configuration. cpuset_track_online_cpus() notifier is converted to cpuset_update_active_cpus() which just updates the configuration and now called from cpuset_cpu_[in]active() notifiers registered from sched_init_smp(). If cpuset is disabled, cpuset_update_active_cpus() degenerates into partition_sched_domains() making separate notifier for !CONFIG_CPUSETS unnecessary. This problem is triggered by cmwq. During CPU_DOWN_PREPARE, hotplug callback creates a kthread and kthread_bind()s it to the target cpu, and the thread is expected to run on that cpu. * Ingo's test discovered __cpuinit/exit markups were incorrect. Fixed. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Ingo Molnar Cc: Paul Menage --- include/linux/cpu.h | 16 ++++++++++++ include/linux/cpuset.h | 6 +++++ kernel/cpu.c | 6 ----- kernel/cpuset.c | 21 ++-------------- kernel/sched.c | 67 +++++++++++++++++++++++++++++++++++++------------- 5 files changed, 74 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2d9073883ea..de6b1722cdc 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -52,6 +52,22 @@ struct notifier_block; * CPU notifier priorities. */ enum { + /* + * SCHED_ACTIVE marks a cpu which is coming up active during + * CPU_ONLINE and CPU_DOWN_FAILED and must be the first + * notifier. CPUSET_ACTIVE adjusts cpuset according to + * cpu_active mask right after SCHED_ACTIVE. During + * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are + * ordered in the similar way. + * + * This ordering guarantees consistent cpu_active mask and + * migration behavior to all cpu notifiers. + */ + CPU_PRI_SCHED_ACTIVE = INT_MAX, + CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, + CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, + CPU_PRI_CPUSET_INACTIVE = INT_MIN, + /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 457ed765a11..f20eb8f1602 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_update_active_cpus(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4a..f6e726f1849 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadd..05727dcaa80 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void __cpuexit cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/sched.c b/kernel/sched.c index 552faf8d358..2b942e49d0f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5804,17 +5804,46 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -7273,29 +7302,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7341,10 +7376,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); -- cgit v1.2.3-70-g09d2 From 21aa9af03d06cb1d19a3738e5cf12acff984e69b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:37 +0200 Subject: sched: add hooks for workqueue Concurrency managed workqueue needs to know when workers are going to sleep and waking up. Using these two hooks, cmwq keeps track of the current concurrency level and throttles execution of new works if it's too high and wakes up another worker from the sleep hook if it becomes too low. This patch introduces PF_WQ_WORKER to identify workqueue workers and adds the following two hooks. * wq_worker_waking_up(): called when a worker is woken up. * wq_worker_sleeping(): called when a worker is going to sleep and may return a pointer to a local task which should be woken up. The returned task is woken up using try_to_wake_up_local() which is simplified ttwu which is called under rq lock and can only wake up local tasks. Both hooks are currently defined as noop in kernel/workqueue_sched.h. Later cmwq implementation will replace them with proper implementation. These hooks are hard coded as they'll always be enabled. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Ingo Molnar --- include/linux/sched.h | 1 + kernel/fork.c | 2 +- kernel/sched.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/workqueue_sched.h | 16 +++++++++++++++ 4 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 kernel/workqueue_sched.h (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f118809c953..edc3dd168d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1696,6 +1696,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba04..a82a65cef74 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; diff --git a/kernel/sched.c b/kernel/sched.c index 96eafd5f345..edd5a54b95d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include @@ -2306,6 +2307,9 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); } /** @@ -2413,6 +2417,37 @@ out: return success; } +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. @@ -3618,10 +3653,24 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 00000000000..af040babb74 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,16 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. + */ +static inline void wq_worker_waking_up(struct task_struct *task, + unsigned int cpu) +{ +} + +static inline struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + return NULL; +} -- cgit v1.2.3-70-g09d2 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. It means frame pointers are not necessary anymore to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implentation. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/powerpc/include/asm/perf_event.h | 12 ++++++++++++ arch/powerpc/kernel/misc.S | 26 -------------------------- arch/sparc/include/asm/perf_event.h | 8 ++++++++ arch/sparc/kernel/helpers.S | 6 +++--- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ arch/x86/include/asm/stacktrace.h | 7 ++----- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- include/linux/perf_event.h | 32 +++++++------------------------- include/trace/ftrace.h | 2 +- kernel/perf_event.c | 5 ----- kernel/trace/trace_event_perf.c | 2 -- 11 files changed, 46 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index e6d4ce69b12..5c16b891d50 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -21,3 +21,15 @@ #ifdef CONFIG_FSL_EMB_PERF_EVENT #include #endif + +#ifdef CONFIG_PERF_EVENTS +#include +#include + +#define perf_arch_fetch_caller_regs(regs, __ip) \ + do { \ + (regs)->nip = __ip; \ + (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ + } while (0) +#endif diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 22e507c8a55..2d29752cbe1 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -127,29 +127,3 @@ _GLOBAL(__setup_cpu_power7) _GLOBAL(__restore_cpu_power7) /* place holder */ blr - -/* - * Get a minimal set of registers for our caller's nth caller. - * r3 = regs pointer, r5 = n. - * - * We only get R1 (stack pointer), NIP (next instruction pointer) - * and LR (link register). These are all we can get in the - * general case without doing complicated stack unwinding, but - * fortunately they are enough to do a stack backtrace, which - * is all we need them for. - */ -_GLOBAL(perf_arch_fetch_caller_regs) - mr r6,r1 - cmpwi r5,0 - mflr r4 - ble 2f - mtctr r5 -1: PPC_LL r6,0(r6) - bdnz 1b - PPC_LL r4,PPC_LR_STKOFF(r6) -2: PPC_LL r7,0(r6) - PPC_LL r7,PPC_LR_STKOFF(r7) - PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3) - PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3) - PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3) - blr diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h index 7e2669894ce..74c4e0cd889 100644 --- a/arch/sparc/include/asm/perf_event.h +++ b/arch/sparc/include/asm/perf_event.h @@ -6,7 +6,15 @@ extern void set_perf_event_pending(void); #define PERF_EVENT_INDEX_OFFSET 0 #ifdef CONFIG_PERF_EVENTS +#include + extern void init_hw_perf_events(void); + +extern void +__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); + +#define perf_arch_fetch_caller_regs(pt_regs, ip) \ + __perf_arch_fetch_caller_regs(pt_regs, ip, 1); #else static inline void init_hw_perf_events(void) { } #endif diff --git a/arch/sparc/kernel/helpers.S b/arch/sparc/kernel/helpers.S index 92090cc9e82..682fee06a16 100644 --- a/arch/sparc/kernel/helpers.S +++ b/arch/sparc/kernel/helpers.S @@ -47,9 +47,9 @@ stack_trace_flush: .size stack_trace_flush,.-stack_trace_flush #ifdef CONFIG_PERF_EVENTS - .globl perf_arch_fetch_caller_regs - .type perf_arch_fetch_caller_regs,#function -perf_arch_fetch_caller_regs: + .globl __perf_arch_fetch_caller_regs + .type __perf_arch_fetch_caller_regs,#function +__perf_arch_fetch_caller_regs: /* We always read the %pstate into %o5 since we will use * that to construct a fake %tstate to store into the regs. */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e..02de29830ff 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ +} + #else static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a957463d3c7..2b16a2ad23d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,17 +78,14 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long rewind_frame_pointer(int n) +static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } + frame = frame->next_frame; #endif return (unsigned long)frame; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb61e8f..2c075fe573d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb6c91eac7e..bea785cef49 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -905,8 +905,10 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); -extern void -perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); +#ifndef perf_arch_fetch_caller_regs +static inline void +perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { } +#endif /* * Take a snapshot of the regs. Skip ip and frame pointer to @@ -916,31 +918,11 @@ perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); * - bp for callchains * - eflags, for future purposes, just in case */ -static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip) +static inline void perf_fetch_caller_regs(struct pt_regs *regs) { - unsigned long ip; - memset(regs, 0, sizeof(*regs)); - switch (skip) { - case 1 : - ip = CALLER_ADDR0; - break; - case 2 : - ip = CALLER_ADDR1; - break; - case 3 : - ip = CALLER_ADDR2; - break; - case 4: - ip = CALLER_ADDR3; - break; - /* No need to support further for now */ - default: - ip = 0; - } - - return perf_arch_fetch_caller_regs(regs, ip, skip); + perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static inline void @@ -950,7 +932,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) struct pt_regs hot_regs; if (!regs) { - perf_fetch_caller_regs(&hot_regs, 1); + perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; } __perf_sw_event(event_id, nr, nmi, regs, addr); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3d685d1f2a0..8ee8b6e6b25 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -705,7 +705,7 @@ perf_trace_##call(void *__data, proto) \ int __data_size; \ int rctx; \ \ - perf_fetch_caller_regs(&__regs, 1); \ + perf_fetch_caller_regs(&__regs); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e099650cd24..9ae4dbcdf46 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2851,11 +2851,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -__weak -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ -} - /* * We assume there is only KVM supporting the callbacks. diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cb6f365016e..21db1d3a48d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,8 +9,6 @@ #include #include "trace.h" -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); - static char *perf_trace_buf[4]; /* -- cgit v1.2.3-70-g09d2 From 30dbb20e68e6f7df974b77d2350ebad5eb6f6c9e Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Wed, 26 May 2010 18:57:53 +0800 Subject: tracing: Remove boot tracer The boot tracer is useless. It simply logs the initcalls but in fact these initcalls are also logged through printk while using the initcall_debug kernel parameter. Nobody seem to be using it so far. Then just remove it. Signed-off-by: WANG Cong Cc: Chase Douglas Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Li Zefan LKML-Reference: <20100526105753.GA5677@cr0.nay.redhat.com> [ remove the hooks in main.c, and the headers ] Signed-off-by: Frederic Weisbecker --- include/trace/boot.h | 60 -------------- init/main.c | 27 +++---- kernel/trace/Kconfig | 17 ---- kernel/trace/Makefile | 1 - kernel/trace/trace.c | 3 - kernel/trace/trace.h | 8 -- kernel/trace/trace_boot.c | 185 ------------------------------------------- kernel/trace/trace_entries.h | 27 ------- 8 files changed, 10 insertions(+), 318 deletions(-) delete mode 100644 include/trace/boot.h delete mode 100644 kernel/trace/trace_boot.c (limited to 'include') diff --git a/include/trace/boot.h b/include/trace/boot.h deleted file mode 100644 index 088ea089e31..00000000000 --- a/include/trace/boot.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef _LINUX_TRACE_BOOT_H -#define _LINUX_TRACE_BOOT_H - -#include -#include -#include - -/* - * Structure which defines the trace of an initcall - * while it is called. - * You don't have to fill the func field since it is - * only used internally by the tracer. - */ -struct boot_trace_call { - pid_t caller; - char func[KSYM_SYMBOL_LEN]; -}; - -/* - * Structure which defines the trace of an initcall - * while it returns. - */ -struct boot_trace_ret { - char func[KSYM_SYMBOL_LEN]; - int result; - unsigned long long duration; /* nsecs */ -}; - -#ifdef CONFIG_BOOT_TRACER -/* Append the traces on the ring-buffer */ -extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn); -extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn); - -/* Tells the tracer that smp_pre_initcall is finished. - * So we can start the tracing - */ -extern void start_boot_trace(void); - -/* Resume the tracing of other necessary events - * such as sched switches - */ -extern void enable_boot_trace(void); - -/* Suspend this tracing. Actually, only sched_switches tracing have - * to be suspended. Initcalls doesn't need it.) - */ -extern void disable_boot_trace(void); -#else -static inline -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { } - -static inline -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { } - -static inline void start_boot_trace(void) { } -static inline void enable_boot_trace(void) { } -static inline void disable_boot_trace(void) { } -#endif /* CONFIG_BOOT_TRACER */ - -#endif /* __LINUX_TRACE_BOOT_H */ diff --git a/init/main.c b/init/main.c index 3bdb152f412..94f65efdc65 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include @@ -715,38 +714,33 @@ int initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); static char msgbuf[64]; -static struct boot_trace_call call; -static struct boot_trace_ret ret; int do_one_initcall(initcall_t fn) { int count = preempt_count(); ktime_t calltime, delta, rettime; + unsigned long long duration; + int ret; if (initcall_debug) { - call.caller = task_pid_nr(current); - printk("calling %pF @ %i\n", fn, call.caller); + printk("calling %pF @ %i\n", fn, task_pid_nr(current)); calltime = ktime_get(); - trace_boot_call(&call, fn); - enable_boot_trace(); } - ret.result = fn(); + ret = fn(); if (initcall_debug) { - disable_boot_trace(); rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; - trace_boot_ret(&ret, fn); - printk("initcall %pF returned %d after %Ld usecs\n", fn, - ret.result, ret.duration); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk("initcall %pF returned %d after %lld usecs\n", fn, + ret, duration); } msgbuf[0] = 0; - if (ret.result && ret.result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", ret.result); + if (ret && ret != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -760,7 +754,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return ret.result; + return ret; } @@ -880,7 +874,6 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); - start_boot_trace(); smp_init(); sched_init_smp(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545..572992abc71 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -229,23 +229,6 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in initcall_debug and ftrace=initcall to the kernel - command line to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b0550..c3aaeba8237 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55e48511d7c..036fbc22858 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4603,9 +4603,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); current_trace = &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463..75a5e800a73 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,10 +9,8 @@ #include #include #include -#include #include #include - #include #include @@ -29,8 +27,6 @@ enum trace_type { TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, @@ -48,8 +44,6 @@ enum kmemtrace_type_id { KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -extern struct tracer boot_tracer; - #undef __field #define __field(type, item) type item; @@ -209,8 +203,6 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956a..00000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker - * - */ - -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. - */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - boot_trace = tr; - - if (!tr) - return 0; - - tracing_reset_online_cpus(tr); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_call; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. - */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_ret; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240d..c293364c984 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -271,33 +271,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __entry->map_id, __entry->opcode) ); -FTRACE_ENTRY(boot_call, trace_boot_call, - - TRACE_BOOT_CALL, - - F_STRUCT( - __field_struct( struct boot_trace_call, boot_call ) - __field_desc( pid_t, boot_call, caller ) - __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) - ), - - F_printk("%d %s", __entry->caller, __entry->func) -); - -FTRACE_ENTRY(boot_ret, trace_boot_ret, - - TRACE_BOOT_RET, - - F_STRUCT( - __field_struct( struct boot_trace_ret, boot_ret ) - __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) - __field_desc( int, boot_ret, result ) - __field_desc( unsigned long, boot_ret, duration ) - ), - - F_printk("%s %d %lx", - __entry->func, __entry->result, __entry->duration) -); #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 -- cgit v1.2.3-70-g09d2 From c676329abb2b8359d9a5d734dec0c81779823fd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 10:48:51 +0200 Subject: sched_clock: Add local_clock() API and improve documentation For people who otherwise get to write: cpu_clock(smp_processor_id()), there is now: local_clock(). Also, as per suggestion from Andrew, provide some documentation on the various clock interfaces, and minimize the unsigned long long vs u64 mess. Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Jens Axboe LKML-Reference: <1275052414.1645.52.camel@laptop> Signed-off-by: Ingo Molnar --- arch/parisc/kernel/ftrace.c | 4 +- include/linux/sched.h | 37 ++++++++++-------- kernel/lockdep.c | 2 +- kernel/perf_event.c | 2 +- kernel/rcutorture.c | 3 +- kernel/sched.c | 2 +- kernel/sched_clock.c | 95 ++++++++++++++++++++++++++++++++++++++++----- kernel/trace/trace_clock.c | 2 +- 8 files changed, 113 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 9877372ffdb..5beb97bafbb 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -82,7 +82,7 @@ unsigned long ftrace_return_to_handler(unsigned long retval0, unsigned long ret; pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = local_clock(); ftrace_graph_return(&trace); if (unlikely(!ret)) { @@ -126,7 +126,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = local_clock(); if (push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/sched.h b/include/linux/sched.h index edc3dd168d8..c2d4316a04b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1791,20 +1791,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) #endif /* - * Architectures can set this to 1 if they have specified - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, - * but then during bootup it turns out that sched_clock() - * is reliable after all: + * Do not use outside of architecture code which knows its limitations. + * + * sched_clock() has no promise of monotonicity or bounded drift between + * CPUs, use (which you should not) requires disabling IRQs. + * + * Please use one of the three interfaces below. */ -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -extern int sched_clock_stable; -#endif - -/* ftrace calls sched_clock() directly */ extern unsigned long long notrace sched_clock(void); +/* + * See the comment in kernel/sched_clock.c + */ +extern u64 cpu_clock(int cpu); +extern u64 local_clock(void); +extern u64 sched_clock_cpu(int cpu); + extern void sched_clock_init(void); -extern u64 sched_clock_cpu(int cpu); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline void sched_clock_tick(void) @@ -1819,17 +1822,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } #else +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +extern int sched_clock_stable; + extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #endif -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -extern unsigned long long cpu_clock(int cpu); - extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37..f2852a51023 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 31d6afe9259..109c5ec8893 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a..2e2726d790b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -239,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff --git a/kernel/sched.c b/kernel/sched.c index 8f351c56567..3abd8f780da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1647,7 +1647,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb..52f1a149bfb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar * Guillaume Chazarain * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include #include @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1..1723e2b8c58 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -56,7 +56,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } -- cgit v1.2.3-70-g09d2 From 83cd4fe27ad8446619b2e030b171b858501de87d Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 21 May 2010 17:09:41 -0700 Subject: sched: Change nohz idle load balancing logic to push model In the new push model, all idle CPUs indeed go into nohz mode. There is still the concept of idle load balancer (performing the load balancing on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz balancer when any of the nohz CPUs need idle load balancing. The kickee CPU does the idle load balancing on behalf of all idle CPUs instead of the normal idle balance. This addresses the below two problems with the current nohz ilb logic: * the idle load balancer continued to have periodic ticks during idle and wokeup frequently, even though it did not have any rebalancing to do on behalf of any of the idle CPUs. * On x86 and CPUs that have APIC timer stoppage on idle CPUs, this periodic wakeup can result in a periodic additional interrupt on a CPU doing the timer broadcast. Also currently we are migrating the unpinned timers from an idle to the cpu doing idle load balancing (when all the cpus in the system are idle, there is no idle load balancing cpu and timers get added to the same idle cpu where the request was made. So the existing optimization works only on semi idle system). And In semi idle system, we no longer have periodic ticks on the idle load balancer CPU. Using that cpu will add more delays to the timers than intended (as that cpu's timer base may not be uptodate wrt jiffies etc). This was causing mysterious slowdowns during boot etc. For now, in the semi idle case, use the nearest busy cpu for migrating timers from an idle cpu. This is good for power-savings anyway. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +- kernel/hrtimer.c | 8 +- kernel/sched.c | 34 ++++- kernel/sched_fair.c | 329 ++++++++++++++++++++++++++++------------------- kernel/time/tick-sched.c | 8 +- kernel/timer.c | 8 +- 6 files changed, 237 insertions(+), 159 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c2d4316a04b..a3e5b1cd043 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) -extern int select_nohz_load_balancer(int cpu); -extern int get_nohz_load_balancer(void); +extern void select_nohz_load_balancer(int stop_tick); +extern int get_nohz_timer_target(void); extern int nohz_ratelimit(int cpu); #else -static inline int select_nohz_load_balancer(int cpu) -{ - return 0; -} +static inline void select_nohz_load_balancer(int stop_tick) { } static inline int nohz_ratelimit(int cpu) { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0..e934339fbbe 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } diff --git a/kernel/sched.c b/kernel/sched.c index a757f6b11cb..132950b33dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -460,7 +460,7 @@ struct rq { unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -1194,6 +1194,27 @@ static void resched_cpu(int cpu) } #ifdef CONFIG_NO_HZ +/* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} /* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event @@ -7791,6 +7812,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7835,8 +7860,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 22b8b4f2b61..6ee2e0af665 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3091,13 +3091,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. + * load balancing on behalf of all those cpus. * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3383,11 +3428,101 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (!rq->nr_running) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - rebalance_domains(balance_cpu, CPU_IDLE); - - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c03..5f171f04ab0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e1..48d6aec0789 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); -- cgit v1.2.3-70-g09d2 From 9d5efe05eb0c904545a28b19c18b949f23334de0 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 8 Jun 2010 14:57:02 +1000 Subject: sched: Fix capacity calculations for SMT4 Handle cpu capacity being reported as 0 on cores with more number of hardware threads. For example on a Power7 core with 4 hardware threads, core power is 1177 and thus power of each hardware thread is 1177/4 = 294. This low power can lead to capacity for each hardware thread being calculated as 0, which leads to tasks bouncing within the core madly! Fix this by reporting capacity for hardware threads as 1, provided their power is not scaled down significantly because of frequency scaling or real-time tasks usage of cpu. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Michael Neuling Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven LKML-Reference: <20100608045702.21D03CC895@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched_fair.c | 53 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a3e5b1cd043..c731296e5e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -857,7 +857,7 @@ struct sched_group { * CPU power of this group, SCHED_LOAD_SCALE being max power for a * single CPU. */ - unsigned int cpu_power; + unsigned int cpu_power, cpu_power_orig; /* * The CPUs this group covers. diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6ee2e0af665..b9b3462483b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2285,13 +2285,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2301,6 +2294,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2333,6 +2335,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 < group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2426,6 +2453,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); } /** @@ -2724,8 +2753,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2736,6 +2766,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; + if (!capacity) + capacity = fix_small_capacity(sd, group); + if (!cpumask_test_cpu(i, cpus)) continue; @@ -2852,7 +2885,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, cpus); + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; -- cgit v1.2.3-70-g09d2 From 532cb4c401e225b084c14d6bd6a2f8ee561de2f1 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 8 Jun 2010 14:57:02 +1000 Subject: sched: Add asymmetric group packing option for sibling domain Check to see if the group is packed in a sched doman. This is primarily intended to used at the sibling level. Some cores like POWER7 prefer to use lower numbered SMT threads. In the case of POWER7, it can move to lower SMT modes only when higher threads are idle. When in lower SMT modes, the threads will perform better since they share less core resources. Hence when we have idle threads, we want them to be the higher ones. This adds a hook into f_b_g() called check_asym_packing() to check the packing. This packing function is run on idle threads. It checks to see if the busiest CPU in this domain (core in the P7 case) has a higher CPU number than what where the packing function is being run on. If it is, calculate the imbalance and return the higher busier thread as the busiest group to f_b_g(). Here we are assuming a lower CPU number will be equivalent to a lower SMT thread number. It also creates a new SD_ASYM_PACKING flag to enable this feature at any scheduler domain level. It also creates an arch hook to enable this feature at the sibling level. The default function doesn't enable this feature. Based heavily on patch from Peter Zijlstra. Fixes from Srivatsa Vaddagiri. Signed-off-by: Michael Neuling Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Cc: "H. Peter Anvin" Cc: Thomas Gleixner LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +- include/linux/topology.h | 1 + kernel/sched_fair.c | 139 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 126 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c731296e5e9..ff154e10752 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -801,7 +801,7 @@ enum cpu_idle_type { #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ - +#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ enum powersavings_balance_level { @@ -836,6 +836,8 @@ static inline int sd_balance_for_package_power(void) return SD_PREFER_SIBLING; } +extern int __weak arch_sd_sibiling_asym_packing(void); + /* * Optimise SD flags for power savings: * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings. diff --git a/include/linux/topology.h b/include/linux/topology.h index c44df50a05a..cf57f30d0dc 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -103,6 +103,7 @@ int arch_update_cpu_topology(void); | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ + | arch_sd_sibiling_asym_packing() \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b9b3462483b..593424f91a8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2457,12 +2457,54 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = fix_small_capacity(sd, group); } +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sds: sched_group statistics + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; +} + /** * update_sd_lb_stats - Update sched_group's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -2473,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; + struct sched_group *sg = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; @@ -2486,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, do { int local_group; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; + sds->total_pwr += sg->cpu_power; /* * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try + * first, lower the sg capacity to one so that we'll try * and move all the excess tasks away. */ if (prefer_sibling) @@ -2508,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, if (local_group) { sds->this_load = sgs.avg_load; - sds->this = group; + sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; - sds->busiest = group; + sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibiling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; } /** @@ -2719,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!(*balance)) goto ret; + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; @@ -2808,9 +2902,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + /* * The only task running in a non-idle cpu can be moved to this * cpu in an attempt to completely freeup the other CPU @@ -2929,7 +3033,8 @@ redo: schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle)) { + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, -- cgit v1.2.3-70-g09d2 From ecc55f84b2e9741f29daa787ded93986df6cbe17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 15:11:34 +0200 Subject: perf, trace: Inline perf_swevent_put_recursion_context() Inline perf_swevent_put_recursion_context into perf_tp_event(), this shrinks the per trace template code footprint and saves a function call. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 3 +-- include/linux/perf_event.h | 2 +- kernel/perf_event.c | 8 ++++---- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 3167f2df412..0af31cd335d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -257,8 +257,7 @@ static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, u64 count, struct pt_regs *regs, void *head) { - perf_tp_event(addr, count, raw_data, size, regs, head); - perf_swevent_put_recursion_context(rctx); + perf_tp_event(addr, count, raw_data, size, regs, head, rctx); } #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5d0266d9498..c691a0b27bc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1001,7 +1001,7 @@ static inline bool perf_paranoid_kernel(void) extern void perf_event_init(void); extern void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, struct pt_regs *regs, - struct hlist_head *head); + struct hlist_head *head, int rctx); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4..4bd3b597bcc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4213,14 +4213,12 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void perf_swevent_put_recursion_context(int rctx) +void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; } -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); - void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4601,7 +4599,7 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head) + struct pt_regs *regs, struct hlist_head *head, int rctx) { struct perf_sample_data data; struct perf_event *event; @@ -4621,6 +4619,8 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_add(event, count, 1, &data, regs); } rcu_read_unlock(); + + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); -- cgit v1.2.3-70-g09d2 From 3af9e859281bda7eb7c20b51879cf43aa788ac2e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Tue, 18 May 2010 15:30:49 +0100 Subject: perf: Add non-exec mmap() tracking Add the capacility to track data mmap()s. This can be used together with PERF_SAMPLE_ADDR for data profiling. Signed-off-by: Anton Blanchard [Updated code for stable perf ABI] Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274193049-25997-1-git-send-email-ebmunson@us.ibm.com> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_event.h | 12 +++--------- kernel/perf_event.c | 34 +++++++++++++++++++++++----------- mm/mmap.c | 6 +++++- tools/perf/builtin-record.c | 4 +++- 5 files changed, 35 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index e19de6a8033..97d91a03fb1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -653,6 +653,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else stack_base = vma->vm_start - stack_expand; #endif + current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c691a0b27bc..36efad90cd4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -214,8 +214,9 @@ struct perf_event_attr { * See also PERF_RECORD_MISC_EXACT_IP */ precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ - __reserved_1 : 47; + __reserved_1 : 46; union { __u32 wakeup_events; /* wakeup every n events */ @@ -962,14 +963,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) } } -extern void __perf_event_mmap(struct vm_area_struct *vma); - -static inline void perf_event_mmap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_EXEC) - __perf_event_mmap(vma); -} - +extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b39bec346e8..227ed9c8ec3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1891,7 +1891,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); @@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event) if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; @@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event)) + if (perf_event_mmap_match(event, mmap_event, executable)) perf_event_mmap_output(event, mmap_event); } } @@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } name = strncpy(tmp, "//anon", sizeof(tmp)); @@ -3846,17 +3858,17 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_event_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); } -void __perf_event_mmap(struct vm_area_struct *vma) +void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -4911,7 +4923,7 @@ done: if (!event->parent) { atomic_inc(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); diff --git a/mm/mmap.c b/mm/mmap.c index 456ec6f2788..e38e910cb75 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1734,8 +1734,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) grow = (address - vma->vm_end) >> PAGE_SHIFT; error = acct_stack_growth(vma, size, grow); - if (!error) + if (!error) { vma->vm_end = address; + perf_event_mmap(vma); + } } anon_vma_unlock(vma); return error; @@ -1781,6 +1783,7 @@ static int expand_downwards(struct vm_area_struct *vma, if (!error) { vma->vm_start = address; vma->vm_pgoff -= grow; + perf_event_mmap(vma); } } anon_vma_unlock(vma); @@ -2208,6 +2211,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 5e5c6403a31..39c7247bc54 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -268,8 +268,10 @@ static void create_counter(int counter, int cpu) if (inherit_stat) attr->inherit_stat = 1; - if (sample_address) + if (sample_address) { attr->sample_type |= PERF_SAMPLE_ADDR; + attr->mmap_data = track; + } if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; -- cgit v1.2.3-70-g09d2 From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 17:49:05 +0200 Subject: perf: Cleanup {start,commit,cancel}_txn details Clarify some of the transactional group scheduling API details and change it so that a successfull ->commit_txn also closes the transaction. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_event.c | 7 ++++--- arch/sparc/kernel/perf_event.c | 7 ++++--- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- include/linux/perf_event.h | 27 ++++++++++++++++++++++----- kernel/perf_event.c | 9 +-------- 5 files changed, 36 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 43b83c35cf5..ac2a8c2554d 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuhw->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) @@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu) for (i = cpuhw->n_txn_start; i < n; ++i) cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuhw->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 0ec92c8861d..beeb92fa3ac 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuc->event, n0, 1)) @@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; } /* @@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu) if (sparc_check_constraints(cpuc->event, cpuc->events, n)) return -EAGAIN; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a1..af04c6fa59c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event) * The events never got scheduled and ->cancel_txn will truncate * the event_list. */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) return; x86_pmu_stop(event); @@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuc->group_flag &= ~PERF_EVENT_TXN; /* * Truncate the collected events. */ @@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - /* - * Clear out the txn count so that ->cancel_txn() which gets - * run after ->commit_txn() doesn't undo things. - */ - cpuc->n_txn = 0; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 36efad90cd4..f1b6ba0770e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -549,7 +549,10 @@ struct hw_perf_event { struct perf_event; -#define PERF_EVENT_TXN_STARTED 1 +/* + * Common implementation detail of pmu::{start,commit,cancel}_txn + */ +#define PERF_EVENT_TXN 0x1 /** * struct pmu - generic performance monitoring unit @@ -563,14 +566,28 @@ struct pmu { void (*unthrottle) (struct perf_event *event); /* - * group events scheduling is treated as a transaction, - * add group events as a whole and perform one schedulability test. - * If test fails, roll back the whole group + * Group events scheduling is treated as a transaction, add group + * events as a whole and perform one schedulability test. If the test + * fails, roll back the whole group */ + /* + * Start the transaction, after this ->enable() doesn't need + * to do schedulability tests. + */ void (*start_txn) (const struct pmu *pmu); - void (*cancel_txn) (const struct pmu *pmu); + /* + * If ->start_txn() disabled the ->enable() schedulability test + * then ->commit_txn() is required to perform one. On success + * the transaction is closed. On error the transaction is kept + * open until ->cancel_txn() is called. + */ int (*commit_txn) (const struct pmu *pmu); + /* + * Will cancel the transaction, assumes ->disable() is called for + * each successfull ->enable() during the transaction. + */ + void (*cancel_txn) (const struct pmu *pmu); }; /** diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 227ed9c8ec3..6f60920772b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event, struct perf_event *event, *partial_group = NULL; const struct pmu *pmu = group_event->pmu; bool txn = false; - int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -703,15 +702,9 @@ group_sched_in(struct perf_event *group_event, } } - if (!txn) + if (!txn || !pmu->commit_txn(pmu)) return 0; - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); - return 0; - } - group_error: /* * Groups can be scheduled in as one unit only, so undo any -- cgit v1.2.3-70-g09d2 From ca5135e6b4a3cbc7e187737520fbc4b508f6f7a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 May 2010 19:33:23 +0200 Subject: perf: Rename perf_mmap_data to perf_buffer Rename to clarify code. s/perf_mmap_data/perf_buffer/g and selective s/data/buffer/g Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 +- kernel/perf_event.c | 308 ++++++++++++++++++++++----------------------- 2 files changed, 157 insertions(+), 157 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f1b6ba0770e..2a0da021c23 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -602,7 +602,7 @@ enum perf_event_active_state { struct file; -struct perf_mmap_data { +struct perf_buffer { atomic_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC @@ -727,7 +727,7 @@ struct perf_event { atomic_t mmap_count; int mmap_locked; struct user_struct *mmap_user; - struct perf_mmap_data *data; + struct perf_buffer *buffer; /* poll related */ wait_queue_head_t waitq; @@ -825,7 +825,7 @@ struct perf_cpu_context { struct perf_output_handle { struct perf_event *event; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long wakeup; unsigned long size; void *addr; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6f60920772b..93d545801e4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1876,7 +1876,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); -static void perf_mmap_data_put(struct perf_mmap_data *data); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { @@ -1892,9 +1892,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->data) { - perf_mmap_data_put(event->data); - event->data = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) @@ -2119,13 +2119,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -2335,14 +2335,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2376,15 +2376,15 @@ unlock: */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); - return virt_to_page(data->data_pages[pgoff - 1]); + return virt_to_page(buffer->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) @@ -2400,42 +2400,42 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(struct perf_event *event, int nr_pages) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = perf_mmap_alloc_page(event->cpu); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(event->cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = perf_mmap_alloc_page(event->cpu); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; - return data; + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2449,17 +2449,17 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); } -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { return 0; } @@ -2472,18 +2472,18 @@ static inline int page_order(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { - return data->page_order; + return buffer->page_order; } static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << page_order(data))) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2493,57 +2493,57 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << page_order(data); + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(struct perf_event *event, int nr_pages) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->page_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; - return data; + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2551,15 +2551,15 @@ fail: #endif -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer) { - return data->nr_pages << (PAGE_SHIFT + page_order(data)); + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -2569,14 +2569,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2592,51 +2592,51 @@ unlock: } static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer) { - long max_size = perf_data_size(data); + long max_size = perf_data_size(buffer); if (event->attr.watermark) { - data->watermark = min_t(long, max_size, + buffer->watermark = min_t(long, max_size, event->attr.wakeup_watermark); } - if (!data->watermark) - data->watermark = max_size / 2; + if (!buffer->watermark) + buffer->watermark = max_size / 2; - atomic_set(&data->refcount, 1); - rcu_assign_pointer(event->data, data); + atomic_set(&buffer->refcount, 1); + rcu_assign_pointer(event->buffer, buffer); } -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) { - if (!atomic_inc_not_zero(&data->refcount)) - data = NULL; + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; } rcu_read_unlock(); - return data; + return buffer; } -static void perf_mmap_data_put(struct perf_mmap_data *data) +static void perf_buffer_put(struct perf_buffer *buffer) { - if (!atomic_dec_and_test(&data->refcount)) + if (!atomic_dec_and_test(&buffer->refcount)) return; - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2651,16 +2651,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); + unsigned long size = perf_data_size(event->buffer); struct user_struct *user = event->mmap_user; - struct perf_mmap_data *data = event->data; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->data, NULL); + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); - perf_mmap_data_put(data); + perf_buffer_put(buffer); free_uid(user); } } @@ -2678,7 +2678,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2699,7 +2699,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2713,9 +2713,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->data) { - if (event->data->nr_pages == nr_pages) - atomic_inc(&event->data->refcount); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); else ret = -EINVAL; goto unlock; @@ -2745,17 +2745,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); - data = perf_mmap_data_alloc(event, nr_pages); - if (!data) { + buffer = perf_buffer_alloc(event, nr_pages); + if (!buffer) { ret = -ENOMEM; goto unlock; } - perf_mmap_data_init(event, data); + perf_buffer_init(event, buffer); if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + event->buffer->writable = 1; atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -2964,15 +2964,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2985,7 +2985,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; @@ -3005,45 +3005,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; preempt_disable(); - local_inc(&data->nest); - handle->wakeup = local_read(&data->wakeup); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; again: - head = local_read(&data->head); + head = local_read(&buffer->head); /* * IRQ/NMI can happen here, which means we can miss a head update. */ - if (!local_dec_and_test(&data->nest)) + if (!local_dec_and_test(&buffer->nest)) goto out; /* * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the data->head read and this + * by atomic_dec_and_test() order the buffer->head read and this * write. */ - data->user_page->data_head = head; + buffer->user_page->data_head = head; /* * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read data->head. + * barrier in atomic_dec_and_test() to re-read buffer->head. */ - if (unlikely(head != local_read(&data->head))) { - local_inc(&data->nest); + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (handle->wakeup != local_read(&data->wakeup)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); out: @@ -3063,12 +3063,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, buf += size; handle->size -= size; if (!handle->size) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; handle->page++; - handle->page &= data->nr_pages - 1; - handle->addr = data->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(data); + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); } } while (len); } @@ -3077,7 +3077,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -3093,19 +3093,19 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) + if (!buffer->nr_pages) goto out; - have_lost = local_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); @@ -3117,30 +3117,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. */ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = local_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (local_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - if (head - local_read(&data->wakeup) > data->watermark) - local_add(data->watermark, &data->wakeup); + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(data)); - handle->page &= data->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); - handle->addr = data->data_pages[handle->page]; + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(data)) - handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = local_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -3148,7 +3148,7 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - local_inc(&data->lost); + local_inc(&buffer->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3159,15 +3159,15 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = local_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - local_sub(wakeup_events, &data->events); - local_inc(&data->wakeup); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } @@ -5010,7 +5010,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_mmap_data *data = NULL, *old_data = NULL; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; if (!output_event) @@ -5040,19 +5040,19 @@ set: if (output_event) { /* get the buffer we want to redirect to */ - data = perf_mmap_data_get(output_event); - if (!data) + buffer = perf_buffer_get(output_event); + if (!buffer) goto unlock; } - old_data = event->data; - rcu_assign_pointer(event->data, data); + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_data) - perf_mmap_data_put(old_data); + if (old_buffer) + perf_buffer_put(old_buffer); out: return ret; } -- cgit v1.2.3-70-g09d2 From d57e34fdd60be7ffd0b1d86bfa1a553df86b7172 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 May 2010 19:41:35 +0200 Subject: perf: Simplify the ring-buffer logic: make perf_buffer_alloc() do everything needed Currently there are perf_buffer_alloc() + perf_buffer_init() + some separate bits, fold it all into a single perf_buffer_alloc() and only leave the attachment to the event separate. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/perf_event.c | 61 ++++++++++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2a0da021c23..441992a9775 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -602,6 +602,8 @@ enum perf_event_active_state { struct file; +#define PERF_BUFFER_WRITABLE 0x01 + struct perf_buffer { atomic_t refcount; struct rcu_head rcu_head; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 93d545801e4..f75c9c9c817 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2369,6 +2369,25 @@ unlock: rcu_read_unlock(); } +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2401,7 +2420,7 @@ static void *perf_mmap_alloc_page(int cpu) } static struct perf_buffer * -perf_buffer_alloc(struct perf_event *event, int nr_pages) +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *buffer; unsigned long size; @@ -2414,18 +2433,20 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages) if (!buffer) goto fail; - buffer->user_page = perf_mmap_alloc_page(event->cpu); + buffer->user_page = perf_mmap_alloc_page(cpu); if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu); + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); if (!buffer->data_pages[i]) goto fail_data_pages; } buffer->nr_pages = nr_pages; + perf_buffer_init(buffer, watermark, flags); + return buffer; fail_data_pages: @@ -2516,7 +2537,7 @@ static void perf_buffer_free(struct perf_buffer *buffer) } static struct perf_buffer * -perf_buffer_alloc(struct perf_event *event, int nr_pages) +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *buffer; unsigned long size; @@ -2540,6 +2561,8 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages) buffer->page_order = ilog2(nr_pages); buffer->nr_pages = 1; + perf_buffer_init(buffer, watermark, flags); + return buffer; fail_all_buf: @@ -2591,23 +2614,6 @@ unlock: return ret; } -static void -perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer) -{ - long max_size = perf_data_size(buffer); - - if (event->attr.watermark) { - buffer->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - atomic_set(&buffer->refcount, 1); - rcu_assign_pointer(event->buffer, buffer); -} - static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { struct perf_buffer *buffer; @@ -2682,7 +2688,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -2747,15 +2753,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON(event->buffer); - buffer = perf_buffer_alloc(event, nr_pages); + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; + + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); if (!buffer) { ret = -ENOMEM; goto unlock; } - - perf_buffer_init(event, buffer); - if (vma->vm_flags & VM_WRITE) - event->buffer->writable = 1; + rcu_assign_pointer(event->buffer, buffer); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; -- cgit v1.2.3-70-g09d2 From 1996bda2a42480c275656233e631ee0966574be4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:05:13 +0200 Subject: arch: Implement local64_t On 64bit, local_t is of size long, and thus we make local64_t an alias. On 32bit, we fall back to atomic64_t. (architecture can provide optimized 32-bit version) (This new facility is to be used by perf events optimizations.) Signed-off-by: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/local64.h | 1 + arch/arm/include/asm/local64.h | 1 + arch/avr32/include/asm/local64.h | 1 + arch/blackfin/include/asm/local64.h | 1 + arch/cris/include/asm/local64.h | 1 + arch/frv/include/asm/local64.h | 1 + arch/frv/kernel/local64.h | 1 + arch/h8300/include/asm/local64.h | 1 + arch/ia64/include/asm/local64.h | 1 + arch/m32r/include/asm/local64.h | 1 + arch/m68k/include/asm/local64.h | 1 + arch/microblaze/include/asm/local64.h | 1 + arch/mips/include/asm/local64.h | 1 + arch/mn10300/include/asm/local64.h | 1 + arch/parisc/include/asm/local64.h | 1 + arch/powerpc/include/asm/local64.h | 1 + arch/s390/include/asm/local64.h | 1 + arch/score/include/asm/local64.h | 1 + arch/sh/include/asm/local64.h | 1 + arch/sparc/include/asm/local64.h | 1 + arch/x86/include/asm/local64.h | 1 + arch/xtensa/include/asm/local64.h | 1 + include/asm-generic/local64.h | 96 +++++++++++++++++++++++++++++++++++ 23 files changed, 118 insertions(+) create mode 100644 arch/alpha/include/asm/local64.h create mode 100644 arch/arm/include/asm/local64.h create mode 100644 arch/avr32/include/asm/local64.h create mode 100644 arch/blackfin/include/asm/local64.h create mode 100644 arch/cris/include/asm/local64.h create mode 100644 arch/frv/include/asm/local64.h create mode 100644 arch/frv/kernel/local64.h create mode 100644 arch/h8300/include/asm/local64.h create mode 100644 arch/ia64/include/asm/local64.h create mode 100644 arch/m32r/include/asm/local64.h create mode 100644 arch/m68k/include/asm/local64.h create mode 100644 arch/microblaze/include/asm/local64.h create mode 100644 arch/mips/include/asm/local64.h create mode 100644 arch/mn10300/include/asm/local64.h create mode 100644 arch/parisc/include/asm/local64.h create mode 100644 arch/powerpc/include/asm/local64.h create mode 100644 arch/s390/include/asm/local64.h create mode 100644 arch/score/include/asm/local64.h create mode 100644 arch/sh/include/asm/local64.h create mode 100644 arch/sparc/include/asm/local64.h create mode 100644 arch/x86/include/asm/local64.h create mode 100644 arch/xtensa/include/asm/local64.h create mode 100644 include/asm-generic/local64.h (limited to 'include') diff --git a/arch/alpha/include/asm/local64.h b/arch/alpha/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/alpha/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/arm/include/asm/local64.h b/arch/arm/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/arm/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/avr32/include/asm/local64.h b/arch/avr32/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/avr32/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/blackfin/include/asm/local64.h b/arch/blackfin/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/blackfin/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/cris/include/asm/local64.h b/arch/cris/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/cris/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/frv/include/asm/local64.h b/arch/frv/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/frv/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/frv/kernel/local64.h b/arch/frv/kernel/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/frv/kernel/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/h8300/include/asm/local64.h b/arch/h8300/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/h8300/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/ia64/include/asm/local64.h b/arch/ia64/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/ia64/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/m32r/include/asm/local64.h b/arch/m32r/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/m32r/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/m68k/include/asm/local64.h b/arch/m68k/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/m68k/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/microblaze/include/asm/local64.h b/arch/microblaze/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/microblaze/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/mips/include/asm/local64.h b/arch/mips/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/mips/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/mn10300/include/asm/local64.h b/arch/mn10300/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/mn10300/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/parisc/include/asm/local64.h b/arch/parisc/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/parisc/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/powerpc/include/asm/local64.h b/arch/powerpc/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/powerpc/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/s390/include/asm/local64.h b/arch/s390/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/s390/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/score/include/asm/local64.h b/arch/score/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/score/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/sh/include/asm/local64.h b/arch/sh/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/sh/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/sparc/include/asm/local64.h b/arch/sparc/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/sparc/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/x86/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/arch/xtensa/include/asm/local64.h b/arch/xtensa/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/xtensa/include/asm/local64.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-generic/local64.h b/include/asm-generic/local64.h new file mode 100644 index 00000000000..02ac760c1a8 --- /dev/null +++ b/include/asm-generic/local64.h @@ -0,0 +1,96 @@ +#ifndef _ASM_GENERIC_LOCAL64_H +#define _ASM_GENERIC_LOCAL64_H + +#include +#include + +/* + * A signed long type for operations which are atomic for a single CPU. + * Usually used in combination with per-cpu variables. + * + * This is the default implementation, which uses atomic64_t. Which is + * rather pointless. The whole point behind local64_t is that some processors + * can perform atomic adds and subtracts in a manner which is atomic wrt IRQs + * running on this CPU. local64_t allows exploitation of such capabilities. + */ + +/* Implement in terms of atomics. */ + +#if BITS_PER_LONG == 64 + +#include + +typedef struct { + local_t a; +} local64_t; + +#define LOCAL64_INIT(i) { LOCAL_INIT(i) } + +#define local64_read(l) local_read(&(l)->a) +#define local64_set(l,i) local_set((&(l)->a),(i)) +#define local64_inc(l) local_inc(&(l)->a) +#define local64_dec(l) local_dec(&(l)->a) +#define local64_add(i,l) local_add((i),(&(l)->a)) +#define local64_sub(i,l) local_sub((i),(&(l)->a)) + +#define local64_sub_and_test(i, l) local_sub_and_test((i), (&(l)->a)) +#define local64_dec_and_test(l) local_dec_and_test(&(l)->a) +#define local64_inc_and_test(l) local_inc_and_test(&(l)->a) +#define local64_add_negative(i, l) local_add_negative((i), (&(l)->a)) +#define local64_add_return(i, l) local_add_return((i), (&(l)->a)) +#define local64_sub_return(i, l) local_sub_return((i), (&(l)->a)) +#define local64_inc_return(l) local_inc_return(&(l)->a) + +#define local64_cmpxchg(l, o, n) local_cmpxchg((&(l)->a), (o), (n)) +#define local64_xchg(l, n) local_xchg((&(l)->a), (n)) +#define local64_add_unless(l, _a, u) local_add_unless((&(l)->a), (_a), (u)) +#define local64_inc_not_zero(l) local_inc_not_zero(&(l)->a) + +/* Non-atomic variants, ie. preemption disabled and won't be touched + * in interrupt, etc. Some archs can optimize this case well. */ +#define __local64_inc(l) local64_set((l), local64_read(l) + 1) +#define __local64_dec(l) local64_set((l), local64_read(l) - 1) +#define __local64_add(i,l) local64_set((l), local64_read(l) + (i)) +#define __local64_sub(i,l) local64_set((l), local64_read(l) - (i)) + +#else /* BITS_PER_LONG != 64 */ + +#include + +/* Don't use typedef: don't want them to be mixed with atomic_t's. */ +typedef struct { + atomic64_t a; +} local64_t; + +#define LOCAL64_INIT(i) { ATOMIC_LONG_INIT(i) } + +#define local64_read(l) atomic64_read(&(l)->a) +#define local64_set(l,i) atomic64_set((&(l)->a),(i)) +#define local64_inc(l) atomic64_inc(&(l)->a) +#define local64_dec(l) atomic64_dec(&(l)->a) +#define local64_add(i,l) atomic64_add((i),(&(l)->a)) +#define local64_sub(i,l) atomic64_sub((i),(&(l)->a)) + +#define local64_sub_and_test(i, l) atomic64_sub_and_test((i), (&(l)->a)) +#define local64_dec_and_test(l) atomic64_dec_and_test(&(l)->a) +#define local64_inc_and_test(l) atomic64_inc_and_test(&(l)->a) +#define local64_add_negative(i, l) atomic64_add_negative((i), (&(l)->a)) +#define local64_add_return(i, l) atomic64_add_return((i), (&(l)->a)) +#define local64_sub_return(i, l) atomic64_sub_return((i), (&(l)->a)) +#define local64_inc_return(l) atomic64_inc_return(&(l)->a) + +#define local64_cmpxchg(l, o, n) atomic64_cmpxchg((&(l)->a), (o), (n)) +#define local64_xchg(l, n) atomic64_xchg((&(l)->a), (n)) +#define local64_add_unless(l, _a, u) atomic64_add_unless((&(l)->a), (_a), (u)) +#define local64_inc_not_zero(l) atomic64_inc_not_zero(&(l)->a) + +/* Non-atomic variants, ie. preemption disabled and won't be touched + * in interrupt, etc. Some archs can optimize this case well. */ +#define __local64_inc(l) local64_set((l), local64_read(l) + 1) +#define __local64_dec(l) local64_set((l), local64_read(l) - 1) +#define __local64_add(i,l) local64_set((l), local64_read(l) + (i)) +#define __local64_sub(i,l) local64_set((l), local64_read(l) - (i)) + +#endif /* BITS_PER_LONG != 64 */ + +#endif /* _ASM_GENERIC_LOCAL64_H */ -- cgit v1.2.3-70-g09d2 From a6e6dea68c18f705957573ee5596097c7e82d0e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:27:58 +0200 Subject: perf: Add perf_event::child_count Only child counters adding back their values into the parent counter are responsible for cross-cpu updates to event->count. So if we pull that out into a new child_count variable, we get an event->count that is only modified locally. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/perf_event.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 441992a9775..f34dab9b275 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -671,6 +671,7 @@ struct perf_event { enum perf_event_active_state state; unsigned int attach_state; atomic64_t count; + atomic64_t child_count; /* * These are the total time in nanoseconds that the event diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ab4c0ffc271..a395fda2d94 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return atomic64_read(&event->count); + return atomic64_read(&event->count) + atomic64_read(&event->child_count); } static u64 perf_event_read(struct perf_event *event) @@ -5379,7 +5379,7 @@ static void sync_child_event(struct perf_event *child_event, /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_event->count); + atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, -- cgit v1.2.3-70-g09d2 From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:08 +0200 Subject: perf: Convert perf_event to local_t Since now all modification to event->count (and ->prev_count and ->period_left) are local to a cpu, change then to local64_t so we avoid the LOCK'ed ops. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/kernel/perf_event.c | 18 ++++++++--------- arch/powerpc/kernel/perf_event.c | 34 ++++++++++++++++---------------- arch/sh/kernel/perf_event.c | 6 +++--- arch/sparc/kernel/perf_event.c | 18 ++++++++--------- arch/x86/kernel/cpu/perf_event.c | 18 ++++++++--------- include/linux/perf_event.h | 7 ++++--- kernel/perf_event.c | 42 ++++++++++++++++++++-------------------- 7 files changed, 72 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index c45768614c8..5b7cfafc072 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -164,20 +164,20 @@ armpmu_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -185,7 +185,7 @@ armpmu_event_set_period(struct perf_event *event, if (left > (s64)armpmu->max_period) left = armpmu->max_period; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(idx, (u64)(-left) & 0xffffffff); @@ -204,18 +204,18 @@ armpmu_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -478,7 +478,7 @@ __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = armpmu->max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } err = 0; diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index ac2a8c2554d..af1d9a7c65d 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -410,15 +410,15 @@ static void power_pmu_read(struct perf_event *event) * Therefore we treat them like NMIs. */ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); } /* @@ -444,10 +444,10 @@ static void freeze_limited_counters(struct cpu_hw_events *cpuhw, if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); event->hw.idx = 0; delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } } @@ -462,7 +462,7 @@ static void thaw_limited_counters(struct cpu_hw_events *cpuhw, event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); perf_event_update_userpage(event); } } @@ -666,11 +666,11 @@ void hw_perf_enable(void) } val = 0; if (event->hw.sample_period) { - left = atomic64_read(&event->hw.period_left); + left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); event->hw.idx = idx; write_pmc(idx, val); perf_event_update_userpage(event); @@ -842,8 +842,8 @@ static void power_pmu_unthrottle(struct perf_event *event) if (left < 0x80000000L) val = 0x80000000L - left; write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); perf_enable(); local_irq_restore(flags); @@ -1109,7 +1109,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) event->hw.config = events[n]; event->hw.event_base = cflags[n]; event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. @@ -1147,16 +1147,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val, int record = 0; /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. */ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; if (period) { if (left <= 0) { left += period; @@ -1194,8 +1194,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); } diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 81b6de41ae5..7a3dc356725 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -185,10 +185,10 @@ static void sh_perf_event_update(struct perf_event *event, * this is the simplest approach for maintaining consistency. */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = sh_pmu->read(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -203,7 +203,7 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static void sh_pmu_disable(struct perf_event *event) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index beeb92fa3ac..8a6660da8e0 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -572,18 +572,18 @@ static u64 sparc_perf_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = read_pmc(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -591,27 +591,27 @@ again: static int sparc_perf_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (left > MAX_PERIOD) left = MAX_PERIOD; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); write_pmc(idx, (u64)(-left) & 0xffffffff); @@ -1087,7 +1087,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = MAX_PERIOD; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } return 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e199843db..2d0d2906927 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f34dab9b275..7342979f95f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -487,6 +487,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #define PERF_MAX_STACK_DEPTH 255 @@ -536,10 +537,10 @@ struct hw_perf_event { struct arch_hw_breakpoint info; #endif }; - atomic64_t prev_count; + local64_t prev_count; u64 sample_period; u64 last_period; - atomic64_t period_left; + local64_t period_left; u64 interrupts; u64 freq_time_stamp; @@ -670,7 +671,7 @@ struct perf_event { enum perf_event_active_state state; unsigned int attach_state; - atomic64_t count; + local64_t count; atomic64_t child_count; /* diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a395fda2d94..97c73018592 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1148,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1540,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; - if (atomic64_read(&hwc->period_left) > 8*sample_period) { + if (local64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); perf_event_stop(event); - atomic64_set(&hwc->period_left, 0); + local64_set(&hwc->period_left, 0); perf_event_start(event); perf_enable(); } @@ -1584,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_disable(); event->pmu->read(event); - now = atomic64_read(&event->count); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; @@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return atomic64_read(&event->count) + atomic64_read(&event->child_count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } static u64 perf_event_read(struct perf_event *event) @@ -2141,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -2359,7 +2359,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -4035,14 +4035,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -4081,7 +4081,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -4092,7 +4092,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); @@ -4383,8 +4383,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } static int cpu_clock_perf_event_enable(struct perf_event *event) @@ -4392,7 +4392,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&hwc->prev_count, cpu_clock(cpu)); perf_swevent_start_hrtimer(event); return 0; @@ -4424,9 +4424,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) u64 prev; s64 delta; - prev = atomic64_xchg(&event->hw.prev_count, now); + prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static int task_clock_perf_event_enable(struct perf_event *event) @@ -4436,7 +4436,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; - atomic64_set(&hwc->prev_count, now); + local64_set(&hwc->prev_count, now); perf_swevent_start_hrtimer(event); @@ -4879,7 +4879,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -5313,7 +5313,7 @@ inherit_event(struct perf_event *parent_event, hwc->sample_period = sample_period; hwc->last_period = sample_period; - atomic64_set(&hwc->period_left, sample_period); + local64_set(&hwc->period_left, sample_period); } child_event->overflow_handler = parent_event->overflow_handler; -- cgit v1.2.3-70-g09d2 From 7be7923633a142402266d642ccebf74f556a649b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Jun 2010 11:57:23 +0200 Subject: perf: Fix build breakage for architecutes without atomic64_t The local64.h include dependency was not dependent on PERF_EVENT=y, which meant that arch's without atomic64_t support ended up including it and failed to build. Signed-off-by: Peter Zijlstra LKML-Reference: --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7342979f95f..1218d05728b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -462,6 +462,7 @@ enum perf_callchain_context { #ifdef CONFIG_PERF_EVENTS # include +# include #endif struct perf_guest_info_callbacks { @@ -487,7 +488,6 @@ struct perf_guest_info_callbacks { #include #include #include -#include #define PERF_MAX_STACK_DEPTH 255 -- cgit v1.2.3-70-g09d2 From b3c5163fe0193a74016dba1bb22491e0d1e9aaa4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jun 2010 14:43:38 +0200 Subject: netfilter: nf_conntrack: per_cpu untracking NOTRACK makes all cpus share a cache line on nf_conntrack_untracked twice per packet, slowing down performance. This patch converts it to a per_cpu variable. Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 5 ++--- net/netfilter/nf_conntrack_core.c | 36 ++++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3bc38c70bbb..84a4b6fec99 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -261,11 +261,10 @@ extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Fake conntrack entry for untracked connections */ +DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); static inline struct nf_conn *nf_ct_untracked_get(void) { - extern struct nf_conn nf_conntrack_untracked; - - return &nf_conntrack_untracked; + return &__raw_get_cpu_var(nf_conntrack_untracked); } extern void nf_ct_untracked_status_or(unsigned long bits); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6c1da212380..9c661413b82 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -62,8 +62,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -struct nf_conn nf_conntrack_untracked; -EXPORT_SYMBOL_GPL(nf_conntrack_untracked); +DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -1183,10 +1183,21 @@ static void nf_ct_release_dying_list(struct net *net) spin_unlock_bh(&nf_conntrack_lock); } +static int untrack_refs(void) +{ + int cnt = 0, cpu; + + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + + cnt += atomic_read(&ct->ct_general.use) - 1; + } + return cnt; +} + static void nf_conntrack_cleanup_init_net(void) { - /* wait until all references to nf_conntrack_untracked are dropped */ - while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + while (untrack_refs() > 0) schedule(); nf_conntrack_helper_fini(); @@ -1323,14 +1334,17 @@ module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, void nf_ct_untracked_status_or(unsigned long bits) { - nf_conntrack_untracked.status |= bits; + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(nf_conntrack_untracked, cpu).status |= bits; } EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); static int nf_conntrack_init_init_net(void) { int max_factor = 8; - int ret; + int ret, cpu; /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1369,10 +1383,12 @@ static int nf_conntrack_init_init_net(void) goto err_extend; #endif /* Set up fake conntrack: to never be deleted, not in any hashes */ -#ifdef CONFIG_NET_NS - nf_conntrack_untracked.ct_net = &init_net; -#endif - atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + + write_pnet(&ct->ct_net, &init_net); + atomic_set(&ct->ct_general.use, 1); + } /* - and look it like as a confirmed connection */ nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); return 0; -- cgit v1.2.3-70-g09d2 From 039ca4e74a1cf60bd7487324a564ecf5c981f254 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 26 May 2010 17:22:17 +0800 Subject: tracing: Remove kmemtrace ftrace plugin We have been resisting new ftrace plugins and removing existing ones, and kmemtrace has been superseded by kmem trace events and perf-kmem, so we remove it. Signed-off-by: Li Zefan Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Ingo Molnar Cc: Steven Rostedt [ remove kmemtrace from the makefile, handle slob too ] Signed-off-by: Frederic Weisbecker --- Documentation/ABI/testing/debugfs-kmemtrace | 71 ---- Documentation/trace/kmemtrace.txt | 126 ------- MAINTAINERS | 7 - include/linux/kmemtrace.h | 25 -- include/linux/slab_def.h | 3 +- include/linux/slub_def.h | 3 +- init/main.c | 2 - kernel/trace/Kconfig | 20 -- kernel/trace/Makefile | 1 - kernel/trace/kmemtrace.c | 529 ---------------------------- kernel/trace/trace.h | 12 - kernel/trace/trace_entries.h | 35 -- mm/slab.c | 1 - mm/slob.c | 4 +- mm/slub.c | 1 - 15 files changed, 7 insertions(+), 833 deletions(-) delete mode 100644 Documentation/ABI/testing/debugfs-kmemtrace delete mode 100644 Documentation/trace/kmemtrace.txt delete mode 100644 include/linux/kmemtrace.h delete mode 100644 kernel/trace/kmemtrace.c (limited to 'include') diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace deleted file mode 100644 index 5e6a92a02d8..00000000000 --- a/Documentation/ABI/testing/debugfs-kmemtrace +++ /dev/null @@ -1,71 +0,0 @@ -What: /sys/kernel/debug/kmemtrace/ -Date: July 2008 -Contact: Eduard - Gabriel Munteanu -Description: - -In kmemtrace-enabled kernels, the following files are created: - -/sys/kernel/debug/kmemtrace/ - cpu (0400) Per-CPU tracing data, see below. (binary) - total_overruns (0400) Total number of bytes which were dropped from - cpu files because of full buffer condition, - non-binary. (text) - abi_version (0400) Kernel's kmemtrace ABI version. (text) - -Each per-CPU file should be read according to the relay interface. That is, -the reader should set affinity to that specific CPU and, as currently done by -the userspace application (though there are other methods), use poll() with -an infinite timeout before every read(). Otherwise, erroneous data may be -read. The binary data has the following _core_ format: - - Event ID (1 byte) Unsigned integer, one of: - 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) - 1 - represents a freeing of previously allocated memory - (KMEMTRACE_EVENT_FREE) - Type ID (1 byte) Unsigned integer, one of: - 0 - this is a kmalloc() / kfree() - 1 - this is a kmem_cache_alloc() / kmem_cache_free() - 2 - this is a __get_free_pages() et al. - Event size (2 bytes) Unsigned integer representing the - size of this event. Used to extend - kmemtrace. Discard the bytes you - don't know about. - Sequence number (4 bytes) Signed integer used to reorder data - logged on SMP machines. Wraparound - must be taken into account, although - it is unlikely. - Caller address (8 bytes) Return address to the caller. - Pointer to mem (8 bytes) Pointer to target memory area. Can be - NULL, but not all such calls might be - recorded. - -In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: - - Requested bytes (8 bytes) Total number of requested bytes, - unsigned, must not be zero. - Allocated bytes (8 bytes) Total number of actually allocated - bytes, unsigned, must not be lower - than requested bytes. - Requested flags (4 bytes) GFP flags supplied by the caller. - Target CPU (4 bytes) Signed integer, valid for event id 1. - If equal to -1, target CPU is the same - as origin CPU, but the reverse might - not be true. - -The data is made available in the same endianness the machine has. - -Other event ids and type ids may be defined and added. Other fields may be -added by increasing event size, but see below for details. -Every modification to the ABI, including new id definitions, are followed -by bumping the ABI version by one. - -Adding new data to the packet (features) is done at the end of the mandatory -data: - Feature size (2 byte) - Feature ID (1 byte) - Feature data (Feature size - 3 bytes) - - -Users: - kmemtrace-user - git://repo.or.cz/kmemtrace-user.git - diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt deleted file mode 100644 index 6308735e58c..00000000000 --- a/Documentation/trace/kmemtrace.txt +++ /dev/null @@ -1,126 +0,0 @@ - kmemtrace - Kernel Memory Tracer - - by Eduard - Gabriel Munteanu - - -I. Introduction -=============== - -kmemtrace helps kernel developers figure out two things: -1) how different allocators (SLAB, SLUB etc.) perform -2) how kernel code allocates memory and how much - -To do this, we trace every allocation and export information to the userspace -through the relay interface. We export things such as the number of requested -bytes, the number of bytes actually allocated (i.e. including internal -fragmentation), whether this is a slab allocation or a plain kmalloc() and so -on. - -The actual analysis is performed by a userspace tool (see section III for -details on where to get it from). It logs the data exported by the kernel, -processes it and (as of writing this) can provide the following information: -- the total amount of memory allocated and fragmentation per call-site -- the amount of memory allocated and fragmentation per allocation -- total memory allocated and fragmentation in the collected dataset -- number of cross-CPU allocation and frees (makes sense in NUMA environments) - -Moreover, it can potentially find inconsistent and erroneous behavior in -kernel code, such as using slab free functions on kmalloc'ed memory or -allocating less memory than requested (but not truly failed allocations). - -kmemtrace also makes provisions for tracing on some arch and analysing the -data on another. - -II. Design and goals -==================== - -kmemtrace was designed to handle rather large amounts of data. Thus, it uses -the relay interface to export whatever is logged to userspace, which then -stores it. Analysis and reporting is done asynchronously, that is, after the -data is collected and stored. By design, it allows one to log and analyse -on different machines and different arches. - -As of writing this, the ABI is not considered stable, though it might not -change much. However, no guarantees are made about compatibility yet. When -deemed stable, the ABI should still allow easy extension while maintaining -backward compatibility. This is described further in Documentation/ABI. - -Summary of design goals: - - allow logging and analysis to be done across different machines - - be fast and anticipate usage in high-load environments (*) - - be reasonably extensible - - make it possible for GNU/Linux distributions to have kmemtrace - included in their repositories - -(*) - one of the reasons Pekka Enberg's original userspace data analysis - tool's code was rewritten from Perl to C (although this is more than a - simple conversion) - - -III. Quick usage guide -====================== - -1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE). - -2) Get the userspace tool and build it: -$ git clone git://repo.or.cz/kmemtrace-user.git # current repository -$ cd kmemtrace-user/ -$ ./autogen.sh -$ ./configure -$ make - -3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the -'single' runlevel (so that relay buffers don't fill up easily), and run -kmemtrace: -# '$' does not mean user, but root here. -$ mount -t debugfs none /sys/kernel/debug -$ mount -t proc none /proc -$ cd path/to/kmemtrace-user/ -$ ./kmemtraced -Wait a bit, then stop it with CTRL+C. -$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't - # overrun, should - # be zero. -$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to - check its correctness] -$ ./kmemtrace-report - -Now you should have a nice and short summary of how the allocator performs. - -IV. FAQ and known issues -======================== - -Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix -this? Should I worry? -A: If it's non-zero, this affects kmemtrace's accuracy, depending on how -large the number is. You can fix it by supplying a higher -'kmemtrace.subbufs=N' kernel parameter. ---- - -Q: kmemtrace_check reports errors, how do I fix this? Should I worry? -A: This is a bug and should be reported. It can occur for a variety of -reasons: - - possible bugs in relay code - - possible misuse of relay by kmemtrace - - timestamps being collected unorderly -Or you may fix it yourself and send us a patch. ---- - -Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? -A: This is a known issue and I'm working on it. These might be true errors -in kernel code, which may have inconsistent behavior (e.g. allocating memory -with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed -out this behavior may work with SLAB, but may fail with other allocators. - -It may also be due to lack of tracing in some unusual allocator functions. - -We don't want bug reports regarding this issue yet. ---- - -V. See also -=========== - -Documentation/kernel-parameters.txt -Documentation/ABI/testing/debugfs-kmemtrace - diff --git a/MAINTAINERS b/MAINTAINERS index 33047a60543..67e6e9d848d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3361,13 +3361,6 @@ F: include/linux/kmemleak.h F: mm/kmemleak.c F: mm/kmemleak-test.c -KMEMTRACE -M: Eduard - Gabriel Munteanu -S: Maintained -F: Documentation/trace/kmemtrace.txt -F: include/linux/kmemtrace.h -F: kernel/trace/kmemtrace.c - KPROBES M: Ananth N Mavinakayanahalli M: Anil S Keshavamurthy diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index b616d3930c3..00000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include - -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 1812dac8c49..1acfa73ce2a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,8 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include + +#include #ifndef ARCH_KMALLOC_MINALIGN /* diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 55695c8d2f8..2345d3a033e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,9 +10,10 @@ #include #include #include -#include #include +#include + enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ diff --git a/init/main.c b/init/main.c index 94f65efdc65..e2a2bf3a169 100644 --- a/init/main.c +++ b/init/main.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include #include @@ -652,7 +651,6 @@ asmlinkage void __init start_kernel(void) #endif page_cgroup_init(); enable_debug_pagealloc(); - kmemtrace_init(); kmemleak_init(); debug_objects_mem_init(); idr_init_cache(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 572992abc71..f669092fdea 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -354,26 +354,6 @@ config STACK_TRACER Say N if unsure. -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c3aaeba8237..469a1c7555a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660..00000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg - * Copyright (C) 2008 Frederic Weisbecker - */ - -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void -kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(void *ignore, - unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree, NULL); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - unregister_trace_kfree(kmemtrace_kfree, NULL); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event_functions kmem_trace_alloc_funcs = { - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .funcs = &kmem_trace_alloc_funcs, -}; - -static struct trace_event_functions kmem_trace_free_funcs = { - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .funcs = &kmem_trace_free_funcs, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 75a5e800a73..075cd2ea84a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -30,19 +29,12 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; #undef __field #define __field(type, item) type item; @@ -208,10 +200,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c293364c984..13abc157dba 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -291,41 +291,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - FTRACE_ENTRY(ksym_trace, ksym_trace_entry, TRACE_KSYM, diff --git a/mm/slab.c b/mm/slab.c index e49f8f46f46..47360c3e5ab 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index 23631e2bb57..a82ab5811bd 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -66,8 +66,10 @@ #include #include #include -#include #include + +#include + #include /* diff --git a/mm/slub.c b/mm/slub.c index 26f0cb9cc58..a61f1aad107 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From 88e7594a9775e54dcd421cb246406dce62e48bee Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Jun 2010 03:27:39 +0000 Subject: phonet: use call_rcu for phonet device free Use call_rcu rather than synchronize_rcu. Signed-off-by: Jiri Pirko Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/phonet/pn_dev.h | 1 + net/phonet/pn_dev.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index d7b989ca3d6..2d16783d5e2 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -34,6 +34,7 @@ struct phonet_device { struct list_head list; struct net_device *netdev; DECLARE_BITMAP(addrs, 64); + struct rcu_head rcu; }; int phonet_device_init(void); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index c33da657694..b18e48fae97 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -162,6 +162,14 @@ int phonet_address_add(struct net_device *dev, u8 addr) return err; } +static void phonet_device_rcu_free(struct rcu_head *head) +{ + struct phonet_device *pnd; + + pnd = container_of(head, struct phonet_device, rcu); + kfree(pnd); +} + int phonet_address_del(struct net_device *dev, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); @@ -179,10 +187,9 @@ int phonet_address_del(struct net_device *dev, u8 addr) pnd = NULL; mutex_unlock(&pndevs->lock); - if (pnd) { - synchronize_rcu(); - kfree(pnd); - } + if (pnd) + call_rcu(&pnd->rcu, phonet_device_rcu_free); + return err; } -- cgit v1.2.3-70-g09d2 From 8e4b50f94e8c1435a3e0ece42b7f97bc857d0145 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:26:28 +0200 Subject: firewire: core: add CSR SPLIT_TIMEOUT support Implement the SPLIT_TIMEOUT registers. Besides being required by the spec, this is desirable for some IIDC devices and necessary for many audio devices to be able to increase the timeout from userspace. Signed-off-by: Clemens Ladisch --- drivers/firewire/core-card.c | 4 ++ drivers/firewire/core-transaction.c | 76 +++++++++++++++++++++++++++++++------ include/linux/firewire.h | 5 +++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 901435cdd5c..d0f15c2f1e1 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -428,6 +428,10 @@ void fw_card_initialize(struct fw_card *card, card->device = device; card->current_tlabel = 0; card->tlabel_mask = 0; + card->split_timeout_hi = 0; + card->split_timeout_lo = 800 << 19; + card->split_timeout_cycles = 800; + card->split_timeout_jiffies = DIV_ROUND_UP(HZ, 10); card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 0034229dfd1..9a7d3ec23f2 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -339,7 +339,8 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, setup_timer(&t->split_timeout_timer, split_transaction_timeout_callback, (unsigned long)t); /* FIXME: start this timer later, relative to t->timestamp */ - mod_timer(&t->split_timeout_timer, jiffies + DIV_ROUND_UP(HZ, 10)); + mod_timer(&t->split_timeout_timer, + jiffies + card->split_timeout_jiffies); t->callback = callback; t->callback_data = callback_data; @@ -673,11 +674,28 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header, } EXPORT_SYMBOL(fw_fill_response); -static struct fw_request *allocate_request(struct fw_packet *p) +static u32 compute_split_timeout_timestamp(struct fw_card *card, + u32 request_timestamp) +{ + unsigned int cycles; + u32 timestamp; + + cycles = card->split_timeout_cycles; + cycles += request_timestamp & 0x1fff; + + timestamp = request_timestamp & ~0x1fff; + timestamp += (cycles / 8000) << 13; + timestamp |= cycles % 8000; + + return timestamp; +} + +static struct fw_request *allocate_request(struct fw_card *card, + struct fw_packet *p) { struct fw_request *request; u32 *data, length; - int request_tcode, t; + int request_tcode; request_tcode = HEADER_GET_TCODE(p->header[0]); switch (request_tcode) { @@ -712,14 +730,9 @@ static struct fw_request *allocate_request(struct fw_packet *p) if (request == NULL) return NULL; - t = (p->timestamp & 0x1fff) + 4000; - if (t >= 8000) - t = (p->timestamp & ~0x1fff) + 0x2000 + t - 8000; - else - t = (p->timestamp & ~0x1fff) + t; - request->response.speed = p->speed; - request->response.timestamp = t; + request->response.timestamp = + compute_split_timeout_timestamp(card, p->timestamp); request->response.generation = p->generation; request->response.ack = 0; request->response.callback = free_response_callback; @@ -845,7 +858,7 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE) return; - request = allocate_request(p); + request = allocate_request(card, p); if (request == NULL) { /* FIXME: send statically allocated busy packet. */ return; @@ -993,6 +1006,19 @@ static u32 read_state_register(struct fw_card *card) return 0; } +static void update_split_timeout(struct fw_card *card) +{ + unsigned int cycles; + + cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19); + + cycles = max(cycles, 800u); /* minimum as per the spec */ + cycles = min(cycles, 3u * 8000u); /* maximum OHCI timeout */ + + card->split_timeout_cycles = cycles; + card->split_timeout_jiffies = DIV_ROUND_UP(cycles * HZ, 8000); +} + static void handle_registers(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, int generation, int speed, unsigned long long offset, @@ -1001,6 +1027,7 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, int reg = offset & ~CSR_REGISTER_BASE; __be32 *data = payload; int rcode = RCODE_COMPLETE; + unsigned long flags; switch (reg) { case CSR_STATE_CLEAR: @@ -1039,6 +1066,33 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, rcode = RCODE_TYPE_ERROR; break; + case CSR_SPLIT_TIMEOUT_HI: + if (tcode == TCODE_READ_QUADLET_REQUEST) { + *data = cpu_to_be32(card->split_timeout_hi); + } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { + spin_lock_irqsave(&card->lock, flags); + card->split_timeout_hi = be32_to_cpu(*data) & 7; + update_split_timeout(card); + spin_unlock_irqrestore(&card->lock, flags); + } else { + rcode = RCODE_TYPE_ERROR; + } + break; + + case CSR_SPLIT_TIMEOUT_LO: + if (tcode == TCODE_READ_QUADLET_REQUEST) { + *data = cpu_to_be32(card->split_timeout_lo); + } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { + spin_lock_irqsave(&card->lock, flags); + card->split_timeout_lo = + be32_to_cpu(*data) & 0xfff80000; + update_split_timeout(card); + spin_unlock_irqrestore(&card->lock, flags); + } else { + rcode = RCODE_TYPE_ERROR; + } + break; + case CSR_CYCLE_TIME: if (TCODE_IS_READ_REQUEST(tcode) && length == 4) *data = cpu_to_be32(card->driver-> diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 72e2b8ac2a5..cdf8213c68c 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -89,6 +89,11 @@ struct fw_card { struct list_head transaction_list; unsigned long reset_jiffies; + u32 split_timeout_hi; + u32 split_timeout_lo; + unsigned int split_timeout_cycles; + unsigned int split_timeout_jiffies; + unsigned long long guid; unsigned max_receive; int link_speed; -- cgit v1.2.3-70-g09d2 From a1a1132bd83d0aea51d4f19be4b4a58a064a0131 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:35:06 +0200 Subject: firewire: add CSR PRIORITY_BUDGET support If supported by the OHCI controller, implement the PRIORITY_BUDGET register, which is required for nodes that can use asynchronous priority arbitration. To allow the core to determine what features the lowlevel device supports, add a new card driver callback. Signed-off-by: Clemens Ladisch --- drivers/firewire/core-transaction.c | 14 ++++++++++++++ drivers/firewire/core.h | 4 ++++ drivers/firewire/ohci.c | 27 +++++++++++++++++++++++++++ include/linux/firewire.h | 1 + 4 files changed, 46 insertions(+) (limited to 'include') diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 8146133818d..a61eb3fb957 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1126,6 +1126,20 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, rcode = RCODE_TYPE_ERROR; break; + case CSR_PRIORITY_BUDGET: + if (!(card->driver->get_features(card) & + FEATURE_PRIORITY_BUDGET)) + rcode = RCODE_ADDRESS_ERROR; + else if (tcode == TCODE_READ_QUADLET_REQUEST) + *data = cpu_to_be32(card->driver-> + read_csr_reg(card, CSR_PRIORITY_BUDGET)); + else if (tcode == TCODE_WRITE_QUADLET_REQUEST) + card->driver->write_csr_reg(card, CSR_PRIORITY_BUDGET, + be32_to_cpu(*data)); + else + rcode = RCODE_TYPE_ERROR; + break; + case CSR_BROADCAST_CHANNEL: if (tcode == TCODE_READ_QUADLET_REQUEST) *data = cpu_to_be32(card->broadcast_channel); diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index efcdeb2e31e..3b8c0f042f4 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -38,6 +38,8 @@ struct fw_packet; #define BROADCAST_CHANNEL_INITIAL (1 << 31 | 31) #define BROADCAST_CHANNEL_VALID (1 << 30) +#define FEATURE_PRIORITY_BUDGET 0x01 + struct fw_card_driver { /* * Enable the given card with the given initial config rom. @@ -78,6 +80,8 @@ struct fw_card_driver { u32 (*read_csr_reg)(struct fw_card *card, int csr_offset); void (*write_csr_reg)(struct fw_card *card, int csr_offset, u32 value); + unsigned int (*get_features)(struct fw_card *card); + struct fw_iso_context * (*allocate_iso_context)(struct fw_card *card, int type, int channel, size_t header_size); diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 9c588fd0125..0e541353178 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -170,6 +170,7 @@ struct fw_ohci { int generation; int request_generation; /* for timestamping incoming requests */ unsigned quirks; + unsigned int pri_req_max; u32 bus_time; /* @@ -1738,6 +1739,11 @@ static int ohci_enable(struct fw_card *card, reg_write(ohci, OHCI1394_IsochronousCycleTimer, seconds << 25); ohci->bus_time = seconds & ~0x3f; + /* Get implemented bits of the priority arbitration request counter. */ + reg_write(ohci, OHCI1394_FairnessControl, 0x3f); + ohci->pri_req_max = reg_read(ohci, OHCI1394_FairnessControl) & 0x3f; + reg_write(ohci, OHCI1394_FairnessControl, 0); + ar_context_run(&ohci->ar_request_ctx); ar_context_run(&ohci->ar_response_ctx); @@ -2028,6 +2034,10 @@ static u32 ohci_read_csr_reg(struct fw_card *card, int csr_offset) value = reg_read(ohci, OHCI1394_ATRetries); return (value >> 4) & 0x0ffff00f; + case CSR_PRIORITY_BUDGET: + return (reg_read(ohci, OHCI1394_FairnessControl) & 0x3f) | + (ohci->pri_req_max << 8); + default: WARN_ON(1); return 0; @@ -2065,12 +2075,28 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value) flush_writes(ohci); break; + case CSR_PRIORITY_BUDGET: + reg_write(ohci, OHCI1394_FairnessControl, value & 0x3f); + flush_writes(ohci); + break; + default: WARN_ON(1); break; } } +static unsigned int ohci_get_features(struct fw_card *card) +{ + struct fw_ohci *ohci = fw_ohci(card); + unsigned int features = 0; + + if (ohci->pri_req_max != 0) + features |= FEATURE_PRIORITY_BUDGET; + + return features; +} + static void copy_iso_headers(struct iso_context *ctx, void *p) { int i = ctx->header_length; @@ -2510,6 +2536,7 @@ static const struct fw_card_driver ohci_driver = { .enable_phys_dma = ohci_enable_phys_dma, .read_csr_reg = ohci_read_csr_reg, .write_csr_reg = ohci_write_csr_reg, + .get_features = ohci_get_features, .allocate_iso_context = ohci_allocate_iso_context, .free_iso_context = ohci_free_iso_context, diff --git a/include/linux/firewire.h b/include/linux/firewire.h index cdf8213c68c..a50377d9125 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -32,6 +32,7 @@ #define CSR_CYCLE_TIME 0x200 #define CSR_BUS_TIME 0x204 #define CSR_BUSY_TIMEOUT 0x210 +#define CSR_PRIORITY_BUDGET 0x218 #define CSR_BUS_MANAGER_ID 0x21c #define CSR_BANDWIDTH_AVAILABLE 0x220 #define CSR_CHANNELS_AVAILABLE 0x224 -- cgit v1.2.3-70-g09d2 From 3d1f46eb60b155c705e389ecdf313f11b4b91976 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:35:37 +0200 Subject: firewire: core: add CSR MAINT_UTILITY support Implement the MAIN_UTILITY register, which is utterly optional but useful as a safe target for diagnostic read/write/broadcast transactions. Signed-off-by: Clemens Ladisch --- drivers/firewire/core-transaction.c | 9 +++++++++ include/linux/firewire.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index a61eb3fb957..dd8ef650a7c 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1140,6 +1140,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, rcode = RCODE_TYPE_ERROR; break; + case CSR_MAINT_UTILITY: + if (tcode == TCODE_READ_QUADLET_REQUEST) + *data = card->maint_utility_register; + else if (tcode == TCODE_WRITE_QUADLET_REQUEST) + card->maint_utility_register = *data; + else + rcode = RCODE_TYPE_ERROR; + break; + case CSR_BROADCAST_CHANNEL: if (tcode == TCODE_READ_QUADLET_REQUEST) *data = cpu_to_be32(card->broadcast_channel); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index a50377d9125..f1160e831da 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -38,6 +38,7 @@ #define CSR_CHANNELS_AVAILABLE 0x224 #define CSR_CHANNELS_AVAILABLE_HI 0x224 #define CSR_CHANNELS_AVAILABLE_LO 0x228 +#define CSR_MAINT_UTILITY 0x230 #define CSR_BROADCAST_CHANNEL 0x234 #define CSR_CONFIG_ROM 0x400 #define CSR_CONFIG_ROM_END 0x800 @@ -122,6 +123,8 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + + __be32 maint_utility_register; }; struct fw_attribute_group { -- cgit v1.2.3-70-g09d2 From 7e0e314f198d5048b74c8f0ef9f4c1c02e5ecfc9 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:37:15 +0200 Subject: firewire: core: add CSR abdicate support Implement the abdicate bit, which is required for bus manager capable nodes and tested by the Base 1394 Test Suite. Finally, something to do at a command reset! :-) Signed-off-by: Clemens Ladisch --- drivers/firewire/core-card.c | 3 ++- drivers/firewire/core-topology.c | 2 ++ drivers/firewire/core-transaction.c | 13 +++++++++++-- drivers/firewire/core.h | 1 + include/linux/firewire.h | 2 ++ 5 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index d0f15c2f1e1..7c4cf6cfa74 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -260,7 +260,8 @@ static void fw_card_bm_work(struct work_struct *work) grace = time_after(jiffies, card->reset_jiffies + DIV_ROUND_UP(HZ, 8)); - if (is_next_generation(generation, card->bm_generation) || + if ((is_next_generation(generation, card->bm_generation) && + !card->bm_abdicate) || (card->bm_generation != generation && grace)) { /* * This first step is to figure out who is IRM and diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 93ec64cdeef..ca3c6531816 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -552,6 +552,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, smp_wmb(); card->generation = generation; card->reset_jiffies = jiffies; + card->bm_abdicate = card->csr_abdicate; + card->csr_abdicate = false; fw_schedule_bm_work(card, 0); local_node = build_tree(card, self_ids, self_id_count); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index e0c6cce894c..85a54da243e 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1008,6 +1008,10 @@ static u32 read_state_register(struct fw_card *card) /* Bit 8 (cmstr): */ value |= card->driver->read_csr_reg(card, CSR_STATE_CLEAR); + /* Bit 10 (abdicate): */ + if (card->csr_abdicate) + value |= CSR_STATE_BIT_ABDICATE; + return value; } @@ -1041,6 +1045,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { card->driver->write_csr_reg(card, CSR_STATE_CLEAR, be32_to_cpu(*data)); + if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE)) + card->csr_abdicate = false; } else { rcode = RCODE_TYPE_ERROR; } @@ -1052,7 +1058,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { card->driver->write_csr_reg(card, CSR_STATE_SET, be32_to_cpu(*data)); - /* FIXME: implement abdicate */ + if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE)) + card->csr_abdicate = true; } else { rcode = RCODE_TYPE_ERROR; } @@ -1070,7 +1077,9 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, break; case CSR_RESET_START: - if (tcode != TCODE_WRITE_QUADLET_REQUEST) + if (tcode == TCODE_WRITE_QUADLET_REQUEST) + card->csr_abdicate = false; + else rcode = RCODE_TYPE_ERROR; break; diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index aaecdd1c176..a9ace1f8dc3 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -41,6 +41,7 @@ struct fw_packet; #define FEATURE_PRIORITY_BUDGET 0x01 #define CSR_STATE_BIT_CMSTR (1 << 8) +#define CSR_STATE_BIT_ABDICATE (1 << 10) struct fw_card_driver { /* diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f1160e831da..4d22643215e 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -119,6 +119,8 @@ struct fw_card { int bm_retries; int bm_generation; __be32 bm_transaction_data[2]; + bool bm_abdicate; /* value of csr_abdicate before last bus reset */ + bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */ bool broadcast_channel_allocated; u32 broadcast_channel; -- cgit v1.2.3-70-g09d2 From fb76dd10b91146e9cefbb3cd4e6812c5a95ee43b Mon Sep 17 00:00:00 2001 From: Luotao Fu Date: Thu, 10 Jun 2010 12:05:23 -0700 Subject: Input: matrix_keypad - add support for clustered irq This one adds support of a combined irq source for the whole matrix keypad. This can be useful if all rows and columns of the keypad are e.g. connected to a GPIO expander, which only has one interrupt line for all events on every single GPIO. Signed-off-by: Luotao Fu Acked-by: Eric Miao Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/matrix_keypad.c | 108 ++++++++++++++++++++++++--------- include/linux/input/matrix_keypad.h | 6 ++ 2 files changed, 86 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c index b443e088fd3..b02e4268e18 100644 --- a/drivers/input/keyboard/matrix_keypad.c +++ b/drivers/input/keyboard/matrix_keypad.c @@ -37,6 +37,7 @@ struct matrix_keypad { spinlock_t lock; bool scan_pending; bool stopped; + bool gpio_all_disabled; }; /* @@ -87,8 +88,12 @@ static void enable_row_irqs(struct matrix_keypad *keypad) const struct matrix_keypad_platform_data *pdata = keypad->pdata; int i; - for (i = 0; i < pdata->num_row_gpios; i++) - enable_irq(gpio_to_irq(pdata->row_gpios[i])); + if (pdata->clustered_irq > 0) + enable_irq(pdata->clustered_irq); + else { + for (i = 0; i < pdata->num_row_gpios; i++) + enable_irq(gpio_to_irq(pdata->row_gpios[i])); + } } static void disable_row_irqs(struct matrix_keypad *keypad) @@ -96,8 +101,12 @@ static void disable_row_irqs(struct matrix_keypad *keypad) const struct matrix_keypad_platform_data *pdata = keypad->pdata; int i; - for (i = 0; i < pdata->num_row_gpios; i++) - disable_irq_nosync(gpio_to_irq(pdata->row_gpios[i])); + if (pdata->clustered_irq > 0) + disable_irq_nosync(pdata->clustered_irq); + else { + for (i = 0; i < pdata->num_row_gpios; i++) + disable_irq_nosync(gpio_to_irq(pdata->row_gpios[i])); + } } /* @@ -216,45 +225,69 @@ static void matrix_keypad_stop(struct input_dev *dev) } #ifdef CONFIG_PM -static int matrix_keypad_suspend(struct device *dev) +static void matrix_keypad_enable_wakeup(struct matrix_keypad *keypad) { - struct platform_device *pdev = to_platform_device(dev); - struct matrix_keypad *keypad = platform_get_drvdata(pdev); const struct matrix_keypad_platform_data *pdata = keypad->pdata; + unsigned int gpio; int i; - matrix_keypad_stop(keypad->input_dev); + if (pdata->clustered_irq > 0) { + if (enable_irq_wake(pdata->clustered_irq) == 0) + keypad->gpio_all_disabled = true; + } else { - if (device_may_wakeup(&pdev->dev)) { for (i = 0; i < pdata->num_row_gpios; i++) { if (!test_bit(i, keypad->disabled_gpios)) { - unsigned int gpio = pdata->row_gpios[i]; + gpio = pdata->row_gpios[i]; if (enable_irq_wake(gpio_to_irq(gpio)) == 0) __set_bit(i, keypad->disabled_gpios); } } } - - return 0; } -static int matrix_keypad_resume(struct device *dev) +static void matrix_keypad_disable_wakeup(struct matrix_keypad *keypad) { - struct platform_device *pdev = to_platform_device(dev); - struct matrix_keypad *keypad = platform_get_drvdata(pdev); const struct matrix_keypad_platform_data *pdata = keypad->pdata; + unsigned int gpio; int i; - if (device_may_wakeup(&pdev->dev)) { + if (pdata->clustered_irq > 0) { + if (keypad->gpio_all_disabled) { + disable_irq_wake(pdata->clustered_irq); + keypad->gpio_all_disabled = false; + } + } else { for (i = 0; i < pdata->num_row_gpios; i++) { if (test_and_clear_bit(i, keypad->disabled_gpios)) { - unsigned int gpio = pdata->row_gpios[i]; - + gpio = pdata->row_gpios[i]; disable_irq_wake(gpio_to_irq(gpio)); } } } +} + +static int matrix_keypad_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct matrix_keypad *keypad = platform_get_drvdata(pdev); + + matrix_keypad_stop(keypad->input_dev); + + if (device_may_wakeup(&pdev->dev)) + matrix_keypad_enable_wakeup(keypad); + + return 0; +} + +static int matrix_keypad_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct matrix_keypad *keypad = platform_get_drvdata(pdev); + + if (device_may_wakeup(&pdev->dev)) + matrix_keypad_disable_wakeup(keypad); matrix_keypad_start(keypad->input_dev); @@ -296,17 +329,31 @@ static int __devinit init_matrix_gpio(struct platform_device *pdev, gpio_direction_input(pdata->row_gpios[i]); } - for (i = 0; i < pdata->num_row_gpios; i++) { - err = request_irq(gpio_to_irq(pdata->row_gpios[i]), + if (pdata->clustered_irq > 0) { + err = request_irq(pdata->clustered_irq, matrix_keypad_interrupt, - IRQF_DISABLED | - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + pdata->clustered_irq_flags, "matrix-keypad", keypad); if (err) { dev_err(&pdev->dev, - "Unable to acquire interrupt for GPIO line %i\n", - pdata->row_gpios[i]); - goto err_free_irqs; + "Unable to acquire clustered interrupt\n"); + goto err_free_rows; + } + } else { + for (i = 0; i < pdata->num_row_gpios; i++) { + err = request_irq(gpio_to_irq(pdata->row_gpios[i]), + matrix_keypad_interrupt, + IRQF_DISABLED | + IRQF_TRIGGER_RISING | + IRQF_TRIGGER_FALLING, + "matrix-keypad", keypad); + if (err) { + dev_err(&pdev->dev, + "Unable to acquire interrupt " + "for GPIO line %i\n", + pdata->row_gpios[i]); + goto err_free_irqs; + } } } @@ -418,11 +465,16 @@ static int __devexit matrix_keypad_remove(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); - for (i = 0; i < pdata->num_row_gpios; i++) { - free_irq(gpio_to_irq(pdata->row_gpios[i]), keypad); - gpio_free(pdata->row_gpios[i]); + if (pdata->clustered_irq > 0) { + free_irq(pdata->clustered_irq, keypad); + } else { + for (i = 0; i < pdata->num_row_gpios; i++) + free_irq(gpio_to_irq(pdata->row_gpios[i]), keypad); } + for (i = 0; i < pdata->num_row_gpios; i++) + gpio_free(pdata->row_gpios[i]); + for (i = 0; i < pdata->num_col_gpios; i++) gpio_free(pdata->col_gpios[i]); diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h index c964cd7f436..80352ad6581 100644 --- a/include/linux/input/matrix_keypad.h +++ b/include/linux/input/matrix_keypad.h @@ -41,6 +41,9 @@ struct matrix_keymap_data { * @col_scan_delay_us: delay, measured in microseconds, that is * needed before we can keypad after activating column gpio * @debounce_ms: debounce interval in milliseconds + * @clustered_irq: may be specified if interrupts of all row/column GPIOs + * are bundled to one single irq + * @clustered_irq_flags: flags that are needed for the clustered irq * @active_low: gpio polarity * @wakeup: controls whether the device should be set up as wakeup * source @@ -63,6 +66,9 @@ struct matrix_keypad_platform_data { /* key debounce interval in milli-second */ unsigned int debounce_ms; + unsigned int clustered_irq; + unsigned int clustered_irq_flags; + bool active_low; bool wakeup; bool no_autorepeat; -- cgit v1.2.3-70-g09d2 From c6de9f08912311ddc1b3502b90e10fd449acd401 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 31 May 2010 16:48:09 +0800 Subject: x86, mce: Add HW_ERR printk prefix for hardware error logging This makes hardware error related log in printk log more explicit. So that the users can report it to hardware vendor instead of LKML or software vendor. Signed-off-by: Huang Ying Reviewed-by: Hidetoshi Seto LKML-Reference: <1275295689.3444.462.camel@yhuang-dev.sh.intel.com> Signed-off-by: H. Peter Anvin --- include/linux/kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8317ec4b9f3..3bf740bb069 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -247,6 +247,13 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); #define FW_WARN "[Firmware Warn]: " #define FW_INFO "[Firmware Info]: " +/* + * HW_ERR + * Add this to a message for hardware errors, so that user can report + * it to hardware vendor instead of LKML or software vendor. + */ +#define HW_ERR "[Hardware Error]: " + #ifdef CONFIG_PRINTK asmlinkage int vprintk(const char *fmt, va_list args) __attribute__ ((format (printf, 1, 0))); -- cgit v1.2.3-70-g09d2 From 592fcb9dfafaa02dd0edc207bf5d3a0ee7a1f8df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jun 2010 16:21:07 +0000 Subject: ip: ip_ra_control() rcu fix commit 66018506e15b (ip: Router Alert RCU conversion) introduced RCU lookups to ip_call_ra_chain(). It missed proper deinit phase : When ip_ra_control() deletes an ip_ra_chain, it should make sure ip_call_ra_chain() users can not start to use socket during the rcu grace period. It should also delay the sock_put() after the grace period, or we risk a premature socket freeing and corruptions, as raw sockets are not rcu protected yet. This delay avoids using expensive atomic_inc_not_zero() in ip_call_ra_chain(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 5 ++++- net/ipv4/ip_sockglue.c | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 9982c97f0bd..d52f0118036 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -61,7 +61,10 @@ struct ipcm_cookie { struct ip_ra_chain { struct ip_ra_chain *next; struct sock *sk; - void (*destructor)(struct sock *); + union { + void (*destructor)(struct sock *); + struct sock *saved_sk; + }; struct rcu_head rcu; }; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 08b9519a24f..47fff528ff3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -241,9 +241,13 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) struct ip_ra_chain *ip_ra_chain; static DEFINE_SPINLOCK(ip_ra_lock); -static void ip_ra_free_rcu(struct rcu_head *head) + +static void ip_ra_destroy_rcu(struct rcu_head *head) { - kfree(container_of(head, struct ip_ra_chain, rcu)); + struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); + + sock_put(ra->saved_sk); + kfree(ra); } int ip_ra_control(struct sock *sk, unsigned char on, @@ -264,13 +268,20 @@ int ip_ra_control(struct sock *sk, unsigned char on, kfree(new_ra); return -EADDRINUSE; } + /* dont let ip_call_ra_chain() use sk again */ + ra->sk = NULL; rcu_assign_pointer(*rap, ra->next); spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); - sock_put(sk); - call_rcu(&ra->rcu, ip_ra_free_rcu); + /* + * Delay sock_put(sk) and kfree(ra) after one rcu grace + * period. This guarantee ip_call_ra_chain() dont need + * to mess with socket refcounts. + */ + ra->saved_sk = sk; + call_rcu(&ra->rcu, ip_ra_destroy_rcu); return 0; } } -- cgit v1.2.3-70-g09d2 From d8d1f30b95a635dbd610dcc5eb641aca8f4768cf Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 10 Jun 2010 23:31:35 -0700 Subject: net-next: remove useless union keyword remove useless union keyword in rtable, rt6_info and dn_route. Since there is only one member in a union, the union keyword isn't useful. Signed-off-by: Changli Gao Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_cm.c | 4 +- drivers/infiniband/hw/cxgb4/cm.c | 4 +- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/net/bonding/bond_main.c | 6 +- drivers/net/cnic.c | 2 +- drivers/scsi/cxgb3i/cxgb3i_offload.c | 4 +- include/net/dn_route.h | 4 +- include/net/ip6_fib.h | 10 +- include/net/ipip.h | 2 +- include/net/route.h | 6 +- net/atm/clip.c | 2 +- net/bridge/br_device.c | 2 +- net/bridge/br_netfilter.c | 20 +- net/dccp/ipv4.c | 4 +- net/decnet/dn_route.c | 158 ++++++------ net/ethernet/eth.c | 5 +- net/ipv4/af_inet.c | 4 +- net/ipv4/arp.c | 12 +- net/ipv4/datagram.c | 2 +- net/ipv4/icmp.c | 18 +- net/ipv4/igmp.c | 10 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/ip_forward.c | 10 +- net/ipv4/ip_gre.c | 14 +- net/ipv4/ip_input.c | 4 +- net/ipv4/ip_output.c | 60 ++--- net/ipv4/ipip.c | 8 +- net/ipv4/ipmr.c | 8 +- net/ipv4/netfilter.c | 8 +- net/ipv4/raw.c | 16 +- net/ipv4/route.c | 420 ++++++++++++++++---------------- net/ipv4/syncookies.c | 6 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 +- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/addrconf.c | 10 +- net/ipv6/anycast.c | 6 +- net/ipv6/fib6_rules.c | 10 +- net/ipv6/ip6_fib.c | 30 +-- net/ipv6/ip6_output.c | 38 +-- net/ipv6/ip6_tunnel.c | 8 +- net/ipv6/mcast.c | 4 +- net/ipv6/ndisc.c | 8 +- net/ipv6/raw.c | 12 +- net/ipv6/route.c | 300 +++++++++++------------ net/ipv6/sit.c | 8 +- net/l2tp/l2tp_ip.c | 6 +- net/netfilter/ipvs/ip_vs_xmit.c | 86 +++---- net/netfilter/nf_conntrack_h323_main.c | 12 +- net/netfilter/nf_conntrack_netbios_ns.c | 2 +- net/netfilter/xt_TCPMSS.c | 4 +- net/netfilter/xt_TEE.c | 4 +- net/rxrpc/ar-peer.c | 4 +- net/sctp/protocol.c | 4 +- 55 files changed, 694 insertions(+), 709 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 0b926e45afe..a5ea1bce968 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -215,7 +215,7 @@ static int addr4_resolve(struct sockaddr_in *src_in, neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); if (!neigh || !(neigh->nud_state & NUD_VALID)) { - neigh_event_send(rt->u.dst.neighbour, NULL); + neigh_event_send(rt->dst.neighbour, NULL); ret = -ENODATA; if (neigh) goto release; diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index ebfb117ba68..abd683ea326 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1364,7 +1364,7 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) __func__); goto reject; } - dst = &rt->u.dst; + dst = &rt->dst; l2t = t3_l2t_get(tdev, dst->neighbour, dst->neighbour->dev); if (!l2t) { printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", @@ -1932,7 +1932,7 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = -EHOSTUNREACH; goto fail3; } - ep->dst = &rt->u.dst; + ep->dst = &rt->dst; /* get a l2t entry */ ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst->neighbour, diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 30ce0a8eca0..8b693c8c25e 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1364,7 +1364,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) __func__); goto reject; } - dst = &rt->u.dst; + dst = &rt->dst; if (dst->neighbour->dev->flags & IFF_LOOPBACK) { pdev = ip_dev_find(&init_net, peer_ip); BUG_ON(!pdev); @@ -1938,7 +1938,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = -EHOSTUNREACH; goto fail3; } - ep->dst = &rt->u.dst; + ep->dst = &rt->dst; /* get a l2t entry */ if (ep->dst->neighbour->dev->flags & IFF_LOOPBACK) { diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 986d6f32dde..d876d0435cd 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1146,7 +1146,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi } if ((neigh == NULL) || (!(neigh->nud_state & NUD_VALID))) - neigh_event_send(rt->u.dst.neighbour, NULL); + neigh_event_send(rt->dst.neighbour, NULL); ip_rt_put(rt); return rc; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1b19276cff1..ac4f94b7da3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2584,7 +2584,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) /* * This target is not on a VLAN */ - if (rt->u.dst.dev == bond->dev) { + if (rt->dst.dev == bond->dev) { ip_rt_put(rt); pr_debug("basa: rtdev == bond->dev: arp_send\n"); bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], @@ -2595,7 +2595,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) vlan_id = 0; list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { vlan_dev = vlan_group_get_device(bond->vlgrp, vlan->vlan_id); - if (vlan_dev == rt->u.dst.dev) { + if (vlan_dev == rt->dst.dev) { vlan_id = vlan->vlan_id; pr_debug("basa: vlan match on %s %d\n", vlan_dev->name, vlan_id); @@ -2613,7 +2613,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) if (net_ratelimit()) { pr_warning("%s: no path to arp_ip_target %pI4 via rt.dev %s\n", bond->dev->name, &fl.fl4_dst, - rt->u.dst.dev ? rt->u.dst.dev->name : "NULL"); + rt->dst.dev ? rt->dst.dev->name : "NULL"); } ip_rt_put(rt); } diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c index fe925663d39..908d89a4fe8 100644 --- a/drivers/net/cnic.c +++ b/drivers/net/cnic.c @@ -2824,7 +2824,7 @@ static int cnic_get_v4_route(struct sockaddr_in *dst_addr, err = ip_route_output_key(&init_net, &rt, &fl); if (!err) - *dst = &rt->u.dst; + *dst = &rt->dst; return err; #else return -ENETUNREACH; diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.c b/drivers/scsi/cxgb3i/cxgb3i_offload.c index a175be9c496..3b6a06eebf7 100644 --- a/drivers/scsi/cxgb3i/cxgb3i_offload.c +++ b/drivers/scsi/cxgb3i/cxgb3i_offload.c @@ -1587,7 +1587,7 @@ cxgb3i_find_dev(struct net_device *dev, __be32 ipaddr) err = ip_route_output_key(dev ? dev_net(dev) : &init_net, &rt, &fl); if (!err) - return (&rt->u.dst)->dev; + return (&rt->dst)->dev; return NULL; } @@ -1649,7 +1649,7 @@ int cxgb3i_c3cn_connect(struct net_device *dev, struct s3_conn *c3cn, c3cn->saddr.sin_addr.s_addr = rt->rt_src; /* now commit destination to connection */ - c3cn->dst_cache = &rt->u.dst; + c3cn->dst_cache = &rt->dst; /* try to establish an offloaded connection */ dev = cxgb3_egress_dev(c3cn->dst_cache->dev, c3cn, 0); diff --git a/include/net/dn_route.h b/include/net/dn_route.h index 60c9f22d869..ccadab3aa3f 100644 --- a/include/net/dn_route.h +++ b/include/net/dn_route.h @@ -65,9 +65,7 @@ extern void dn_rt_cache_flush(int delay); * packets to the originating host. */ struct dn_route { - union { - struct dst_entry dst; - } u; + struct dst_entry dst; struct flowi fl; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4b1dc1161c3..062a823d311 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -84,13 +84,11 @@ struct rt6key { struct fib6_table; struct rt6_info { - union { - struct dst_entry dst; - } u; + struct dst_entry dst; -#define rt6i_dev u.dst.dev -#define rt6i_nexthop u.dst.neighbour -#define rt6i_expires u.dst.expires +#define rt6i_dev dst.dev +#define rt6i_nexthop dst.neighbour +#define rt6i_expires dst.expires /* * Tail elements of dst_entry (__refcnt etc.) diff --git a/include/net/ipip.h b/include/net/ipip.h index 11e8513d2d0..65caea8b414 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -50,7 +50,7 @@ struct ip_tunnel_prl_entry { int pkt_len = skb->len - skb_transport_offset(skb); \ \ skb->ip_summed = CHECKSUM_NONE; \ - ip_select_ident(iph, &rt->u.dst, NULL); \ + ip_select_ident(iph, &rt->dst, NULL); \ \ err = ip_local_out(skb); \ if (likely(net_xmit_eval(err) == 0)) { \ diff --git a/include/net/route.h b/include/net/route.h index af6cf4b4c9d..bd732d62e1c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -50,9 +50,7 @@ struct fib_nh; struct inet_peer; struct rtable { - union { - struct dst_entry dst; - } u; + struct dst_entry dst; /* Cache lookup keys */ struct flowi fl; @@ -144,7 +142,7 @@ extern void fib_add_ifaddr(struct in_ifaddr *); static inline void ip_rt_put(struct rtable * rt) { if (rt) - dst_release(&rt->u.dst); + dst_release(&rt->dst); } #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) diff --git a/net/atm/clip.c b/net/atm/clip.c index 313aba11316..95fdd118506 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -522,7 +522,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) error = ip_route_output_key(&init_net, &rt, &fl); if (error) return error; - neigh = __neigh_lookup(&clip_tbl, &ip, rt->u.dst.dev, 1); + neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1); ip_rt_put(rt); if (!neigh) return -ENOMEM; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index eedf2c94820..b898364beaf 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -127,7 +127,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) #ifdef CONFIG_BRIDGE_NETFILTER /* remember the MTU in the rtable for PMTU */ - br->fake_rtable.u.dst.metrics[RTAX_MTU - 1] = new_mtu; + br->fake_rtable.dst.metrics[RTAX_MTU - 1] = new_mtu; #endif return 0; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 44420992f72..0685b2558ab 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -117,12 +117,12 @@ void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.dev = br->dev; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.metrics[RTAX_MTU - 1] = 1500; - rt->u.dst.flags = DST_NOXFRM; - rt->u.dst.ops = &fake_dst_ops; + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + rt->dst.metrics[RTAX_MTU - 1] = 1500; + rt->dst.flags = DST_NOXFRM; + rt->dst.ops = &fake_dst_ops; } static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) @@ -244,8 +244,8 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) kfree_skb(skb); return 0; } - dst_hold(&rt->u.dst); - skb_dst_set(skb, &rt->u.dst); + dst_hold(&rt->dst); + skb_dst_set(skb, &rt->dst); skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); @@ -396,8 +396,8 @@ bridged_dnat: kfree_skb(skb); return 0; } - dst_hold(&rt->u.dst); - skb_dst_set(skb, &rt->u.dst); + dst_hold(&rt->dst); + skb_dst_set(skb, &rt->dst); } skb->dev = nf_bridge->physindev; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d9b11ef8694..d4a166f0f39 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -105,7 +105,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; /* OK, now commit destination to socket. */ - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, inet->inet_daddr, @@ -475,7 +475,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, return NULL; } - return &rt->u.dst; + return &rt->dst; } static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 812e6dff606..6585ea6d118 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -146,13 +146,13 @@ static __inline__ unsigned dn_hash(__le16 src, __le16 dst) static inline void dnrt_free(struct dn_route *rt) { - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline void dnrt_drop(struct dn_route *rt) { - dst_release(&rt->u.dst); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + dst_release(&rt->dst); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static void dn_dst_check_expire(unsigned long dummy) @@ -167,13 +167,13 @@ static void dn_dst_check_expire(unsigned long dummy) spin_lock(&dn_rt_hash_table[i].lock); while((rt=*rtp) != NULL) { - if (atomic_read(&rt->u.dst.__refcnt) || - (now - rt->u.dst.lastuse) < expire) { - rtp = &rt->u.dst.dn_next; + if (atomic_read(&rt->dst.__refcnt) || + (now - rt->dst.lastuse) < expire) { + rtp = &rt->dst.dn_next; continue; } - *rtp = rt->u.dst.dn_next; - rt->u.dst.dn_next = NULL; + *rtp = rt->dst.dn_next; + rt->dst.dn_next = NULL; dnrt_free(rt); } spin_unlock(&dn_rt_hash_table[i].lock); @@ -198,13 +198,13 @@ static int dn_dst_gc(struct dst_ops *ops) rtp = &dn_rt_hash_table[i].chain; while((rt=*rtp) != NULL) { - if (atomic_read(&rt->u.dst.__refcnt) || - (now - rt->u.dst.lastuse) < expire) { - rtp = &rt->u.dst.dn_next; + if (atomic_read(&rt->dst.__refcnt) || + (now - rt->dst.lastuse) < expire) { + rtp = &rt->dst.dn_next; continue; } - *rtp = rt->u.dst.dn_next; - rt->u.dst.dn_next = NULL; + *rtp = rt->dst.dn_next; + rt->dst.dn_next = NULL; dnrt_drop(rt); break; } @@ -287,25 +287,25 @@ static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route * while((rth = *rthp) != NULL) { if (compare_keys(&rth->fl, &rt->fl)) { /* Put it first */ - *rthp = rth->u.dst.dn_next; - rcu_assign_pointer(rth->u.dst.dn_next, + *rthp = rth->dst.dn_next; + rcu_assign_pointer(rth->dst.dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); - dst_use(&rth->u.dst, now); + dst_use(&rth->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); dnrt_drop(rt); *rp = rth; return 0; } - rthp = &rth->u.dst.dn_next; + rthp = &rth->dst.dn_next; } - rcu_assign_pointer(rt->u.dst.dn_next, dn_rt_hash_table[hash].chain); + rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); - dst_use(&rt->u.dst, now); + dst_use(&rt->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); *rp = rt; return 0; @@ -323,8 +323,8 @@ static void dn_run_flush(unsigned long dummy) goto nothing_to_declare; for(; rt; rt=next) { - next = rt->u.dst.dn_next; - rt->u.dst.dn_next = NULL; + next = rt->dst.dn_next; + rt->dst.dn_next = NULL; dst_free((struct dst_entry *)rt); } @@ -743,7 +743,7 @@ static int dn_forward(struct sk_buff *skb) /* Ensure that we have enough space for headers */ rt = (struct dn_route *)skb_dst(skb); header_len = dn_db->use_long ? 21 : 6; - if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+header_len)) + if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len)) goto drop; /* @@ -752,7 +752,7 @@ static int dn_forward(struct sk_buff *skb) if (++cb->hops > 30) goto drop; - skb->dev = rt->u.dst.dev; + skb->dev = rt->dst.dev; /* * If packet goes out same interface it came in on, then set @@ -792,7 +792,7 @@ static int dn_rt_bug(struct sk_buff *skb) static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) { struct dn_fib_info *fi = res->fi; - struct net_device *dev = rt->u.dst.dev; + struct net_device *dev = rt->dst.dev; struct neighbour *n; unsigned mss; @@ -800,25 +800,25 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) if (DN_FIB_RES_GW(*res) && DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = DN_FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); + memcpy(rt->dst.metrics, fi->fib_metrics, + sizeof(rt->dst.metrics)); } rt->rt_type = res->type; - if (dev != NULL && rt->u.dst.neighbour == NULL) { + if (dev != NULL && rt->dst.neighbour == NULL) { n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); if (IS_ERR(n)) return PTR_ERR(n); - rt->u.dst.neighbour = n; + rt->dst.neighbour = n; } - if (dst_metric(&rt->u.dst, RTAX_MTU) == 0 || - dst_metric(&rt->u.dst, RTAX_MTU) > rt->u.dst.dev->mtu) - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst)); - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0 || - dst_metric(&rt->u.dst, RTAX_ADVMSS) > mss) - rt->u.dst.metrics[RTAX_ADVMSS-1] = mss; + if (dst_metric(&rt->dst, RTAX_MTU) == 0 || + dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) + rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu; + mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); + if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0 || + dst_metric(&rt->dst, RTAX_ADVMSS) > mss) + rt->dst.metrics[RTAX_ADVMSS-1] = mss; return 0; } @@ -1096,8 +1096,8 @@ make_route: if (rt == NULL) goto e_nobufs; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.flags = DST_HOST; + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.flags = DST_HOST; rt->fl.fld_src = oldflp->fld_src; rt->fl.fld_dst = oldflp->fld_dst; @@ -1113,17 +1113,17 @@ make_route: rt->rt_dst_map = fl.fld_dst; rt->rt_src_map = fl.fld_src; - rt->u.dst.dev = dev_out; + rt->dst.dev = dev_out; dev_hold(dev_out); - rt->u.dst.neighbour = neigh; + rt->dst.neighbour = neigh; neigh = NULL; - rt->u.dst.lastuse = jiffies; - rt->u.dst.output = dn_output; - rt->u.dst.input = dn_rt_bug; + rt->dst.lastuse = jiffies; + rt->dst.output = dn_output; + rt->dst.input = dn_rt_bug; rt->rt_flags = flags; if (flags & RTCF_LOCAL) - rt->u.dst.input = dn_nsp_rx; + rt->dst.input = dn_nsp_rx; err = dn_rt_set_next_hop(rt, &res); if (err) @@ -1152,7 +1152,7 @@ e_nobufs: err = -ENOBUFS; goto done; e_neighbour: - dst_free(&rt->u.dst); + dst_free(&rt->dst); goto e_nobufs; } @@ -1168,15 +1168,15 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowi *fl if (!(flags & MSG_TRYHARD)) { rcu_read_lock_bh(); for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt; - rt = rcu_dereference_bh(rt->u.dst.dn_next)) { + rt = rcu_dereference_bh(rt->dst.dn_next)) { if ((flp->fld_dst == rt->fl.fld_dst) && (flp->fld_src == rt->fl.fld_src) && (flp->mark == rt->fl.mark) && (rt->fl.iif == 0) && (rt->fl.oif == flp->oif)) { - dst_use(&rt->u.dst, jiffies); + dst_use(&rt->dst, jiffies); rcu_read_unlock_bh(); - *pprt = &rt->u.dst; + *pprt = &rt->dst; return 0; } } @@ -1375,29 +1375,29 @@ make_route: rt->fl.iif = in_dev->ifindex; rt->fl.mark = fl.mark; - rt->u.dst.flags = DST_HOST; - rt->u.dst.neighbour = neigh; - rt->u.dst.dev = out_dev; - rt->u.dst.lastuse = jiffies; - rt->u.dst.output = dn_rt_bug; + rt->dst.flags = DST_HOST; + rt->dst.neighbour = neigh; + rt->dst.dev = out_dev; + rt->dst.lastuse = jiffies; + rt->dst.output = dn_rt_bug; switch(res.type) { case RTN_UNICAST: - rt->u.dst.input = dn_forward; + rt->dst.input = dn_forward; break; case RTN_LOCAL: - rt->u.dst.output = dn_output; - rt->u.dst.input = dn_nsp_rx; - rt->u.dst.dev = in_dev; + rt->dst.output = dn_output; + rt->dst.input = dn_nsp_rx; + rt->dst.dev = in_dev; flags |= RTCF_LOCAL; break; default: case RTN_UNREACHABLE: case RTN_BLACKHOLE: - rt->u.dst.input = dst_discard; + rt->dst.input = dst_discard; } rt->rt_flags = flags; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + if (rt->dst.dev) + dev_hold(rt->dst.dev); err = dn_rt_set_next_hop(rt, &res); if (err) @@ -1405,7 +1405,7 @@ make_route: hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst); dn_insert_route(rt, hash, &rt); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); done: if (neigh) @@ -1427,7 +1427,7 @@ e_nobufs: goto done; e_neighbour: - dst_free(&rt->u.dst); + dst_free(&rt->dst); goto done; } @@ -1442,13 +1442,13 @@ static int dn_route_input(struct sk_buff *skb) rcu_read_lock(); for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; - rt = rcu_dereference(rt->u.dst.dn_next)) { + rt = rcu_dereference(rt->dst.dn_next)) { if ((rt->fl.fld_src == cb->src) && (rt->fl.fld_dst == cb->dst) && (rt->fl.oif == 0) && (rt->fl.mark == skb->mark) && (rt->fl.iif == cb->iif)) { - dst_use(&rt->u.dst, jiffies); + dst_use(&rt->dst, jiffies); rcu_read_unlock(); skb_dst_set(skb, (struct dst_entry *)rt); return 0; @@ -1487,8 +1487,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, r->rtm_src_len = 16; RTA_PUT(skb, RTA_SRC, 2, &rt->fl.fld_src); } - if (rt->u.dst.dev) - RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + if (rt->dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->dst.dev->ifindex); /* * Note to self - change this if input routes reverse direction when * they deal only with inputs and not with replies like they do @@ -1497,11 +1497,11 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, RTA_PUT(skb, RTA_PREFSRC, 2, &rt->rt_local_src); if (rt->rt_daddr != rt->rt_gateway) RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway); - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) goto rtattr_failure; - expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, expires, - rt->u.dst.error) < 0) + expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires, + rt->dst.error) < 0) goto rtattr_failure; if (rt->fl.iif) RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); @@ -1568,8 +1568,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void local_bh_enable(); memset(cb, 0, sizeof(struct dn_skb_cb)); rt = (struct dn_route *)skb_dst(skb); - if (!err && -rt->u.dst.error) - err = rt->u.dst.error; + if (!err && -rt->dst.error) + err = rt->dst.error; } else { int oif = 0; if (rta[RTA_OIF - 1]) @@ -1583,7 +1583,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void skb->dev = NULL; if (err) goto out_free; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -1632,10 +1632,10 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock_bh(); for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->u.dst.dn_next), idx++) { + rt = rcu_dereference_bh(rt->dst.dn_next), idx++) { if (idx < s_idx) continue; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, dst_clone(&rt->dst)); if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { @@ -1678,7 +1678,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou { struct dn_rt_cache_iter_state *s = seq->private; - rt = rt->u.dst.dn_next; + rt = rt->dst.dn_next; while(!rt) { rcu_read_unlock_bh(); if (--s->bucket < 0) @@ -1719,12 +1719,12 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v) char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", - rt->u.dst.dev ? rt->u.dst.dev->name : "*", + rt->dst.dev ? rt->dst.dev->name : "*", dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), - atomic_read(&rt->u.dst.__refcnt), - rt->u.dst.__use, - (int) dst_metric(&rt->u.dst, RTAX_RTT)); + atomic_read(&rt->dst.__refcnt), + rt->dst.__use, + (int) dst_metric(&rt->dst, RTAX_RTT)); return 0; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 61ec0329316..215c83986a9 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -158,7 +158,6 @@ EXPORT_SYMBOL(eth_rebuild_header); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { struct ethhdr *eth; - unsigned char *rawp; skb->dev = dev; skb_reset_mac_header(skb); @@ -199,15 +198,13 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (ntohs(eth->h_proto) >= 1536) return eth->h_proto; - rawp = skb->data; - /* * This is a magic hack to spot IPX packets. Older Novell breaks * the protocol design and runs IPX over 802.3 without an 802.2 LLC * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. */ - if (*(unsigned short *)rawp == 0xFFFF) + if (skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF) return htons(ETH_P_802_3); /* diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 551ce564b03..d99e7e02018 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1100,7 +1100,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (err) return err; - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); new_saddr = rt->rt_src; @@ -1166,7 +1166,7 @@ int inet_sk_rebuild_header(struct sock *sk) err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); } if (!err) - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); else { /* Routing failed... */ sk->sk_route_caps = 0; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 917d2d66162..cf78f41830c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -427,7 +427,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (ip_route_output_key(net, &rt, &fl) < 0) return 1; - if (rt->u.dst.dev != dev) { + if (rt->dst.dev != dev) { NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); flag = 1; } @@ -532,7 +532,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct in_device *out_dev; int imi, omi = -1; - if (rt->u.dst.dev == dev) + if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) @@ -545,7 +545,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, /* place to check for proxy_arp for routes */ - out_dev = __in_dev_get_rcu(rt->u.dst.dev); + out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); @@ -576,7 +576,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ - if (rt->u.dst.dev != dev) + if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ @@ -1042,7 +1042,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, struct rtable * rt; if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return -EINVAL; @@ -1149,7 +1149,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, struct rtable * rt; if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return -EINVAL; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index fb2465811b4..fe3daa7f07a 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -69,7 +69,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->sk_state = TCP_ESTABLISHED; inet->inet_id = jiffies; - sk_dst_set(sk, &rt->u.dst); + sk_dst_set(sk, &rt->dst); return(0); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index bdb6c71e72a..7569b21a3a2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -271,7 +271,7 @@ int xrlim_allow(struct dst_entry *dst, int timeout) static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, int type, int code) { - struct dst_entry *dst = &rt->u.dst; + struct dst_entry *dst = &rt->dst; int rc = 1; if (type > NR_ICMP_TYPES) @@ -327,7 +327,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, struct sock *sk; struct sk_buff *skb; - sk = icmp_sk(dev_net((*rt)->u.dst.dev)); + sk = icmp_sk(dev_net((*rt)->dst.dev)); if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, @@ -359,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); struct sock *sk; struct inet_sock *inet; __be32 daddr; @@ -427,7 +427,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!rt) goto out; - net = dev_net(rt->u.dst.dev); + net = dev_net(rt->dst.dev); /* * Find the original header. It is expected to be valid, of course. @@ -596,9 +596,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->u.dst.dev); + RT_TOS(tos), rt2->dst.dev); - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); skb_in->_skb_refdst = orefdst; /* restore old refdst */ } @@ -610,7 +610,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) XFRM_LOOKUP_ICMP); switch (err) { case 0: - dst_release(&rt->u.dst); + dst_release(&rt->dst); rt = rt2; break; case -EPERM: @@ -629,7 +629,7 @@ route_done: /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->u.dst); + room = dst_mtu(&rt->dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; @@ -972,7 +972,7 @@ int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 3294f547c48..b5580d42299 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -312,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -330,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = rt->rt_src; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(pip, &rt->u.dst, NULL); + ip_select_ident(pip, &rt->dst, NULL); ((u8*)&pip[1])[0] = IPOPT_RA; ((u8*)&pip[1])[1] = 4; ((u8*)&pip[1])[2] = 0; @@ -660,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return -1; } - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -676,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; ((u8*)&iph[1])[1] = 4; ((u8*)&iph[1])[2] = 0; @@ -1425,7 +1425,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) } if (!dev && !ip_route_output_key(net, &rt, &fl)) { - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); } if (dev) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 70eb3507c40..57c9e4d7b80 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -383,7 +383,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, goto no_route; if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto route_err; - return &rt->u.dst; + return &rt->dst; route_err: ip_rt_put(rt); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 56cdf68a074..99461f09320 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -87,16 +87,16 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && + if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { - IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(dst_mtu(&rt->dst))); goto drop; } /* We are about to mangle packet. Copy it! */ - if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len)) goto drop; iph = ip_hdr(skb); @@ -113,7 +113,7 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, - rt->u.dst.dev, ip_forward_finish); + rt->dst.dev, ip_forward_finish); sr_failed: /* diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 32618e11076..749e54889e8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -745,7 +745,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev goto tx_error; } } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); @@ -755,7 +755,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev df = tiph->frag_off; if (df) - mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; + mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; @@ -803,7 +803,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tunnel->err_count = 0; } - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { @@ -830,7 +830,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -853,7 +853,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; #endif else - iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); + iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); } ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; @@ -915,7 +915,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) .proto = IPPROTO_GRE }; struct rtable *rt; if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } @@ -1174,7 +1174,7 @@ static int ipgre_open(struct net_device *dev) struct rtable *rt; if (ip_route_output_key(dev_net(dev), &rt, &fl)) return -EADDRNOTAVAIL; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (__in_dev_get_rtnl(dev) == NULL) return -EADDRNOTAVAIL; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 08a3b121f90..db47a5a00ed 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -356,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9a4a6c96cb0..6cbeb2e108d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -151,15 +151,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->u.dst)) + if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - ip_select_ident(iph, &rt->u.dst, sk); + ip_select_ident(iph, &rt->dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; @@ -240,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->u.dst.dev; + struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. @@ -359,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb) if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); } - skb_dst_set_noref(skb, &rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -372,11 +372,11 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; @@ -387,7 +387,7 @@ packet_routed: ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, + ip_select_ident_more(iph, &rt->dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); skb->priority = sk->sk_priority; @@ -452,7 +452,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) struct rtable *rt = skb_rtable(skb); int err = 0; - dev = rt->u.dst.dev; + dev = rt->dst.dev; /* * Point into the IP datagram header. @@ -473,7 +473,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -586,7 +586,7 @@ slow_path: * we need to make room for the encapsulating header */ pad = nf_bridge_pad(skb); - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad); + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, pad); mtu -= pad; /* @@ -833,13 +833,13 @@ int ip_append_data(struct sock *sk, */ *rtp = NULL; inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->u.dst.dev->mtu : - dst_mtu(rt->u.dst.path); - inet->cork.dst = &rt->u.dst; + rt->dst.dev->mtu : + dst_mtu(rt->dst.path); + inet->cork.dst = &rt->dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->u.dst.header_len) != 0) { + if ((exthdrlen = rt->dst.header_len) != 0) { length += exthdrlen; transhdrlen += exthdrlen; } @@ -852,7 +852,7 @@ int ip_append_data(struct sock *sk, exthdrlen = 0; mtu = inet->cork.fragsize; } - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; @@ -869,14 +869,14 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features & NETIF_F_V4_CSUM && + rt->dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen) csummode = CHECKSUM_PARTIAL; inet->cork.length += length; if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -924,7 +924,7 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; @@ -935,7 +935,7 @@ alloc_new_skb: * the last. */ if (datalen == length + fraggap) - alloclen += rt->u.dst.trailer_len; + alloclen += rt->dst.trailer_len; if (transhdrlen) { skb = sock_alloc_send_skb(sk, @@ -1008,7 +1008,7 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; @@ -1103,10 +1103,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); mtu = inet->cork.fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); @@ -1122,7 +1122,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } @@ -1274,8 +1274,8 @@ int ip_push_pending_frames(struct sock *sk) * If local_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc >= IP_PMTUDISC_DO || - (skb->len <= dst_mtu(&rt->u.dst) && - ip_dont_fragment(sk, &rt->u.dst))) + (skb->len <= dst_mtu(&rt->dst) && + ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); if (inet->cork.flags & IPCORK_OPT) @@ -1284,7 +1284,7 @@ int ip_push_pending_frames(struct sock *sk) if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else - ttl = ip_select_ttl(inet, &rt->u.dst); + ttl = ip_select_ttl(inet, &rt->dst); iph = (struct iphdr *)skb->data; iph->version = 4; @@ -1295,7 +1295,7 @@ int ip_push_pending_frames(struct sock *sk) } iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->u.dst, sk); + ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; @@ -1308,7 +1308,7 @@ int ip_push_pending_frames(struct sock *sk) * on dst refcount */ inet->cork.dst = NULL; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7fd63671103..ec036731a70 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -435,7 +435,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error_icmp; } } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); @@ -446,7 +446,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) df |= old_iph->frag_off & htons(IP_DF); if (df) { - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { stats->collisions++; @@ -503,7 +503,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -552,7 +552,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) .proto = IPPROTO_IPIP }; struct rtable *rt; if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 856123fe32f..8418afc357e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1551,9 +1551,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - dev = rt->u.dst.dev; + dev = rt->dst.dev; - if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not allow to send ICMP, so that packets will disappear to blackhole. @@ -1564,7 +1564,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; + encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1575,7 +1575,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->bytes_out += skb->len; skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 07de855e217..cfbc79af21c 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -43,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* Drop old route. */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ @@ -53,11 +53,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) orefdst = skb->_skb_refdst; if (ip_route_input(skb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); + RT_TOS(iph->tos), rt->dst.dev) != 0) { + dst_release(&rt->dst); return -1; } - dst_release(&rt->u.dst); + dst_release(&rt->dst); refdst_drop(orefdst); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 66cc3befcd4..009a7b2aa1e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -325,24 +325,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, int err; struct rtable *rt = *rtp; - if (length > rt->u.dst.dev->mtu) { + if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, - rt->u.dst.dev->mtu); + rt->dst.dev->mtu); return -EMSGSIZE; } if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, + length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); *rtp = NULL; skb_reset_network_header(skb); @@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -384,7 +384,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb_transport_header(skb))->type); err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, - rt->u.dst.dev, dst_output); + rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -606,7 +606,7 @@ out: return len; do_confirm: - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 883b5c7195a..a291edbbc97 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -286,10 +286,10 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); while (r) { - if (dev_net(r->u.dst.dev) == seq_file_net(seq) && + if (dev_net(r->dst.dev) == seq_file_net(seq) && r->rt_genid == st->genid) return r; - r = rcu_dereference_bh(r->u.dst.rt_next); + r = rcu_dereference_bh(r->dst.rt_next); } rcu_read_unlock_bh(); } @@ -301,7 +301,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; - r = r->u.dst.rt_next; + r = r->dst.rt_next; while (!r) { rcu_read_unlock_bh(); do { @@ -319,7 +319,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->u.dst.dev) != seq_file_net(seq)) + if (dev_net(r->dst.dev) != seq_file_net(seq)) continue; if (r->rt_genid == st->genid) break; @@ -377,19 +377,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->u.dst.dev ? r->u.dst.dev->name : "*", + r->dst.dev ? r->dst.dev->name : "*", (__force u32)r->rt_dst, (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (__force u32)r->rt_src, - (dst_metric(&r->u.dst, RTAX_ADVMSS) ? - (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), - dst_metric(&r->u.dst, RTAX_WINDOW), - (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + - dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->rt_flags, atomic_read(&r->dst.__refcnt), + r->dst.__use, 0, (__force u32)r->rt_src, + (dst_metric(&r->dst, RTAX_ADVMSS) ? + (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->dst, RTAX_WINDOW), + (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + + dst_metric(&r->dst, RTAX_RTTVAR)), r->fl.fl4_tos, - r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? (r->u.dst.hh->hh_output == + r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, + r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, r->rt_spec_dst, &len); @@ -608,13 +608,13 @@ static inline int ip_rt_proc_init(void) static inline void rt_free(struct rtable *rt) { - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline void rt_drop(struct rtable *rt) { ip_rt_put(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline int rt_fast_clean(struct rtable *rth) @@ -622,13 +622,13 @@ static inline int rt_fast_clean(struct rtable *rth) /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.dst.rt_next; + rth->fl.iif && rth->dst.rt_next; } static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->u.dst.expires; + rth->dst.expires; } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -636,15 +636,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t unsigned long age; int ret = 0; - if (atomic_read(&rth->u.dst.__refcnt)) + if (atomic_read(&rth->dst.__refcnt)) goto out; ret = 1; - if (rth->u.dst.expires && - time_after_eq(jiffies, rth->u.dst.expires)) + if (rth->dst.expires && + time_after_eq(jiffies, rth->dst.expires)) goto out; - age = jiffies - rth->u.dst.lastuse; + age = jiffies - rth->dst.lastuse; ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) @@ -660,7 +660,7 @@ out: return ret; */ static inline u32 rt_score(struct rtable *rt) { - u32 score = jiffies - rt->u.dst.lastuse; + u32 score = jiffies - rt->dst.lastuse; score = ~score & ~(3<<30); @@ -700,12 +700,12 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); + return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); } static inline int rt_is_expired(struct rtable *rth) { - return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); + return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } /* @@ -734,7 +734,7 @@ static void rt_do_flush(int process_context) rth = rt_hash_table[i].chain; /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->u.dst.rt_next) + for (tail = rth; tail; tail = tail->dst.rt_next) if (!rt_is_expired(tail)) break; if (rth != tail) @@ -743,9 +743,9 @@ static void rt_do_flush(int process_context) /* call rt_free on entries after the tail requiring flush */ prev = &rt_hash_table[i].chain; for (p = *prev; p; p = next) { - next = p->u.dst.rt_next; + next = p->dst.rt_next; if (!rt_is_expired(p)) { - prev = &p->u.dst.rt_next; + prev = &p->dst.rt_next; } else { *prev = next; rt_free(p); @@ -760,7 +760,7 @@ static void rt_do_flush(int process_context) spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth != tail; rth = next) { - next = rth->u.dst.rt_next; + next = rth->dst.rt_next; rt_free(rth); } } @@ -791,7 +791,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) while (aux != rth) { if (compare_hash_inputs(&aux->fl, &rth->fl)) return 0; - aux = aux->u.dst.rt_next; + aux = aux->dst.rt_next; } return ONE; } @@ -831,18 +831,18 @@ static void rt_check_expire(void) length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { - prefetch(rth->u.dst.rt_next); + prefetch(rth->dst.rt_next); if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); continue; } - if (rth->u.dst.expires) { + if (rth->dst.expires) { /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->u.dst.expires)) { + if (time_before_eq(jiffies, rth->dst.expires)) { nofree: tmo >>= 1; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; /* * We only count entries on * a chain with equal hash inputs once @@ -858,7 +858,7 @@ nofree: goto nofree; /* Cleanup aged off entries. */ - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); } spin_unlock_bh(rt_hash_lock_addr(i)); @@ -999,10 +999,10 @@ static int rt_garbage_collect(struct dst_ops *ops) if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; continue; } - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); goal--; } @@ -1068,7 +1068,7 @@ static int slow_chain_length(const struct rtable *head) while (rth) { length += has_noalias(head, rth); - rth = rth->u.dst.rt_next; + rth = rth->dst.rt_next; } return length >> FRACT_BITS; } @@ -1090,7 +1090,7 @@ restart: candp = NULL; now = jiffies; - if (!rt_caching(dev_net(rt->u.dst.dev))) { + if (!rt_caching(dev_net(rt->dst.dev))) { /* * If we're not caching, just tell the caller we * were successful and don't touch the route. The @@ -1108,7 +1108,7 @@ restart: */ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); + int err = arp_bind_neighbour(&rt->dst); if (err) { if (net_ratelimit()) printk(KERN_WARNING @@ -1127,19 +1127,19 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); continue; } if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { /* Put it first */ - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; /* * Since lookup is lockfree, the deletion * must be visible to another weakly ordered CPU before * the insertion at the start of the hash chain. */ - rcu_assign_pointer(rth->u.dst.rt_next, + rcu_assign_pointer(rth->dst.rt_next, rt_hash_table[hash].chain); /* * Since lookup is lockfree, the update writes @@ -1147,18 +1147,18 @@ restart: */ rcu_assign_pointer(rt_hash_table[hash].chain, rth); - dst_use(&rth->u.dst, now); + dst_use(&rth->dst, now); spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); if (rp) *rp = rth; else - skb_dst_set(skb, &rth->u.dst); + skb_dst_set(skb, &rth->dst); return 0; } - if (!atomic_read(&rth->u.dst.__refcnt)) { + if (!atomic_read(&rth->dst.__refcnt)) { u32 score = rt_score(rth); if (score <= min_score) { @@ -1170,7 +1170,7 @@ restart: chain_length++; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; } if (cand) { @@ -1181,17 +1181,17 @@ restart: * only 2 entries per bucket. We will see. */ if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->u.dst.rt_next; + *candp = cand->dst.rt_next; rt_free(cand); } } else { if (chain_length > rt_chain_length_max && slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", - rt->u.dst.dev->name, num); + rt->dst.dev->name, num); } rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1206,7 +1206,7 @@ restart: route or unicast forwarding path. */ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); + int err = arp_bind_neighbour(&rt->dst); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1237,14 +1237,14 @@ restart: } } - rt->u.dst.rt_next = rt_hash_table[hash].chain; + rt->dst.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 - if (rt->u.dst.rt_next) { + if (rt->dst.rt_next) { struct rtable *trt; printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst); - for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) + for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) printk(" . %pI4", &trt->rt_dst); printk("\n"); } @@ -1262,7 +1262,7 @@ skip_hashing: if (rp) *rp = rt; else - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); return 0; } @@ -1334,11 +1334,11 @@ static void rt_del(unsigned hash, struct rtable *rt) ip_rt_put(rt); while ((aux = *rthp) != NULL) { if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->u.dst.rt_next; + *rthp = aux->dst.rt_next; rt_free(aux); continue; } - rthp = &aux->u.dst.rt_next; + rthp = &aux->dst.rt_next; } spin_unlock_bh(rt_hash_lock_addr(hash)); } @@ -1392,19 +1392,19 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || rt_is_expired(rth) || - !net_eq(dev_net(rth->u.dst.dev), net)) { - rthp = &rth->u.dst.rt_next; + !net_eq(dev_net(rth->dst.dev), net)) { + rthp = &rth->dst.rt_next; continue; } if (rth->rt_dst != daddr || rth->rt_src != saddr || - rth->u.dst.error || + rth->dst.error || rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) + rth->dst.dev != dev) break; - dst_hold(&rth->u.dst); + dst_hold(&rth->dst); rt = dst_alloc(&ipv4_dst_ops); if (rt == NULL) { @@ -1414,20 +1414,20 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, /* Copy all the information. */ *rt = *rth; - rt->u.dst.__use = 1; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.child = NULL; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + rt->dst.__use = 1; + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.child = NULL; + if (rt->dst.dev) + dev_hold(rt->dst.dev); if (rt->idev) in_dev_hold(rt->idev); - rt->u.dst.obsolete = -1; - rt->u.dst.lastuse = jiffies; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; + rt->dst.obsolete = -1; + rt->dst.lastuse = jiffies; + rt->dst.path = &rt->dst; + rt->dst.neighbour = NULL; + rt->dst.hh = NULL; #ifdef CONFIG_XFRM - rt->u.dst.xfrm = NULL; + rt->dst.xfrm = NULL; #endif rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; @@ -1436,23 +1436,23 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->rt_gateway = new_gw; /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); + dst_confirm(&rth->dst); if (rt->peer) atomic_inc(&rt->peer->refcnt); - if (arp_bind_neighbour(&rt->u.dst) || - !(rt->u.dst.neighbour->nud_state & + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { - if (rt->u.dst.neighbour) - neigh_event_send(rt->u.dst.neighbour, NULL); + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); ip_rt_put(rth); rt_drop(rt); goto do_next; } - netevent.old = &rth->u.dst; - netevent.new = &rt->u.dst; + netevent.old = &rth->dst; + netevent.new = &rt->dst; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); @@ -1488,8 +1488,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->u.dst.expires && - time_after_eq(jiffies, rt->u.dst.expires))) { + (rt->dst.expires && + time_after_eq(jiffies, rt->dst.expires))) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); @@ -1527,7 +1527,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) int log_martians; rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); + in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; @@ -1538,30 +1538,30 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) - rt->u.dst.rate_tokens = 0; + if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) + rt->dst.rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set u.dst.rate_last to the last seen redirected packet. + * set dst.rate_last to the last seen redirected packet. */ - if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { - rt->u.dst.rate_last = jiffies; + if (rt->dst.rate_tokens >= ip_rt_redirect_number) { + rt->dst.rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->u.dst.rate_tokens == 0 || + if (rt->dst.rate_tokens == 0 || time_after(jiffies, - (rt->u.dst.rate_last + - (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { + (rt->dst.rate_last + + (ip_rt_redirect_load << rt->dst.rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->u.dst.rate_last = jiffies; - ++rt->u.dst.rate_tokens; + rt->dst.rate_last = jiffies; + ++rt->dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->u.dst.rate_tokens == ip_rt_redirect_number && + rt->dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &rt->rt_src, rt->rt_iif, @@ -1576,7 +1576,7 @@ static int ip_error(struct sk_buff *skb) unsigned long now; int code; - switch (rt->u.dst.error) { + switch (rt->dst.error) { case EINVAL: default: goto out; @@ -1585,7 +1585,7 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->u.dst.dev), + IP_INC_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INNOROUTES); break; case EACCES: @@ -1594,12 +1594,12 @@ static int ip_error(struct sk_buff *skb) } now = jiffies; - rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; - if (rt->u.dst.rate_tokens > ip_rt_error_burst) - rt->u.dst.rate_tokens = ip_rt_error_burst; - rt->u.dst.rate_last = now; - if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { - rt->u.dst.rate_tokens -= ip_rt_error_cost; + rt->dst.rate_tokens += now - rt->dst.rate_last; + if (rt->dst.rate_tokens > ip_rt_error_burst) + rt->dst.rate_tokens = ip_rt_error_burst; + rt->dst.rate_last = now; + if (rt->dst.rate_tokens >= ip_rt_error_cost) { + rt->dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); } @@ -1644,7 +1644,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + rth = rcu_dereference(rth->dst.rt_next)) { unsigned short mtu = new_mtu; if (rth->fl.fl4_dst != daddr || @@ -1653,8 +1653,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->rt_src != iph->saddr || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - dst_metric_locked(&rth->u.dst, RTAX_MTU) || - !net_eq(dev_net(rth->u.dst.dev), net) || + dst_metric_locked(&rth->dst, RTAX_MTU) || + !net_eq(dev_net(rth->dst.dev), net) || rt_is_expired(rth)) continue; @@ -1662,22 +1662,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && - old_mtu >= dst_mtu(&rth->u.dst) && + old_mtu >= dst_mtu(&rth->dst) && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } - if (mtu <= dst_mtu(&rth->u.dst)) { - if (mtu < dst_mtu(&rth->u.dst)) { - dst_confirm(&rth->u.dst); + if (mtu <= dst_mtu(&rth->dst)) { + if (mtu < dst_mtu(&rth->dst)) { + dst_confirm(&rth->dst); if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; - rth->u.dst.metrics[RTAX_LOCK-1] |= + rth->dst.metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); } - rth->u.dst.metrics[RTAX_MTU-1] = mtu; - dst_set_expires(&rth->u.dst, + rth->dst.metrics[RTAX_MTU-1] = mtu; + dst_set_expires(&rth->dst, ip_rt_mtu_expires); } est_mtu = mtu; @@ -1750,7 +1750,7 @@ static void ipv4_link_failure(struct sk_buff *skb) rt = skb_rtable(skb); if (rt) - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct sk_buff *skb) @@ -1778,11 +1778,11 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { + else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { src = FIB_RES_PREFSRC(res); fib_res_put(&res); } else - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, + src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); memcpy(addr, &src, 4); } @@ -1790,10 +1790,10 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) #ifdef CONFIG_NET_CLS_ROUTE static void set_class_tag(struct rtable *rt, u32 tag) { - if (!(rt->u.dst.tclassid & 0xFFFF)) - rt->u.dst.tclassid |= tag & 0xFFFF; - if (!(rt->u.dst.tclassid & 0xFFFF0000)) - rt->u.dst.tclassid |= tag & 0xFFFF0000; + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; } #endif @@ -1805,30 +1805,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); + memcpy(rt->dst.metrics, fi->fib_metrics, + sizeof(rt->dst.metrics)); if (fi->fib_mtu == 0) { - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if (dst_metric_locked(&rt->u.dst, RTAX_MTU) && + rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu; + if (dst_metric_locked(&rt->dst, RTAX_MTU) && rt->rt_gateway != rt->rt_dst && - rt->u.dst.dev->mtu > 576) - rt->u.dst.metrics[RTAX_MTU-1] = 576; + rt->dst.dev->mtu > 576) + rt->dst.metrics[RTAX_MTU-1] = 576; } #ifdef CONFIG_NET_CLS_ROUTE - rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } else - rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - - if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) - rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) - rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, + rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; + + if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) + rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; + if (dst_mtu(&rt->dst) > IP_MAX_MTU) + rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; + if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) + rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40, ip_rt_min_advmss); - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40) - rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; + if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40) + rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1873,13 +1873,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (!rth) goto e_nobufs; - rth->u.dst.output = ip_rt_bug; - rth->u.dst.obsolete = -1; + rth->dst.output = ip_rt_bug; + rth->dst.obsolete = -1; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -1887,13 +1887,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = init_net.loopback_dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; @@ -1901,13 +1901,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; if (our) { - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) - rth->u.dst.input = ip_mr_input; + rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); @@ -2016,12 +2016,12 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; + rth->dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2031,16 +2031,16 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = daddr; rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; - rth->u.dst.dev = (out_dev)->dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = (out_dev)->dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; - rth->u.dst.obsolete = -1; - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); + rth->dst.obsolete = -1; + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rt_set_nexthop(rth, res, itag); @@ -2074,7 +2074,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, - rt_genid(dev_net(rth->u.dst.dev))); + rt_genid(dev_net(rth->dst.dev))); return rt_intern_hash(hash, rth, NULL, skb, fl->iif); } @@ -2197,14 +2197,14 @@ local_input: if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - rth->u.dst.obsolete = -1; + rth->dst.output= ip_rt_bug; + rth->dst.obsolete = -1; rth->rt_genid = rt_genid(net); - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2212,20 +2212,20 @@ local_input: rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = net->loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = net->loopback_dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; + rth->dst.input= ip_error; + rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; @@ -2291,21 +2291,21 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, hash = rt_hash(daddr, saddr, iif, rt_genid(net)); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + rth = rcu_dereference(rth->dst.rt_next)) { if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && - net_eq(dev_net(rth->u.dst.dev), net) && + net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { - dst_use_noref(&rth->u.dst, jiffies); - skb_dst_set_noref(skb, &rth->u.dst); + dst_use_noref(&rth->dst, jiffies); + skb_dst_set_noref(skb, &rth->dst); } else { - dst_use(&rth->u.dst, jiffies); - skb_dst_set(skb, &rth->u.dst); + dst_use(&rth->dst, jiffies); + skb_dst_set(skb, &rth->dst); } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -2412,12 +2412,12 @@ static int __mkroute_output(struct rtable **result, goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; + rth->dst.flags |= DST_NOXFRM; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = oldflp->fl4_dst; rth->fl.fl4_tos = tos; @@ -2429,35 +2429,35 @@ static int __mkroute_output(struct rtable **result, rth->rt_iif = oldflp->oif ? : dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ - rth->u.dst.dev = dev_out; + rth->dst.dev = dev_out; dev_hold(dev_out); rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; - rth->u.dst.output=ip_output; - rth->u.dst.obsolete = -1; + rth->dst.output=ip_output; + rth->dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { - rth->u.dst.input = ip_local_deliver; + rth->dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { - rth->u.dst.output = ip_mc_output; + rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(oldflp->fl4_dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; } } #endif @@ -2712,7 +2712,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; - rth = rcu_dereference_bh(rth->u.dst.rt_next)) { + rth = rcu_dereference_bh(rth->dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && @@ -2720,9 +2720,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->u.dst.dev), net) && + net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); + dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; @@ -2759,15 +2759,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_alloc(&ipv4_dst_blackhole_ops); if (rt) { - struct dst_entry *new = &rt->u.dst; + struct dst_entry *new = &rt->dst; atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); - new->dev = ort->u.dst.dev; + new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); @@ -2791,7 +2791,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_free(new); } - dst_release(&(*rp)->u.dst); + dst_release(&(*rp)->dst); *rp = rt; return (rt ? 0 : -ENOMEM); } @@ -2861,11 +2861,11 @@ static int rt_fill_info(struct net *net, r->rtm_src_len = 32; NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); } - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); + if (rt->dst.dev) + NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); #ifdef CONFIG_NET_CLS_ROUTE - if (rt->u.dst.tclassid) - NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); + if (rt->dst.tclassid) + NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt->fl.iif) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); @@ -2875,11 +2875,11 @@ static int rt_fill_info(struct net *net, if (rt->rt_dst != rt->rt_gateway) NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) goto nla_put_failure; - error = rt->u.dst.error; - expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; + error = rt->dst.error; + expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; if (rt->peer) { id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { @@ -2911,7 +2911,7 @@ static int rt_fill_info(struct net *net, NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); } - if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, + if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, expires, error) < 0) goto nla_put_failure; @@ -2976,8 +2976,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void local_bh_enable(); rt = skb_rtable(skb); - if (err == 0 && rt->u.dst.error) - err = -rt->u.dst.error; + if (err == 0 && rt->dst.error) + err = -rt->dst.error; } else { struct flowi fl = { .nl_u = { @@ -2995,7 +2995,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -3031,12 +3031,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; rcu_read_lock_bh(); for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) + rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { + if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) continue; if (rt_is_expired(rt)) continue; - skb_dst_set_noref(skb, &rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 5c48124332d..02bef6aa8b3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -354,15 +354,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(&rt->u.dst, RTAX_INITRWND)); + dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ret = get_cookie_sock(sk, skb, req, &rt->u.dst); + ret = get_cookie_sock(sk, skb, req, &rt->dst); out: return ret; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7f976af27bf..7f9515c0379 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -237,7 +237,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eec4ff456e3..32e0bef60d0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -914,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) - sk_dst_set(sk, dst_clone(&rt->u.dst)); + sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) @@ -978,7 +978,7 @@ out: return err; do_confirm: - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1705476670e..349327092c9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, fl.fl4_src = saddr->a4; err = __ip_route_output_key(net, &rt, &fl); - dst = &rt->u.dst; + dst = &rt->dst; if (err) dst = ERR_PTR(err); return dst; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e1a698df570..b97bb1f3080 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -557,7 +557,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) pr_warning("Freeing alive inet6 address %p\n", ifp); return; } - dst_release(&ifp->rt->u.dst); + dst_release(&ifp->rt->dst); call_rcu(&ifp->rcu, inet6_ifa_finish_destroy_rcu); } @@ -823,7 +823,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) rt->rt6i_flags |= RTF_EXPIRES; } } - dst_release(&rt->u.dst); + dst_release(&rt->dst); } out: @@ -1863,7 +1863,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) dev, expires, flags); } if (rt) - dst_release(&rt->u.dst); + dst_release(&rt->dst); } /* Try to figure out our local address for this prefix */ @@ -4093,11 +4093,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (ifp->idev->cnf.forwarding) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); - dst_hold(&ifp->rt->u.dst); + dst_hold(&ifp->rt->dst); if (ifp->state == INET6_IFADDR_STATE_DEAD && ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->u.dst); + dst_free(&ifp->rt->dst); break; } } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f058fbd808c..0e5e943446f 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -84,7 +84,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { dev = rt->rt6i_dev; - dst_release(&rt->u.dst); + dst_release(&rt->dst); } else if (ishost) { err = -EADDRNOTAVAIL; goto error; @@ -244,7 +244,7 @@ static void aca_put(struct ifacaddr6 *ac) { if (atomic_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->u.dst); + dst_release(&ac->aca_rt->dst); kfree(ac); } } @@ -350,7 +350,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->u.dst); + dst_hold(&aca->aca_rt->dst); ip6_del_rt(aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 8e44f8f9c18..b1108ede18e 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -43,8 +43,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl, if (arg.result) return arg.result; - dst_hold(&net->ipv6.ip6_null_entry->u.dst); - return &net->ipv6.ip6_null_entry->u.dst; + dst_hold(&net->ipv6.ip6_null_entry->dst); + return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, @@ -86,7 +86,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, struct in6_addr saddr; if (ipv6_dev_get_saddr(net, - ip6_dst_idev(&rt->u.dst)->dev, + ip6_dst_idev(&rt->dst)->dev, &flp->fl6_dst, rt6_flags2srcprefs(flags), &saddr)) @@ -99,12 +99,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - dst_release(&rt->u.dst); + dst_release(&rt->dst); rt = NULL; goto out; discard_pkt: - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); out: arg->result = rt; return rt == NULL ? -EAGAIN : 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 92a122b7795..b6a585909d3 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -165,7 +165,7 @@ static __inline__ void node_free(struct fib6_node * fn) static __inline__ void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) - dst_free(&rt->u.dst); + dst_free(&rt->dst); } static void fib6_link_table(struct net *net, struct fib6_table *tb) @@ -278,7 +278,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -619,7 +619,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter=iter->u.dst.rt6_next) { + for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) { /* * Search for duplicates */ @@ -647,7 +647,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (iter->rt6i_metric > rt->rt6i_metric) break; - ins = &iter->u.dst.rt6_next; + ins = &iter->dst.rt6_next; } /* Reset round-robin state, if necessary */ @@ -658,7 +658,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, * insert node */ - rt->u.dst.rt6_next = iter; + rt->dst.rt6_next = iter; *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); @@ -799,7 +799,7 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->u.dst); + dst_free(&rt->dst); } return err; @@ -810,7 +810,7 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->u.dst); + dst_free(&rt->dst); return err; #endif } @@ -1108,7 +1108,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, RT6_TRACE("fib6_del_route\n"); /* Unlink it */ - *rtp = rt->u.dst.rt6_next; + *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1122,14 +1122,14 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->u.dst.rt6_next; + w->leaf = rt->dst.rt6_next; if (w->leaf == NULL) w->state = FWS_U; } } read_unlock(&fib6_walker_lock); - rt->u.dst.rt6_next = NULL; + rt->dst.rt6_next = NULL; /* If it was last route, expunge its radix tree node */ if (fn->leaf == NULL) { @@ -1168,7 +1168,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) struct rt6_info **rtp; #if RT6_DEBUG >= 2 - if (rt->u.dst.obsolete>0) { + if (rt->dst.obsolete>0) { WARN_ON(fn != NULL); return -ENOENT; } @@ -1195,7 +1195,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.dst.rt6_next) { + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { if (*rtp == rt) { fib6_del_route(fn, rtp, info); return 0; @@ -1334,7 +1334,7 @@ static int fib6_clean_node(struct fib6_walker_t *w) .nl_net = c->net, }; - for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1448,8 +1448,8 @@ static int fib6_age(struct rt6_info *rt, void *arg) } gc_args.more++; } else if (rt->rt6i_flags & RTF_CACHE) { - if (atomic_read(&rt->u.dst.__refcnt) == 0 && - time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) { + if (atomic_read(&rt->dst.__refcnt) == 0 && + time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; } else if ((rt->rt6i_flags & RTF_GATEWAY) && diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 89425af0684..d40b330c0ee 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -698,7 +698,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); for (;;) { /* Prepare header of the next frame, @@ -726,7 +726,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) err = output(skb); if(!err) - IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst), + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); if (err || !frag) @@ -740,9 +740,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) kfree(tmp_hdr); if (err == 0) { - IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst), + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); - dst_release(&rt->u.dst); + dst_release(&rt->dst); return 0; } @@ -752,9 +752,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) frag = skb; } - IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst), + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); - dst_release(&rt->u.dst); + dst_release(&rt->dst); return err; } @@ -785,7 +785,7 @@ slow_path: * Allocate buffer. */ - if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { + if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) { NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); @@ -798,7 +798,7 @@ slow_path: */ ip6_copy_metadata(frag, skb); - skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev)); skb_put(frag, len + hlen + sizeof(struct frag_hdr)); skb_reset_network_header(frag); fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); @@ -1156,24 +1156,24 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, /* need source address above miyazawa*/ } - dst_hold(&rt->u.dst); - inet->cork.dst = &rt->u.dst; + dst_hold(&rt->dst); + inet->cork.dst = &rt->dst; inet->cork.fl = *fl; np->cork.hop_limit = hlimit; np->cork.tclass = tclass; mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? - rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); + rt->dst.dev->mtu : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } inet->cork.fragsize = mtu; - if (dst_allfrag(rt->u.dst.path)) + if (dst_allfrag(rt->dst.path)) inet->cork.flags |= IPCORK_ALLFRAG; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) - + exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len; length += exthdrlen; transhdrlen += exthdrlen; @@ -1186,7 +1186,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, mtu = inet->cork.fragsize; } - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); @@ -1224,7 +1224,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, } if (proto == IPPROTO_UDP && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, @@ -1270,7 +1270,7 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; @@ -1281,7 +1281,7 @@ alloc_new_skb: * because we have no idea if we're the last one. */ if (datalen == length + fraggap) - alloclen += rt->u.dst.trailer_len; + alloclen += rt->dst.trailer_len; /* * We just reserve space for fragment header. @@ -1358,7 +1358,7 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; @@ -1503,7 +1503,7 @@ int ip6_push_pending_frames(struct sock *sk) skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 8f39893d808..0fd027f3f47 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -552,7 +552,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (ip_route_output_key(dev_net(skb->dev), &rt, &fl)) goto out; - skb2->dev = rt->u.dst.dev; + skb2->dev = rt->dst.dev; /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { @@ -562,7 +562,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) || - rt->u.dst.dev->type != ARPHRD_TUNNEL) { + rt->dst.dev->type != ARPHRD_TUNNEL) { ip_rt_put(rt); goto out; } @@ -626,7 +626,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, icmpv6_send(skb2, rel_type, rel_code, rel_info); if (rt) - dst_release(&rt->u.dst); + dst_release(&rt->dst); kfree_skb(skb2); } @@ -1135,7 +1135,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - dst_release(&rt->u.dst); + dst_release(&rt->dst); } } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 3e36d1538b6..d1444b95ad7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -158,7 +158,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { dev = rt->rt6i_dev; - dst_release(&rt->u.dst); + dst_release(&rt->dst); } } else dev = dev_get_by_index_rcu(net, ifindex); @@ -248,7 +248,7 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, if (rt) { dev = rt->rt6i_dev; dev_hold(dev); - dst_release(&rt->u.dst); + dst_release(&rt->dst); } } else dev = dev_get_by_index_rcu(net, ifindex); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0abdc242ddb..1fc46fc60ef 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1229,7 +1229,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK0(KERN_ERR "ICMPv6 RA: %s() got default router without neighbour.\n", __func__); - dst_release(&rt->u.dst); + dst_release(&rt->dst); in6_dev_put(in6_dev); return; } @@ -1244,7 +1244,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; if (rt) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit; + rt->dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit; } skip_defrtr: @@ -1363,7 +1363,7 @@ skip_linkparms: in6_dev->cnf.mtu6 = mtu; if (rt) - rt->u.dst.metrics[RTAX_MTU-1] = mtu; + rt->dst.metrics[RTAX_MTU-1] = mtu; rt6_mtu_change(skb->dev, mtu); } @@ -1384,7 +1384,7 @@ skip_linkparms: } out: if (rt) - dst_release(&rt->u.dst); + dst_release(&rt->dst); else if (neigh) neigh_release(neigh); in6_dev_put(in6_dev); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 968b9649072..e677937a07f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -611,23 +611,23 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, int err; struct rt6_info *rt = (struct rt6_info *)*dstp; - if (length > rt->u.dst.dev->mtu) { - ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu); + if (length > rt->dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl, rt->dst.dev->mtu); return -EMSGSIZE; } if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, + length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); *dstp = NULL; skb_put(skb, length); @@ -643,7 +643,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, - rt->u.dst.dev, dst_output); + rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 252d76199c4..f7702850d45 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -126,16 +126,14 @@ static struct dst_ops ip6_dst_blackhole_ops = { }; static struct rt6_info ip6_null_entry_template = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -ENETUNREACH, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_discard, - .output = ip6_pkt_discard_out, - } + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -ENETUNREACH, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -149,16 +147,14 @@ static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct sk_buff *skb); static struct rt6_info ip6_prohibit_entry_template = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -EACCES, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_prohibit, - .output = ip6_pkt_prohibit_out, - } + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -EACCES, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, + .input = ip6_pkt_prohibit, + .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -167,16 +163,14 @@ static struct rt6_info ip6_prohibit_entry_template = { }; static struct rt6_info ip6_blk_hole_entry_template = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -EINVAL, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = dst_discard, - .output = dst_discard, - } + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -EINVAL, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, + .input = dst_discard, + .output = dst_discard, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -249,7 +243,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) { + for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { struct net_device *dev = sprt->rt6i_dev; if (oif) { @@ -407,10 +401,10 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->u.dst.rt6_next) + rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match); for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->u.dst.rt6_next) + rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match); return match; @@ -432,7 +426,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) if (!match && (strict & RT6_LOOKUP_F_REACHABLE)) { - struct rt6_info *next = rt0->u.dst.rt6_next; + struct rt6_info *next = rt0->dst.rt6_next; /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -517,7 +511,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, rt->rt6i_expires = jiffies + HZ * lifetime; rt->rt6i_flags |= RTF_EXPIRES; } - dst_release(&rt->u.dst); + dst_release(&rt->dst); } return 0; } @@ -555,7 +549,7 @@ restart: rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags); BACKTRACK(net, &fl->fl6_src); out: - dst_use(&rt->u.dst, jiffies); + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); return rt; @@ -643,7 +637,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; - rt->u.dst.flags |= DST_HOST; + rt->dst.flags |= DST_HOST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -677,7 +671,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad if (net_ratelimit()) printk(KERN_WARNING "Neighbour table overflow.\n"); - dst_free(&rt->u.dst); + dst_free(&rt->dst); return NULL; } rt->rt6i_nexthop = neigh; @@ -694,7 +688,7 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *d ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; - rt->u.dst.flags |= DST_HOST; + rt->dst.flags |= DST_HOST; rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop); } return rt; @@ -726,7 +720,7 @@ restart: rt->rt6i_flags & RTF_CACHE) goto out; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) @@ -739,10 +733,10 @@ restart: #endif } - dst_release(&rt->u.dst); + dst_release(&rt->dst); rt = nrt ? : net->ipv6.ip6_null_entry; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); if (nrt) { err = ip6_ins_rt(nrt); if (!err) @@ -756,7 +750,7 @@ restart: * Race condition! In the gap, when table->tb6_lock was * released someone could insert this route. Relookup. */ - dst_release(&rt->u.dst); + dst_release(&rt->dst); goto relookup; out: @@ -764,11 +758,11 @@ out: reachable = 0; goto restart_2; } - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); out2: - rt->u.dst.lastuse = jiffies; - rt->u.dst.__use++; + rt->dst.lastuse = jiffies; + rt->dst.__use++; return rt; } @@ -835,15 +829,15 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl struct dst_entry *new = NULL; if (rt) { - new = &rt->u.dst; + new = &rt->dst; atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); - new->dev = ort->u.dst.dev; + memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); + new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); rt->rt6i_idev = ort->rt6i_idev; @@ -912,7 +906,7 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { if (rt->rt6i_flags&RTF_CACHE) { - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); rt->rt6i_flags |= RTF_EXPIRES; } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) rt->rt6i_node->fn_sernum = -1; @@ -986,14 +980,14 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_dev = dev; rt->rt6i_idev = idev; rt->rt6i_nexthop = neigh; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); - rt->u.dst.output = ip6_output; + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.metrics[RTAX_HOPLIMIT-1] = 255; + rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst)); + rt->dst.output = ip6_output; #if 0 /* there's no chance to use these for ndisc */ - rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST + rt->dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST ? DST_HOST : 0; ipv6_addr_copy(&rt->rt6i_dst.addr, addr); @@ -1001,14 +995,14 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, #endif spin_lock_bh(&icmp6_dst_lock); - rt->u.dst.next = icmp6_dst_gc_list; - icmp6_dst_gc_list = &rt->u.dst; + rt->dst.next = icmp6_dst_gc_list; + icmp6_dst_gc_list = &rt->dst; spin_unlock_bh(&icmp6_dst_lock); fib6_force_start_gc(net); out: - return &rt->u.dst; + return &rt->dst; } int icmp6_dst_gc(void) @@ -1159,7 +1153,7 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } - rt->u.dst.obsolete = -1; + rt->dst.obsolete = -1; rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ? jiffies + clock_t_to_jiffies(cfg->fc_expires) : 0; @@ -1171,16 +1165,16 @@ int ip6_route_add(struct fib6_config *cfg) addr_type = ipv6_addr_type(&cfg->fc_dst); if (addr_type & IPV6_ADDR_MULTICAST) - rt->u.dst.input = ip6_mc_input; + rt->dst.input = ip6_mc_input; else - rt->u.dst.input = ip6_forward; + rt->dst.input = ip6_forward; - rt->u.dst.output = ip6_output; + rt->dst.output = ip6_output; ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) - rt->u.dst.flags = DST_HOST; + rt->dst.flags = DST_HOST; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1208,9 +1202,9 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } } - rt->u.dst.output = ip6_pkt_discard_out; - rt->u.dst.input = ip6_pkt_discard; - rt->u.dst.error = -ENETUNREACH; + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + rt->dst.error = -ENETUNREACH; rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; goto install_route; } @@ -1244,7 +1238,7 @@ int ip6_route_add(struct fib6_config *cfg) goto out; if (dev) { if (dev != grt->rt6i_dev) { - dst_release(&grt->u.dst); + dst_release(&grt->dst); goto out; } } else { @@ -1255,7 +1249,7 @@ int ip6_route_add(struct fib6_config *cfg) } if (!(grt->rt6i_flags&RTF_GATEWAY)) err = 0; - dst_release(&grt->u.dst); + dst_release(&grt->dst); if (err) goto out; @@ -1294,18 +1288,18 @@ install_route: goto out; } - rt->u.dst.metrics[type - 1] = nla_get_u32(nla); + rt->dst.metrics[type - 1] = nla_get_u32(nla); } } } - if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - if (!dst_mtu(&rt->u.dst)) - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); - if (!dst_metric(&rt->u.dst, RTAX_ADVMSS)) - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); - rt->u.dst.dev = dev; + if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) + rt->dst.metrics[RTAX_HOPLIMIT-1] = -1; + if (!dst_mtu(&rt->dst)) + rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); + if (!dst_metric(&rt->dst, RTAX_ADVMSS)) + rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst)); + rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -1319,7 +1313,7 @@ out: if (idev) in6_dev_put(idev); if (rt) - dst_free(&rt->u.dst); + dst_free(&rt->dst); return err; } @@ -1336,7 +1330,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) write_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - dst_release(&rt->u.dst); + dst_release(&rt->dst); write_unlock_bh(&table->tb6_lock); @@ -1369,7 +1363,7 @@ static int ip6_route_del(struct fib6_config *cfg) &cfg->fc_src, cfg->fc_src_len); if (fn) { - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (cfg->fc_ifindex && (rt->rt6i_dev == NULL || rt->rt6i_dev->ifindex != cfg->fc_ifindex)) @@ -1379,7 +1373,7 @@ static int ip6_route_del(struct fib6_config *cfg) continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); return __ip6_del_rt(rt, &cfg->fc_nlinfo); @@ -1421,7 +1415,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); restart: - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { /* * Current route is on-link; redirect is always invalid. * @@ -1445,7 +1439,7 @@ restart: rt = net->ipv6.ip6_null_entry; BACKTRACK(net, &fl->fl6_src); out: - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); @@ -1513,10 +1507,10 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); /* Duplicate redirect: silently ignore. */ - if (neigh == rt->u.dst.neighbour) + if (neigh == rt->dst.neighbour) goto out; nrt = ip6_rt_copy(rt); @@ -1529,20 +1523,20 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); nrt->rt6i_dst.plen = 128; - nrt->u.dst.flags |= DST_HOST; + nrt->dst.flags |= DST_HOST; ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); nrt->rt6i_nexthop = neigh_clone(neigh); /* Reset pmtu, it may be better */ - nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); - nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev), - dst_mtu(&nrt->u.dst)); + nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); + nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev), + dst_mtu(&nrt->dst)); if (ip6_ins_rt(nrt)) goto out; - netevent.old = &rt->u.dst; - netevent.new = &nrt->u.dst; + netevent.old = &rt->dst; + netevent.new = &nrt->dst; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); if (rt->rt6i_flags&RTF_CACHE) { @@ -1551,7 +1545,7 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, } out: - dst_release(&rt->u.dst); + dst_release(&rt->dst); } /* @@ -1570,7 +1564,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, if (rt == NULL) return; - if (pmtu >= dst_mtu(&rt->u.dst)) + if (pmtu >= dst_mtu(&rt->dst)) goto out; if (pmtu < IPV6_MIN_MTU) { @@ -1588,7 +1582,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, They are sent only in response to data packets, so that this nexthop apparently is reachable. --ANK */ - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); /* Host route. If it is static, it would be better not to override it, but add new one, so that @@ -1596,10 +1590,10 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, would return automatically. */ if (rt->rt6i_flags & RTF_CACHE) { - rt->u.dst.metrics[RTAX_MTU-1] = pmtu; + rt->dst.metrics[RTAX_MTU-1] = pmtu; if (allfrag) - rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires); + rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; goto out; } @@ -1615,9 +1609,9 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, nrt = rt6_alloc_clone(rt, daddr); if (nrt) { - nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; + nrt->dst.metrics[RTAX_MTU-1] = pmtu; if (allfrag) - nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; /* According to RFC 1981, detecting PMTU increase shouldn't be * happened within 5 mins, the recommended timer is 10 mins. @@ -1625,13 +1619,13 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, * which is 10 mins. After 10 mins the decreased pmtu is expired * and detecting PMTU increase will be automatically happened. */ - dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires); + dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; ip6_ins_rt(nrt); } out: - dst_release(&rt->u.dst); + dst_release(&rt->dst); } /* @@ -1644,18 +1638,18 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops); if (rt) { - rt->u.dst.input = ort->u.dst.input; - rt->u.dst.output = ort->u.dst.output; - - memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); - rt->u.dst.error = ort->u.dst.error; - rt->u.dst.dev = ort->u.dst.dev; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + + memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); + rt->dst.error = ort->dst.error; + rt->dst.dev = ort->dst.dev; + if (rt->dst.dev) + dev_hold(rt->dst.dev); rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->u.dst.lastuse = jiffies; + rt->dst.lastuse = jiffies; rt->rt6i_expires = 0; ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); @@ -1689,14 +1683,14 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->rt6i_dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); break; } out: @@ -1744,14 +1738,14 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *d return NULL; write_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) { + for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { if (dev == rt->rt6i_dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) break; } if (rt) - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); write_unlock_bh(&table->tb6_lock); return rt; } @@ -1790,9 +1784,9 @@ void rt6_purge_dflt_routers(struct net *net) restart: read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); ip6_del_rt(rt); goto restart; @@ -1930,15 +1924,15 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, dev_hold(net->loopback_dev); in6_dev_hold(idev); - rt->u.dst.flags = DST_HOST; - rt->u.dst.input = ip6_input; - rt->u.dst.output = ip6_output; + rt->dst.flags = DST_HOST; + rt->dst.input = ip6_input; + rt->dst.output = ip6_output; rt->rt6i_dev = net->loopback_dev; rt->rt6i_idev = idev; - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - rt->u.dst.obsolete = -1; + rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst)); + rt->dst.metrics[RTAX_HOPLIMIT-1] = -1; + rt->dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) @@ -1947,7 +1941,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_flags |= RTF_LOCAL; neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); if (IS_ERR(neigh)) { - dst_free(&rt->u.dst); + dst_free(&rt->dst); /* We are casting this because that is the return * value type. But an errno encoded pointer is the @@ -1962,7 +1956,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); - atomic_set(&rt->u.dst.__refcnt, 1); + atomic_set(&rt->dst.__refcnt, 1); return rt; } @@ -2033,12 +2027,12 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. */ if (rt->rt6i_dev == arg->dev && - !dst_metric_locked(&rt->u.dst, RTAX_MTU) && - (dst_mtu(&rt->u.dst) >= arg->mtu || - (dst_mtu(&rt->u.dst) < arg->mtu && - dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) { - rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu); + !dst_metric_locked(&rt->dst, RTAX_MTU) && + (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6))) { + rt->dst.metrics[RTAX_MTU-1] = arg->mtu; + rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu); } return 0; } @@ -2252,20 +2246,20 @@ static int rt6_fill_node(struct net *net, #endif NLA_PUT_U32(skb, RTA_IIF, iif); } else if (dst) { - struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst); + struct inet6_dev *idev = ip6_dst_idev(&rt->dst); struct in6_addr saddr_buf; if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, dst, 0, &saddr_buf) == 0) NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); } - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) goto nla_put_failure; - if (rt->u.dst.neighbour) - NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); + if (rt->dst.neighbour) + NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key); - if (rt->u.dst.dev) + if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); @@ -2277,8 +2271,8 @@ static int rt6_fill_node(struct net *net, else expires = INT_MAX; - if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, - expires, rt->u.dst.error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, + expires, rt->dst.error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -2364,7 +2358,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).pid, @@ -2416,12 +2410,12 @@ static int ip6_route_dev_notify(struct notifier_block *this, struct net *net = dev_net(dev); if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { - net->ipv6.ip6_null_entry->u.dst.dev = dev; + net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - net->ipv6.ip6_prohibit_entry->u.dst.dev = dev; + net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); - net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev; + net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif } @@ -2464,8 +2458,8 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg) seq_puts(m, "00000000000000000000000000000000"); } seq_printf(m, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt), - rt->u.dst.__use, rt->rt6i_flags, + rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), + rt->dst.__use, rt->rt6i_flags, rt->rt6i_dev ? rt->rt6i_dev->name : ""); return 0; } @@ -2646,9 +2640,9 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_ip6_dst_ops; - net->ipv6.ip6_null_entry->u.dst.path = + net->ipv6.ip6_null_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_null_entry; - net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops; + net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, @@ -2656,18 +2650,18 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->u.dst.path = + net->ipv6.ip6_prohibit_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_prohibit_entry; - net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops; + net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->u.dst.path = + net->ipv6.ip6_blk_hole_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; - net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops; + net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; #endif net->ipv6.sysctl.flush_delay = 0; @@ -2742,12 +2736,12 @@ int __init ip6_route_init(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #endif ret = fib6_init(); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 702c532ec21..4699cd3c311 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -712,7 +712,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, stats->tx_carrier_errors++; goto tx_error_icmp; } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); @@ -721,7 +721,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (df) { - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { stats->collisions++; @@ -780,7 +780,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags = 0; skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -829,7 +829,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) .proto = IPPROTO_IPV6 }; struct rtable *rt; if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 0852512d392..226a0ae3bcf 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -348,7 +348,7 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len sk->sk_state = TCP_ESTABLISHED; inet->inet_id = jiffies; - sk_dst_set(sk, &rt->u.dst); + sk_dst_set(sk, &rt->dst); write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); @@ -496,9 +496,9 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); } - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, dst_clone(&rt->dst)); /* Queue the packet to IP for output */ rc = ip_queue_xmit(skb); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 93c15a107b2..02b078e11cf 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -90,10 +90,10 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) &dest->addr.ip); return NULL; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst)); IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", &dest->addr.ip, - atomic_read(&rt->u.dst.__refcnt), rtos); + atomic_read(&rt->dst.__refcnt), rtos); } spin_unlock(&dest->dst_lock); } else { @@ -148,10 +148,10 @@ __ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) &dest->addr.in6); return NULL; } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst)); IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", &dest->addr.in6, - atomic_read(&rt->u.dst.__refcnt)); + atomic_read(&rt->dst.__refcnt)); } spin_unlock(&dest->dst_lock); } else { @@ -198,7 +198,7 @@ do { \ (skb)->ipvs_property = 1; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ + (rt)->dst.dev, dst_output); \ } while (0) @@ -245,7 +245,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -265,7 +265,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -309,9 +309,9 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; @@ -323,13 +323,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); return NF_STOLEN; } /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -376,7 +376,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -388,12 +388,12 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, sizeof(struct iphdr))) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) @@ -452,9 +452,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); @@ -465,12 +465,12 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) @@ -498,7 +498,7 @@ tx_error: kfree_skb(skb); return NF_STOLEN; tx_error_put: - dst_release(&rt->u.dst); + dst_release(&rt->dst); goto tx_error; } #endif @@ -549,9 +549,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) goto tx_error_icmp; - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { ip_rt_put(rt); IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); @@ -601,7 +601,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -615,7 +615,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -660,12 +660,12 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (!rt) goto tx_error_icmp; - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; - mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); /* TODO IPv6: do we need this check in IPv6? */ if (mtu < 1280) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__); goto tx_error; } @@ -674,7 +674,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->u.dst); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } @@ -689,7 +689,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); kfree_skb(skb); IP_VS_ERR_RL("%s(): no memory\n", __func__); return NF_STOLEN; @@ -707,7 +707,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -760,7 +760,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -780,7 +780,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -813,10 +813,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->u.dst); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } @@ -827,13 +827,13 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); return NF_STOLEN; } /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -888,7 +888,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -900,12 +900,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, offset)) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop the old route when skb is not shared */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_vs_nat_icmp(skb, pp, cp, 0); @@ -963,9 +963,9 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; @@ -975,12 +975,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, offset)) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop the old route when skb is not shared */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_vs_nat_icmp_v6(skb, pp, cp, 0); @@ -1001,7 +1001,7 @@ out: LeaveFunction(10); return rc; tx_error_put: - dst_release(&rt->u.dst); + dst_release(&rt->dst); goto tx_error; } #endif diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 6eaee7c8a33..b969025cf82 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -734,11 +734,11 @@ static int callforward_do_filter(const union nf_inet_addr *src, if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) { if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { if (rt1->rt_gateway == rt2->rt_gateway && - rt1->u.dst.dev == rt2->u.dst.dev) + rt1->dst.dev == rt2->dst.dev) ret = 1; - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); } - dst_release(&rt1->u.dst); + dst_release(&rt1->dst); } break; } @@ -753,11 +753,11 @@ static int callforward_do_filter(const union nf_inet_addr *src, if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && - rt1->u.dst.dev == rt2->u.dst.dev) + rt1->dst.dev == rt2->dst.dev) ret = 1; - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); } - dst_release(&rt1->u.dst); + dst_release(&rt1->dst); } break; } diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 497b2224536..aadde018a07 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -61,7 +61,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, goto out; rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); + in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { for_primary_ifa(in_dev) { if (ifa->ifa_broadcast == iph->daddr) { diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 62ec021fbd5..1841388c770 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -165,8 +165,8 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, rcu_read_unlock(); if (rt != NULL) { - mtu = dst_mtu(&rt->u.dst); - dst_release(&rt->u.dst); + mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); } return mtu; } diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 859d9fd429c..c77a85bbd9e 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -77,8 +77,8 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) return false; skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); - skb->dev = rt->u.dst.dev; + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; skb->protocol = htons(ETH_P_IP); return true; } diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c index f0f85b0123f..9f1729bd60d 100644 --- a/net/rxrpc/ar-peer.c +++ b/net/rxrpc/ar-peer.c @@ -64,8 +64,8 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) return; } - peer->if_mtu = dst_mtu(&rt->u.dst); - dst_release(&rt->u.dst); + peer->if_mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); _leave(" [if_mtu %u]", peer->if_mtu); } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 182749867c7..a0e1a7fdebb 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -490,7 +490,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, __func__, &fl.fl4_dst, &fl.fl4_src); if (!ip_route_output_key(&init_net, &rt, &fl)) { - dst = &rt->u.dst; + dst = &rt->dst; } /* If there is no association or if a source address is passed, no @@ -534,7 +534,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, fl.fl4_src = laddr->a.v4.sin_addr.s_addr; fl.fl_ip_sport = laddr->a.v4.sin_port; if (!ip_route_output_key(&init_net, &rt, &fl)) { - dst = &rt->u.dst; + dst = &rt->dst; goto out_unlock; } } -- cgit v1.2.3-70-g09d2 From c7de2cf053420d63bac85133469c965d4b1083e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jun 2010 02:09:23 +0000 Subject: pkt_sched: gen_kill_estimator() rcu fixes gen_kill_estimator() API is incomplete or not well documented, since caller should make sure an RCU grace period is respected before freeing stats_lock. This was partially addressed in commit 5d944c640b4 (gen_estimator: deadlock fix), but same problem exist for all gen_kill_estimator() users, if lock they use is not already RCU protected. A code review shows xt_RATEEST.c, act_api.c, act_police.c have this problem. Other are ok because they use qdisc lock, already RCU protected. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/act_api.h | 2 ++ include/net/netfilter/xt_rateest.h | 1 + net/core/gen_estimator.c | 1 + net/netfilter/xt_RATEEST.c | 12 +++++++++++- net/sched/act_api.c | 11 ++++++++++- net/sched/act_police.c | 12 +++++++++++- 6 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index c05fd717c58..bab385f13ac 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -20,6 +20,7 @@ struct tcf_common { struct gnet_stats_queue tcfc_qstats; struct gnet_stats_rate_est tcfc_rate_est; spinlock_t tcfc_lock; + struct rcu_head tcfc_rcu; }; #define tcf_next common.tcfc_next #define tcf_index common.tcfc_index @@ -32,6 +33,7 @@ struct tcf_common { #define tcf_qstats common.tcfc_qstats #define tcf_rate_est common.tcfc_rate_est #define tcf_lock common.tcfc_lock +#define tcf_rcu common.tcfc_rcu struct tcf_police { struct tcf_common common; diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index ddbf37e1961..5e142779592 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -9,6 +9,7 @@ struct xt_rateest { struct gnet_estimator params; struct gnet_stats_rate_est rstats; struct gnet_stats_basic_packed bstats; + struct rcu_head rcu; }; extern struct xt_rateest *xt_rateest_lookup(const char *name); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 785e5276a30..9fbe7f7429b 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -263,6 +263,7 @@ static void __gen_kill_estimator(struct rcu_head *head) * * Removes the rate estimator specified by &bstats and &rate_est. * + * Note : Caller should respect an RCU grace period before freeing stats_lock */ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 69c01e10f8a..de079abd5bc 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -60,13 +60,22 @@ struct xt_rateest *xt_rateest_lookup(const char *name) } EXPORT_SYMBOL_GPL(xt_rateest_lookup); +static void xt_rateest_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct xt_rateest, rcu)); +} + void xt_rateest_put(struct xt_rateest *est) { mutex_lock(&xt_rateest_mutex); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->bstats, &est->rstats); - kfree(est); + /* + * gen_estimator est_timer() might access est->lock or bstats, + * wait a RCU grace period before freeing 'est' + */ + call_rcu(&est->rcu, xt_rateest_free_rcu); } mutex_unlock(&xt_rateest_mutex); } @@ -179,6 +188,7 @@ static int __init xt_rateest_tg_init(void) static void __exit xt_rateest_tg_fini(void) { xt_unregister_target(&xt_rateest_tg_reg); + rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */ } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 972378f47f3..23b25f89e7e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -26,6 +26,11 @@ #include #include +static void tcf_common_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct tcf_common, tcfc_rcu)); +} + void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) { unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); @@ -38,7 +43,11 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) write_unlock_bh(hinfo->lock); gen_kill_estimator(&p->tcfc_bstats, &p->tcfc_rate_est); - kfree(p); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + call_rcu(&p->tcfc_rcu, tcf_common_free_rcu); return; } } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 654f73dff7c..537a48732e9 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -97,6 +97,11 @@ nla_put_failure: goto done; } +static void tcf_police_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct tcf_police, tcf_rcu)); +} + static void tcf_police_destroy(struct tcf_police *p) { unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); @@ -113,7 +118,11 @@ static void tcf_police_destroy(struct tcf_police *p) qdisc_put_rtab(p->tcfp_R_tab); if (p->tcfp_P_tab) qdisc_put_rtab(p->tcfp_P_tab); - kfree(p); + /* + * gen_estimator est_timer() might access p->tcf_lock + * or bstats, wait a RCU grace period before freeing p + */ + call_rcu(&p->tcf_rcu, tcf_police_free_rcu); return; } } @@ -397,6 +406,7 @@ static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops); + rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */ } module_init(police_init_module); -- cgit v1.2.3-70-g09d2 From be1f3c2c027cc5ad735df6a45a542ed1db7ec48b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 8 Jun 2010 07:19:54 +0000 Subject: net: Enable 64-bit net device statistics on 32-bit architectures Use struct rtnl_link_stats64 as the statistics structure. On 32-bit architectures, insert 32 bits of padding after/before each field of struct net_device_stats to make its layout compatible with struct rtnl_link_stats64. Add an anonymous union in net_device; move stats into the union and add struct rtnl_link_stats64 stats64. Add net_device_ops::ndo_get_stats64, implementations of which will return a pointer to struct rtnl_link_stats64. Drivers that implement this operation must not update the structure asynchronously. Change dev_get_stats() to call ndo_get_stats64 if available, and to return a pointer to struct rtnl_link_stats64. Change callers of dev_get_stats() accordingly. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 13 +++--- include/linux/if_link.h | 3 +- include/linux/netdevice.h | 91 ++++++++++++++++++++++++----------------- net/8021q/vlanproc.c | 13 +++--- net/core/dev.c | 19 +++++---- net/core/net-sysfs.c | 12 +++--- net/core/rtnetlink.c | 6 +-- 7 files changed, 90 insertions(+), 67 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index ac4f94b7da3..a95a41b74b4 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3804,20 +3804,21 @@ static int bond_close(struct net_device *bond_dev) return 0; } -static struct net_device_stats *bond_get_stats(struct net_device *bond_dev) +static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); - struct net_device_stats *stats = &bond_dev->stats; - struct net_device_stats local_stats; + struct rtnl_link_stats64 *stats = &bond_dev->stats64; + struct rtnl_link_stats64 local_stats; struct slave *slave; int i; - memset(&local_stats, 0, sizeof(struct net_device_stats)); + memset(&local_stats, 0, sizeof(local_stats)); read_lock_bh(&bond->lock); bond_for_each_slave(bond, slave, i) { - const struct net_device_stats *sstats = dev_get_stats(slave->dev); + const struct rtnl_link_stats64 *sstats = + dev_get_stats(slave->dev); local_stats.rx_packets += sstats->rx_packets; local_stats.rx_bytes += sstats->rx_bytes; @@ -4569,7 +4570,7 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_stop = bond_close, .ndo_start_xmit = bond_start_xmit, .ndo_select_queue = bond_select_queue, - .ndo_get_stats = bond_get_stats, + .ndo_get_stats64 = bond_get_stats, .ndo_do_ioctl = bond_do_ioctl, .ndo_set_multicast_list = bond_set_multicast_list, .ndo_change_mtu = bond_change_mtu, diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 85c812db5a3..7fcad2e1be3 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -4,7 +4,7 @@ #include #include -/* The struct should be in sync with struct net_device_stats */ +/* This struct should be in sync with struct rtnl_link_stats64 */ struct rtnl_link_stats { __u32 rx_packets; /* total packets received */ __u32 tx_packets; /* total packets transmitted */ @@ -37,6 +37,7 @@ struct rtnl_link_stats { __u32 tx_compressed; }; +/* The main device statistics structure */ struct rtnl_link_stats64 { __u64 rx_packets; /* total packets received */ __u64 tx_packets; /* total packets transmitted */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c319f28d699..4fbccc5f609 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -159,45 +159,49 @@ static inline bool dev_xmit_complete(int rc) #define MAX_HEADER (LL_MAX_HEADER + 48) #endif -#endif /* __KERNEL__ */ - /* - * Network device statistics. Akin to the 2.0 ether stats but - * with byte counters. + * Old network device statistics. Fields are native words + * (unsigned long) so they can be read and written atomically. + * Each field is padded to 64 bits for compatibility with + * rtnl_link_stats64. */ +#if BITS_PER_LONG == 64 +#define NET_DEVICE_STATS_DEFINE(name) unsigned long name +#elif defined(__LITTLE_ENDIAN) +#define NET_DEVICE_STATS_DEFINE(name) unsigned long name, pad_ ## name +#else +#define NET_DEVICE_STATS_DEFINE(name) unsigned long pad_ ## name, name +#endif + struct net_device_stats { - unsigned long rx_packets; /* total packets received */ - unsigned long tx_packets; /* total packets transmitted */ - unsigned long rx_bytes; /* total bytes received */ - unsigned long tx_bytes; /* total bytes transmitted */ - unsigned long rx_errors; /* bad packets received */ - unsigned long tx_errors; /* packet transmit problems */ - unsigned long rx_dropped; /* no space in linux buffers */ - unsigned long tx_dropped; /* no space available in linux */ - unsigned long multicast; /* multicast packets received */ - unsigned long collisions; - - /* detailed rx_errors: */ - unsigned long rx_length_errors; - unsigned long rx_over_errors; /* receiver ring buff overflow */ - unsigned long rx_crc_errors; /* recved pkt with crc error */ - unsigned long rx_frame_errors; /* recv'd frame alignment error */ - unsigned long rx_fifo_errors; /* recv'r fifo overrun */ - unsigned long rx_missed_errors; /* receiver missed packet */ - - /* detailed tx_errors */ - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - - /* for cslip etc */ - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEVICE_STATS_DEFINE(rx_packets); + NET_DEVICE_STATS_DEFINE(tx_packets); + NET_DEVICE_STATS_DEFINE(rx_bytes); + NET_DEVICE_STATS_DEFINE(tx_bytes); + NET_DEVICE_STATS_DEFINE(rx_errors); + NET_DEVICE_STATS_DEFINE(tx_errors); + NET_DEVICE_STATS_DEFINE(rx_dropped); + NET_DEVICE_STATS_DEFINE(tx_dropped); + NET_DEVICE_STATS_DEFINE(multicast); + NET_DEVICE_STATS_DEFINE(collisions); + NET_DEVICE_STATS_DEFINE(rx_length_errors); + NET_DEVICE_STATS_DEFINE(rx_over_errors); + NET_DEVICE_STATS_DEFINE(rx_crc_errors); + NET_DEVICE_STATS_DEFINE(rx_frame_errors); + NET_DEVICE_STATS_DEFINE(rx_fifo_errors); + NET_DEVICE_STATS_DEFINE(rx_missed_errors); + NET_DEVICE_STATS_DEFINE(tx_aborted_errors); + NET_DEVICE_STATS_DEFINE(tx_carrier_errors); + NET_DEVICE_STATS_DEFINE(tx_fifo_errors); + NET_DEVICE_STATS_DEFINE(tx_heartbeat_errors); + NET_DEVICE_STATS_DEFINE(tx_window_errors); + NET_DEVICE_STATS_DEFINE(rx_compressed); + NET_DEVICE_STATS_DEFINE(tx_compressed); }; +#endif /* __KERNEL__ */ + /* Media selection options. */ enum { @@ -662,10 +666,19 @@ struct netdev_rx_queue { * Callback uses when the transmitter has not made any progress * for dev->watchdog ticks. * + * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev); * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); * Called when a user wants to get the network device usage - * statistics. If not defined, the counters in dev->stats will - * be used. + * statistics. Drivers must do one of the following: + * 1. Define @ndo_get_stats64 to update a rtnl_link_stats64 structure + * (which should normally be dev->stats64) and return a ponter to + * it. The structure must not be changed asynchronously. + * 2. Define @ndo_get_stats to update a net_device_stats64 structure + * (which should normally be dev->stats) and return a pointer to + * it. The structure may be changed asynchronously only if each + * field is written atomically. + * 3. Update dev->stats asynchronously and atomically, and define + * neither operation. * * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); * If device support VLAN receive accleration @@ -720,6 +733,7 @@ struct net_device_ops { struct neigh_parms *); void (*ndo_tx_timeout) (struct net_device *dev); + struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); void (*ndo_vlan_rx_register)(struct net_device *dev, @@ -869,7 +883,10 @@ struct net_device { int ifindex; int iflink; - struct net_device_stats stats; + union { + struct rtnl_link_stats64 stats64; + struct net_device_stats stats; + }; #ifdef CONFIG_WIRELESS_EXT /* List of functions to handle Wireless Extensions (instead of ioctl). @@ -2121,7 +2138,7 @@ extern void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ extern void dev_load(struct net *net, const char *name); extern void dev_mcast_init(void); -extern const struct net_device_stats *dev_get_stats(struct net_device *dev); +extern const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev); extern void dev_txq_stats_fold(const struct net_device *dev, struct net_device_stats *stats); extern int netdev_max_backlog; diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index afead353e21..df56f5ce887 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -278,8 +278,9 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset) { struct net_device *vlandev = (struct net_device *) seq->private; const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); - const struct net_device_stats *stats; + const struct rtnl_link_stats64 *stats; static const char fmt[] = "%30s %12lu\n"; + static const char fmt64[] = "%30s %12llu\n"; int i; if (!is_vlan_dev(vlandev)) @@ -291,12 +292,12 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset) vlandev->name, dev_info->vlan_id, (int)(dev_info->flags & 1), vlandev->priv_flags); - seq_printf(seq, fmt, "total frames received", stats->rx_packets); - seq_printf(seq, fmt, "total bytes received", stats->rx_bytes); - seq_printf(seq, fmt, "Broadcast/Multicast Rcvd", stats->multicast); + seq_printf(seq, fmt64, "total frames received", stats->rx_packets); + seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); + seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast); seq_puts(seq, "\n"); - seq_printf(seq, fmt, "total frames transmitted", stats->tx_packets); - seq_printf(seq, fmt, "total bytes transmitted", stats->tx_bytes); + seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets); + seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes); seq_printf(seq, fmt, "total headroom inc", dev_info->cnt_inc_headroom_on_tx); seq_printf(seq, fmt, "total encap on xmit", diff --git a/net/core/dev.c b/net/core/dev.c index 277844901ce..a1abc10db08 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3701,10 +3701,10 @@ void dev_seq_stop(struct seq_file *seq, void *v) static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { - const struct net_device_stats *stats = dev_get_stats(dev); + const struct rtnl_link_stats64 *stats = dev_get_stats(dev); - seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, @@ -5281,18 +5281,21 @@ EXPORT_SYMBOL(dev_txq_stats_fold); * @dev: device to get statistics from * * Get network statistics from device. The device driver may provide - * its own method by setting dev->netdev_ops->get_stats; otherwise - * the internal statistics structure is used. + * its own method by setting dev->netdev_ops->get_stats64 or + * dev->netdev_ops->get_stats; otherwise the internal statistics + * structure is used. */ -const struct net_device_stats *dev_get_stats(struct net_device *dev) +const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_get_stats64) + return ops->ndo_get_stats64(dev); if (ops->ndo_get_stats) - return ops->ndo_get_stats(dev); + return (struct rtnl_link_stats64 *)ops->ndo_get_stats(dev); dev_txq_stats_fold(dev, &dev->stats); - return &dev->stats; + return &dev->stats64; } EXPORT_SYMBOL(dev_get_stats); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 99e7052d732..ea3bb4c3b87 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -29,6 +29,7 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; +static const char fmt_u64[] = "%llu\n"; static inline int dev_isalive(const struct net_device *dev) { @@ -324,14 +325,13 @@ static ssize_t netstat_show(const struct device *d, struct net_device *dev = to_net_dev(d); ssize_t ret = -EINVAL; - WARN_ON(offset > sizeof(struct net_device_stats) || - offset % sizeof(unsigned long) != 0); + WARN_ON(offset > sizeof(struct rtnl_link_stats64) || + offset % sizeof(u64) != 0); read_lock(&dev_base_lock); if (dev_isalive(dev)) { - const struct net_device_stats *stats = dev_get_stats(dev); - ret = sprintf(buf, fmt_ulong, - *(unsigned long *)(((u8 *) stats) + offset)); + const struct rtnl_link_stats64 *stats = dev_get_stats(dev); + ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset)); } read_unlock(&dev_base_lock); return ret; @@ -343,7 +343,7 @@ static ssize_t show_##name(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ - offsetof(struct net_device_stats, name)); \ + offsetof(struct rtnl_link_stats64, name)); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1a2af24e9e3..e645778e9b7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -579,7 +579,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, - const struct net_device_stats *b) + const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; @@ -610,7 +610,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->tx_compressed = b->tx_compressed; } -static void copy_rtnl_link_stats64(void *v, const struct net_device_stats *b) +static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) { struct rtnl_link_stats64 a; @@ -791,7 +791,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - const struct net_device_stats *stats; + const struct rtnl_link_stats64 *stats; struct nlattr *attr; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); -- cgit v1.2.3-70-g09d2 From f5c5440d40a24c5dc8030cde0a03debe87de4afb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Jun 2010 16:15:23 +0200 Subject: netfilter: nfnetlink_log: RCU conversion, part 2 - must use atomic_inc_not_zero() in instance_lookup_get() - must use hlist_add_head_rcu() instead of hlist_add_head() - must use hlist_del_rcu() instead of hlist_del() - Introduce NFULNL_COPY_DISABLED to stop lockless reader from using an instance, before we do final instance_put() on it. Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink_log.h | 1 + net/netfilter/nfnetlink_log.c | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h index d3bab7a2c9b..1d0b84aa1d4 100644 --- a/include/linux/netfilter/nfnetlink_log.h +++ b/include/linux/netfilter/nfnetlink_log.h @@ -89,6 +89,7 @@ enum nfulnl_attr_config { #define NFULNL_COPY_NONE 0x00 #define NFULNL_COPY_META 0x01 #define NFULNL_COPY_PACKET 0x02 +#define NFULNL_COPY_DISABLED 0x03 #define NFULNL_CFG_F_SEQ 0x0001 #define NFULNL_CFG_F_SEQ_GLOBAL 0x0002 diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 8ec23ec568e..fb86a51bb65 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -109,8 +109,8 @@ instance_lookup_get(u_int16_t group_num) rcu_read_lock_bh(); inst = __instance_lookup(group_num); - if (inst) - instance_get(inst); + if (inst && !atomic_inc_not_zero(&inst->use)) + inst = NULL; rcu_read_unlock_bh(); return inst; @@ -171,7 +171,7 @@ instance_create(u_int16_t group_num, int pid) inst->copy_mode = NFULNL_COPY_PACKET; inst->copy_range = NFULNL_COPY_RANGE_MAX; - hlist_add_head(&inst->hlist, + hlist_add_head_rcu(&inst->hlist, &instance_table[instance_hashfn(group_num)]); spin_unlock_bh(&instances_lock); @@ -185,18 +185,23 @@ out_unlock: static void __nfulnl_flush(struct nfulnl_instance *inst); +/* called with BH disabled */ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ - hlist_del(&inst->hlist); + hlist_del_rcu(&inst->hlist); /* then flush all pending packets from skb */ - spin_lock_bh(&inst->lock); + spin_lock(&inst->lock); + + /* lockless readers wont be able to use us */ + inst->copy_mode = NFULNL_COPY_DISABLED; + if (inst->skb) __nfulnl_flush(inst); - spin_unlock_bh(&inst->lock); + spin_unlock(&inst->lock); /* and finally put the refcount */ instance_put(inst); @@ -624,6 +629,7 @@ nfulnl_log_packet(u_int8_t pf, size += nla_total_size(data_len); break; + case NFULNL_COPY_DISABLED: default: goto unlock_and_release; } -- cgit v1.2.3-70-g09d2 From 5d22c89b9bea17a0e48e7534a9b237885e2c0809 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Jun 2010 10:21:40 +0200 Subject: mac80211: remove non-irqsafe aggregation callbacks The non-irqsafe aggregation start/stop done callbacks are currently only used by ath9k_htc, and can cause callbacks into the driver again. This might lead to locking issues, which will only get worse as we modify locking. To avoid trouble, remove the non-irqsafe versions and change ath9k_htc to use those instead. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 6 ++--- include/net/mac80211.h | 32 +++++---------------------- net/mac80211/agg-tx.c | 2 -- net/mac80211/ieee80211_i.h | 2 ++ 4 files changed, 10 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 7aefbc63877..8c463f5965f 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -510,13 +510,13 @@ void ath9k_htc_aggr_work(struct work_struct *work) ret = ath9k_htc_aggr_oper(priv, wk->vif, wk->sta_addr, wk->tid, true); if (!ret) - ieee80211_start_tx_ba_cb(wk->vif, wk->sta_addr, - wk->tid); + ieee80211_start_tx_ba_cb_irqsafe(wk->vif, wk->sta_addr, + wk->tid); break; case IEEE80211_AMPDU_TX_STOP: ath9k_htc_aggr_oper(priv, wk->vif, wk->sta_addr, wk->tid, false); - ieee80211_stop_tx_ba_cb(wk->vif, wk->sta_addr, wk->tid); + ieee80211_stop_tx_ba_cb_irqsafe(wk->vif, wk->sta_addr, wk->tid); break; default: ath_print(ath9k_hw_common(priv->ah), ATH_DBG_FATAL, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index abb3b1a9ddc..7f9401b3d3c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1445,7 +1445,7 @@ enum ieee80211_filter_flags { * * Note that drivers MUST be able to deal with a TX aggregation * session being stopped even before they OK'ed starting it by - * calling ieee80211_start_tx_ba_cb(_irqsafe), because the peer + * calling ieee80211_start_tx_ba_cb_irqsafe, because the peer * might receive the addBA frame and send a delBA right away! * * @IEEE80211_AMPDU_RX_START: start Rx aggregation @@ -2313,17 +2313,6 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, */ int ieee80211_start_tx_ba_session(struct ieee80211_sta *sta, u16 tid); -/** - * ieee80211_start_tx_ba_cb - low level driver ready to aggregate. - * @vif: &struct ieee80211_vif pointer from the add_interface callback - * @ra: receiver address of the BA session recipient. - * @tid: the TID to BA on. - * - * This function must be called by low level driver once it has - * finished with preparations for the BA session. - */ -void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); - /** * ieee80211_start_tx_ba_cb_irqsafe - low level driver ready to aggregate. * @vif: &struct ieee80211_vif pointer from the add_interface callback @@ -2331,8 +2320,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); * @tid: the TID to BA on. * * This function must be called by low level driver once it has - * finished with preparations for the BA session. - * This version of the function is IRQ-safe. + * finished with preparations for the BA session. It can be called + * from any context. */ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid); @@ -2350,17 +2339,6 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, */ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid); -/** - * ieee80211_stop_tx_ba_cb - low level driver ready to stop aggregate. - * @vif: &struct ieee80211_vif pointer from the add_interface callback - * @ra: receiver address of the BA session recipient. - * @tid: the desired TID to BA on. - * - * This function must be called by low level driver once it has - * finished with preparations for the BA session tear down. - */ -void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); - /** * ieee80211_stop_tx_ba_cb_irqsafe - low level driver ready to stop aggregate. * @vif: &struct ieee80211_vif pointer from the add_interface callback @@ -2368,8 +2346,8 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); * @tid: the desired TID to BA on. * * This function must be called by low level driver once it has - * finished with preparations for the BA session tear down. - * This version of the function is IRQ-safe. + * finished with preparations for the BA session tear down. It + * can be called from any context. */ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 7d8656d51c6..5a7ef51e302 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -479,7 +479,6 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid) spin_unlock_bh(&sta->lock); rcu_read_unlock(); } -EXPORT_SYMBOL(ieee80211_start_tx_ba_cb); void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid) @@ -619,7 +618,6 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) spin_unlock_bh(&sta->lock); rcu_read_unlock(); } -EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb); void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 71bdd8b3c3f..a3ae5130803 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1119,6 +1119,8 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_back_parties initiator); +void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); +void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3-70-g09d2 From 85ad181ea78861f69b007599cec9e6ba33fcdf8a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Jun 2010 10:21:49 +0200 Subject: mac80211: allow drivers to sleep in ampdu_action Allow drivers to sleep, and indicate this in the documentation. ath9k has some locking I don't understand, so keep it safe and disable BHs in it, all other drivers look fine with the context change. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/main.c | 4 ++++ include/net/mac80211.h | 2 +- net/mac80211/driver-ops.h | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index b8b76dd2c11..e1b8456f3d2 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1769,6 +1769,8 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw, struct ath_softc *sc = aphy->sc; int ret = 0; + local_bh_disable(); + switch (action) { case IEEE80211_AMPDU_RX_START: if (!(sc->sc_flags & SC_OP_RXAGGR)) @@ -1798,6 +1800,8 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw, "Unknown AMPDU action\n"); } + local_bh_enable(); + return ret; } diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7f9401b3d3c..bbae3d9b117 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1640,7 +1640,7 @@ enum ieee80211_ampdu_mlme_action { * is the first frame we expect to perform the action on. Notice * that TX/RX_STOP can pass NULL for this parameter. * Returns a negative error code on failure. - * The callback must be atomic. + * The callback can sleep. * * @get_survey: Return per-channel survey information * diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 7e86c6f89be..a4fcbcc4f45 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -352,11 +352,10 @@ static inline int drv_ampdu_action(struct ieee80211_local *local, might_sleep(); - local_bh_disable(); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, sta, tid, ssn); - local_bh_enable(); + trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, ret); return ret; } -- cgit v1.2.3-70-g09d2 From fbd2c8dcbc69616d2e15b8a269a86b3a05d45aea Mon Sep 17 00:00:00 2001 From: Teemu Paasikivi Date: Mon, 14 Jun 2010 12:55:31 +0300 Subject: mac80211: Set basic rates while joining ibss network This patch adds support to nl80211 and mac80211 to set basic rates when joining/creating ibss network. Original patch was posted by Johannes Berg on the linux-wireless posting list. Signed-off-by: Johannes Berg Signed-off-by: Teemu Paasikivi Signed-off-by: John W. Linville --- include/net/cfg80211.h | 2 ++ net/mac80211/ibss.c | 4 +++- net/mac80211/ieee80211_i.h | 2 ++ net/wireless/nl80211.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 22ab9d88cf4..64374f4cb7c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -810,6 +810,7 @@ struct cfg80211_disassoc_request { * @beacon_interval: beacon interval to use * @privacy: this is a protected network, keys will be configured * after joining + * @basic_rates: bitmap of basic rates to use when creating the IBSS */ struct cfg80211_ibss_params { u8 *ssid; @@ -818,6 +819,7 @@ struct cfg80211_ibss_params { u8 *ie; u8 ssid_len, ie_len; u16 beacon_interval; + u32 basic_rates; bool channel_fixed; bool privacy; }; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index bfd7286488c..9f4e64ed8b8 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -172,6 +172,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, rcu_assign_pointer(ifibss->presp, skb); sdata->vif.bss_conf.beacon_int = beacon_int; + sdata->vif.bss_conf.basic_rates = basic_rates; bss_change = BSS_CHANGED_BEACON_INT; bss_change |= ieee80211_reset_erp_info(sdata); bss_change |= BSS_CHANGED_BSSID; @@ -529,7 +530,7 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) sdata->drop_unencrypted = 0; __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, - ifibss->channel, 3, /* first two are basic */ + ifibss->channel, ifibss->basic_rates, capability, 0); } @@ -859,6 +860,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.fixed_bssid = false; sdata->u.ibss.privacy = params->privacy; + sdata->u.ibss.basic_rates = params->basic_rates; sdata->vif.bss_conf.beacon_int = params->beacon_interval; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9d753a02a2e..c3c2be3f8a2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -387,6 +387,8 @@ struct ieee80211_if_ibss { unsigned long request; unsigned long last_scan_completed; + u32 basic_rates; + bool timer_running; bool fixed_bssid; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c65e67e9231..41529aca794 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3955,6 +3955,55 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + u8 *rates = + nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + int n_rates = + nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + struct ieee80211_supported_band *sband = + wiphy->bands[ibss.channel->band]; + int i, j; + + if (n_rates == 0) { + err = -EINVAL; + goto out; + } + + for (i = 0; i < n_rates; i++) { + int rate = (rates[i] & 0x7f) * 5; + bool found = false; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) { + found = true; + ibss.basic_rates |= BIT(j); + break; + } + } + if (!found) { + err = -EINVAL; + goto out; + } + } + } else { + /* + * If no rates were explicitly configured, + * use the mandatory rate set for 11b or + * 11a for maximum compatibility. + */ + struct ieee80211_supported_band *sband = + wiphy->bands[ibss.channel->band]; + int j; + u32 flag = ibss.channel->band == IEEE80211_BAND_5GHZ ? + IEEE80211_RATE_MANDATORY_A : + IEEE80211_RATE_MANDATORY_B; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].flags & flag) + ibss.basic_rates |= BIT(j); + } + } + err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); out: -- cgit v1.2.3-70-g09d2 From 685429623f88d84f98bd5daffc3c427c408740d4 Mon Sep 17 00:00:00 2001 From: Juuso Oikarinen Date: Wed, 9 Jun 2010 13:43:26 +0300 Subject: mac80211: Fix circular locking dependency in ARP filter handling There is a circular locking dependency when configuring the hardware ARP filters on association, occurring when flushing the mac80211 workqueue. This is what happens: [ 92.026800] ======================================================= [ 92.030507] [ INFO: possible circular locking dependency detected ] [ 92.030507] 2.6.34-04781-g2b2c009 #85 [ 92.030507] ------------------------------------------------------- [ 92.030507] modprobe/5225 is trying to acquire lock: [ 92.030507] ((wiphy_name(local->hw.wiphy))){+.+.+.}, at: [] flush_workq ueue+0x0/0xb0 [ 92.030507] [ 92.030507] but task is already holding lock: [ 92.030507] (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x12/0x20 [ 92.030507] [ 92.030507] which lock already depends on the new lock. [ 92.030507] [ 92.030507] [ 92.030507] the existing dependency chain (in reverse order) is: [ 92.030507] [ 92.030507] -> #2 (rtnl_mutex){+.+.+.}: [ 92.030507] [] lock_acquire+0xdb/0x110 [ 92.030507] [] mutex_lock_nested+0x44/0x300 [ 92.030507] [] rtnl_lock+0x12/0x20 [ 92.030507] [] ieee80211_assoc_done+0x6c/0xe0 [mac80211] [ 92.030507] [] ieee80211_work_work+0x31d/0x1280 [mac80211] [ 92.030507] -> #1 ((&local->work_work)){+.+.+.}: [ 92.030507] [] lock_acquire+0xdb/0x110 [ 92.030507] [] worker_thread+0x22a/0x370 [ 92.030507] [] kthread+0x96/0xb0 [ 92.030507] [] kernel_thread_helper+0x4/0x10 [ 92.030507] [ 92.030507] -> #0 ((wiphy_name(local->hw.wiphy))){+.+.+.}: [ 92.030507] [] __lock_acquire+0x1c0c/0x1d50 [ 92.030507] [] lock_acquire+0xdb/0x110 [ 92.030507] [] flush_workqueue+0x4e/0xb0 [ 92.030507] [] ieee80211_stop_device+0x2b/0xb0 [mac80211] [ 92.030507] [] ieee80211_stop+0x3e5/0x680 [mac80211] The locking in this case is quite complex. Fix the problem by rewriting the way the hardware ARP filter list is handled - i.e. make a copy of the address list to the bss_conf struct, and provide that list to the hardware driver when needed. The current patch will enable filtering also in promiscuous mode. This may need to be changed in the future. Reported-by: Reinette Chatre Signed-off-by: Juuso Oikarinen Signed-off-by: John W. Linville --- include/net/mac80211.h | 35 +++++++++++++++++------------ net/mac80211/driver-ops.h | 17 -------------- net/mac80211/driver-trace.h | 22 ------------------ net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/iface.c | 3 +++ net/mac80211/main.c | 54 ++++++++++++++++++++++++++++----------------- net/mac80211/mlme.c | 34 +++++++++++++++------------- 7 files changed, 79 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bbae3d9b117..3a47877f496 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -19,7 +19,6 @@ #include #include #include -#include #include /** @@ -147,6 +146,7 @@ struct ieee80211_low_level_stats { * enabled/disabled (beaconing modes) * @BSS_CHANGED_CQM: Connection quality monitor config changed * @BSS_CHANGED_IBSS: IBSS join status changed + * @BSS_CHANGED_ARP_FILTER: Hardware ARP filter address list or state changed. */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -161,10 +161,18 @@ enum ieee80211_bss_change { BSS_CHANGED_BEACON_ENABLED = 1<<9, BSS_CHANGED_CQM = 1<<10, BSS_CHANGED_IBSS = 1<<11, + BSS_CHANGED_ARP_FILTER = 1<<12, /* when adding here, make sure to change ieee80211_reconfig */ }; +/* + * The maximum number of IPv4 addresses listed for ARP filtering. If the number + * of addresses for an interface increase beyond this value, hardware ARP + * filtering will be disabled. + */ +#define IEEE80211_BSS_ARP_ADDR_LIST_LEN 4 + /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * @@ -200,6 +208,15 @@ enum ieee80211_bss_change { * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value * implies disabled * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis + * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The + * may filter ARP queries targeted for other addresses than listed here. + * The driver must allow ARP queries targeted for all address listed here + * to pass through. An empty list implies no ARP queries need to pass. + * @arp_addr_cnt: Number of addresses currently on the list. + * @arp_filter_enabled: Enable ARP filtering - if enabled, the hardware may + * filter ARP queries based on the @arp_addr_list, if disabled, the + * hardware must not perform any ARP filtering. Note, that the filter will + * be enabled also in promiscuous mode. */ struct ieee80211_bss_conf { const u8 *bssid; @@ -220,6 +237,9 @@ struct ieee80211_bss_conf { s32 cqm_rssi_thold; u32 cqm_rssi_hyst; enum nl80211_channel_type channel_type; + __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; + u8 arp_addr_cnt; + bool arp_filter_enabled; }; /** @@ -1529,16 +1549,6 @@ enum ieee80211_ampdu_mlme_action { * of the bss parameters has changed when a call is made. The callback * can sleep. * - * @configure_arp_filter: Configuration function for hardware ARP query filter. - * This function is called with all the IP addresses configured to the - * interface as argument - all ARP queries targeted to any of these - * addresses must pass through. If the hardware filter does not support - * enought addresses, hardware filtering must be disabled. The ifa_list - * argument may be NULL, indicating that filtering must be disabled. - * This function is called upon association complete with current - * address(es), and while associated whenever the IP address(es) change. - * The callback can sleep. - * * @prepare_multicast: Prepare for multicast filter configuration. * This callback is optional, and its return value is passed * to configure_filter(). This callback must be atomic. @@ -1678,9 +1688,6 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u32 changed); - int (*configure_arp_filter)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct in_ifaddr *ifa_list); u64 (*prepare_multicast)(struct ieee80211_hw *hw, struct netdev_hw_addr_list *mc_list); void (*configure_filter)(struct ieee80211_hw *hw, diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 965d64f6856..c33317320ee 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -89,23 +89,6 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, trace_drv_return_void(local); } -struct in_ifaddr; -static inline int drv_configure_arp_filter(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct in_ifaddr *ifa_list) -{ - int ret = 0; - - might_sleep(); - - trace_drv_configure_arp_filter(local, vif_to_sdata(vif)); - if (local->ops->configure_arp_filter) - ret = local->ops->configure_arp_filter(&local->hw, vif, - ifa_list); - trace_drv_return_int(local, ret); - return ret; -} - static inline u64 drv_prepare_multicast(struct ieee80211_local *local, struct netdev_hw_addr_list *mc_list) { diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index 06444ea67bc..8da31caff93 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -251,28 +251,6 @@ TRACE_EVENT(drv_bss_info_changed, ) ); -TRACE_EVENT(drv_configure_arp_filter, - TP_PROTO(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata), - - TP_ARGS(local, sdata), - - TP_STRUCT__entry( - LOCAL_ENTRY - VIF_ENTRY - ), - - TP_fast_assign( - LOCAL_ASSIGN; - VIF_ASSIGN; - ), - - TP_printk( - VIF_PR_FMT LOCAL_PR_FMT, - VIF_PR_ARG, LOCAL_PR_ARG - ) -); - TRACE_EVENT(drv_prepare_multicast, TP_PROTO(struct ieee80211_local *local, int mc_count), diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c3c2be3f8a2..9b3c3f971d2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -514,6 +514,8 @@ struct ieee80211_sub_if_data { struct work_struct work; struct sk_buff_head skb_queue; + bool arp_filter_state; + /* * AP this belongs to: self in AP mode and * corresponding AP in VLAN mode, NULL for diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 490be2f3af2..910729fc18c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1076,6 +1076,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sdata->wdev.wiphy = local->hw.wiphy; sdata->local = local; sdata->dev = ndev; +#ifdef CONFIG_INET + sdata->arp_filter_state = true; +#endif for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) skb_queue_head_init(&sdata->fragments[i].skb_list); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index c2e46e88f3c..a1bf46c64b9 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -317,23 +318,6 @@ static void ieee80211_recalc_smps_work(struct work_struct *work) } #ifdef CONFIG_INET -int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata) -{ - struct in_device *idev; - int ret = 0; - - BUG_ON(!sdata); - ASSERT_RTNL(); - - idev = sdata->dev->ip_ptr; - if (!idev) - return 0; - - ret = drv_configure_arp_filter(sdata->local, &sdata->vif, - idev->ifa_list); - return ret; -} - static int ieee80211_ifa_changed(struct notifier_block *nb, unsigned long data, void *arg) { @@ -343,8 +327,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, ifa_notifier); struct net_device *ndev = ifa->ifa_dev->dev; struct wireless_dev *wdev = ndev->ieee80211_ptr; + struct in_device *idev; struct ieee80211_sub_if_data *sdata; + struct ieee80211_bss_conf *bss_conf; struct ieee80211_if_managed *ifmgd; + int c = 0; if (!netif_running(ndev)) return NOTIFY_DONE; @@ -356,17 +343,44 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, if (wdev->wiphy != local->hw.wiphy) return NOTIFY_DONE; - /* We are concerned about IP addresses only when associated */ sdata = IEEE80211_DEV_TO_SUB_IF(ndev); + bss_conf = &sdata->vif.bss_conf; /* ARP filtering is only supported in managed mode */ if (sdata->vif.type != NL80211_IFTYPE_STATION) return NOTIFY_DONE; + idev = sdata->dev->ip_ptr; + if (!idev) + return NOTIFY_DONE; + ifmgd = &sdata->u.mgd; mutex_lock(&ifmgd->mtx); - if (ifmgd->associated) - ieee80211_set_arp_filter(sdata); + + /* Copy the addresses to the bss_conf list */ + ifa = idev->ifa_list; + while (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN && ifa) { + bss_conf->arp_addr_list[c] = ifa->ifa_address; + ifa = ifa->ifa_next; + c++; + } + + /* If not all addresses fit the list, disable filtering */ + if (ifa) { + sdata->arp_filter_state = false; + c = 0; + } else { + sdata->arp_filter_state = true; + } + bss_conf->arp_addr_cnt = c; + + /* Configure driver only if associated */ + if (ifmgd->associated) { + bss_conf->arp_filter_enabled = sdata->arp_filter_state; + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_ARP_FILTER); + } + mutex_unlock(&ifmgd->mtx); return NOTIFY_DONE; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 583b34686a2..74479c2d12d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -806,11 +806,12 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, { struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; bss_info_changed |= BSS_CHANGED_ASSOC; /* set timing information */ - sdata->vif.bss_conf.beacon_int = cbss->beacon_interval; - sdata->vif.bss_conf.timestamp = cbss->tsf; + bss_conf->beacon_int = cbss->beacon_interval; + bss_conf->timestamp = cbss->tsf; bss_info_changed |= BSS_CHANGED_BEACON_INT; bss_info_changed |= ieee80211_handle_bss_capability(sdata, @@ -835,7 +836,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, ieee80211_led_assoc(local, 1); - sdata->vif.bss_conf.assoc = 1; + bss_conf->assoc = 1; /* * For now just always ask the driver to update the basic rateset * when we have associated, we aren't checking whether it actually @@ -848,9 +849,15 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, /* Tell the driver to monitor connection quality (if supported) */ if ((local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI) && - sdata->vif.bss_conf.cqm_rssi_thold) + bss_conf->cqm_rssi_thold) bss_info_changed |= BSS_CHANGED_CQM; + /* Enable ARP filtering */ + if (bss_conf->arp_filter_enabled != sdata->arp_filter_state) { + bss_conf->arp_filter_enabled = sdata->arp_filter_state; + bss_info_changed |= BSS_CHANGED_ARP_FILTER; + } + ieee80211_bss_info_change_notify(sdata, bss_info_changed); mutex_lock(&local->iflist_mtx); @@ -932,6 +939,12 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_hw_config(local, config_changed); + /* Disable ARP filtering */ + if (sdata->vif.bss_conf.arp_filter_enabled) { + sdata->vif.bss_conf.arp_filter_enabled = false; + changed |= BSS_CHANGED_ARP_FILTER; + } + /* The BSSID (not really interesting) and HT changed */ changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT; ieee80211_bss_info_change_notify(sdata, changed); @@ -2018,18 +2031,9 @@ static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk, cfg80211_send_assoc_timeout(wk->sdata->dev, wk->filter_ta); return WORK_DONE_DESTROY; - } else { - mutex_unlock(&wk->sdata->u.mgd.mtx); -#ifdef CONFIG_INET - /* - * configure ARP filter IP addresses to the driver, - * intentionally outside the mgd mutex. - */ - rtnl_lock(); - ieee80211_set_arp_filter(wk->sdata); - rtnl_unlock(); -#endif } + + mutex_unlock(&wk->sdata->u.mgd.mtx); } cfg80211_send_rx_assoc(wk->sdata->dev, skb->data, skb->len); -- cgit v1.2.3-70-g09d2 From 551d55a944b143ef26fbd482d1c463199d6f65cf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 17 Apr 2010 08:48:42 -0400 Subject: tree/tiny rcu: Add debug RCU head objects Helps finding racy users of call_rcu(), which results in hangs because list entries are overwritten and/or skipped. Changelog since v4: - Bissectability is now OK - Now generate a WARN_ON_ONCE() for non-initialized rcu_head passed to call_rcu(). Statically initialized objects are detected with object_is_static(). - Rename rcu_head_init_on_stack to init_rcu_head_on_stack. - Remove init_rcu_head() completely. Changelog since v3: - Include comments from Lai Jiangshan This new patch version is based on the debugobjects with the newly introduced "active state" tracker. Non-initialized entries are all considered as "statically initialized". An activation fixup (triggered by call_rcu()) takes care of performing the debug object initialization without issuing any warning. Since we cannot increase the size of struct rcu_head, I don't see much room to put an identifier for statically initialized rcu_head structures. So for now, we have to live without "activation without explicit init" detection. But the main purpose of this debug option is to detect double-activations (double call_rcu() use of a rcu_head before the callback is executed), which is correctly addressed here. This also detects potential internal RCU callback corruption, which would cause the callbacks to be executed twice. Signed-off-by: Mathieu Desnoyers CC: David S. Miller CC: "Paul E. McKenney" CC: akpm@linux-foundation.org CC: mingo@elte.hu CC: laijs@cn.fujitsu.com CC: dipankar@in.ibm.com CC: josh@joshtriplett.org CC: dvhltc@us.ibm.com CC: niv@us.ibm.com CC: tglx@linutronix.de CC: peterz@infradead.org CC: rostedt@goodmis.org CC: Valdis.Kletnieks@vt.edu CC: dhowells@redhat.com CC: eric.dumazet@gmail.com CC: Alexey Dobriyan Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- include/linux/rcupdate.h | 49 +++++++++++++++ kernel/rcupdate.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny.c | 2 + kernel/rcutree.c | 2 + lib/Kconfig.debug | 6 ++ 5 files changed, 219 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b653b4aaa8a..2b7fc506e47 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ @@ -79,6 +80,16 @@ extern void rcu_init(void); (ptr)->next = NULL; (ptr)->func = NULL; \ } while (0) +/* + * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic + * initialization and destruction of rcu_head on the stack. rcu_head structures + * allocated dynamically in the heap or defined statically don't need any + * initialization. + */ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +extern void init_rcu_head_on_stack(struct rcu_head *head); +extern void destroy_rcu_head_on_stack(struct rcu_head *head); +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head_on_stack(struct rcu_head *head) { } @@ -86,6 +97,7 @@ static inline void init_rcu_head_on_stack(struct rcu_head *head) static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -517,4 +529,41 @@ extern void call_rcu(struct rcu_head *head, extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); +/* + * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally + * by call_rcu() and rcu callback execution, and are therefore not part of the + * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. + */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +# define STATE_RCU_HEAD_READY 0 +# define STATE_RCU_HEAD_QUEUED 1 + +extern struct debug_obj_descr rcuhead_debug_descr; + +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ + debug_object_activate(head, &rcuhead_debug_descr); + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_READY, + STATE_RCU_HEAD_QUEUED); +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_QUEUED, + STATE_RCU_HEAD_READY); + debug_object_deactivate(head, &rcuhead_debug_descr); +} +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ +} +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f..4d169835fb3 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) } EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); #endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd23..196ec02f8be 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706..d5bc43976c5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e722e9d6222..142faa2ec66 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -307,6 +307,12 @@ config DEBUG_OBJECTS_WORK work queue routines to track the life time of work objects and validate the work operations. +config DEBUG_OBJECTS_RCU_HEAD + bool "Debug RCU callbacks objects" + depends on DEBUG_OBJECTS && PREEMPT + help + Enable this to turn on debugging of RCU list heads (call_rcu() usage). + config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 -- cgit v1.2.3-70-g09d2 From f5155b33277c9678041a27869165619bb34f722f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Apr 2010 06:42:01 -0700 Subject: rcu: add an rcu_dereference_index_check() The sparse RCU-pointer checking relies on type magic that dereferences the pointer in question. This does not work if the pointer is in fact an array index. This commit therefore supplies a new RCU API that omits the sparse checking to continue to support rcu_dereference() on integers. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2b7fc506e47..9fbc54a2585 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -566,4 +566,37 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +#ifndef CONFIG_PROVE_RCU +#define __do_rcu_dereference_check(c) do { } while (0) +#endif /* #ifdef CONFIG_PROVE_RCU */ + +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + __do_rcu_dereference_check(c); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) + +/** + * rcu_dereference_index_check() - rcu_dereference for indices with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * Similar to rcu_dereference_check(), but omits the sparse checking. + * This allows rcu_dereference_index_check() to be used on integers, + * which can then be used as array indices. Attempting to use + * rcu_dereference_check() on an integer will give compiler warnings + * because the sparse address-space mechanism relies on dereferencing + * the RCU-protected pointer. Dereferencing integers is not something + * that even gcc will put up with. + * + * Note that this function does not implicitly check for RCU read-side + * critical sections. If this function gains lots of uses, it might + * make sense to provide versions for each flavor of RCU, but it does + * not make sense as of early 2010. + */ +#define rcu_dereference_index_check(p, c) \ + __rcu_dereference_index_check((p), (c)) + #endif /* __LINUX_RCUPDATE_H */ -- cgit v1.2.3-70-g09d2 From 71d1d5c722db9ae3b3f9c08ef7ddcd7759fbb1e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 May 2010 16:13:14 -0700 Subject: rcu: add __rcu API for later sparse checking This commit defines an __rcu API, but provides only vacuous definitions for it. This breaks dependencies among most of the subsequent patches, allowing them to reach mainline asynchronously via whatever trees are appropriate. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Christopher Li Cc: Josh Triplett --- include/linux/compiler.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a5a472b1074..c1a62c56a66 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -16,6 +16,7 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +# define __rcu extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else @@ -34,6 +35,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __release(x) (void)0 # define __cond_lock(x,c) (c) # define __percpu +# define __rcu #endif #ifdef __KERNEL__ -- cgit v1.2.3-70-g09d2 From a25909a4d4a29e272f953e12595bf2f04a292dbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 May 2010 12:32:28 -0700 Subject: lockdep: Add an in_workqueue_context() lockdep-based test function Some recent uses of RCU make use of workqueues. In these uses, execution within the context of a specific workqueue takes the place of the usual RCU read-side primitives such as rcu_read_lock(), and flushing of workqueues takes the place of the usual RCU grace-period primitives. Checking for correct use of rcu_dereference() in such cases requires a test of whether the code is executing in the context of a particular workqueue. This commit adds an in_workqueue_context() function that provides this test. This new function is only defined when lockdep is enabled, which allows it to be used as the second argument of rcu_dereference_check(). Signed-off-by: Paul E. McKenney --- include/linux/workqueue.h | 4 ++++ kernel/workqueue.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9466e860d8c..d0f7c817849 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,4 +297,8 @@ static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) #else long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ + +#ifdef CONFIG_LOCKDEP +int in_workqueue_context(struct workqueue_struct *wq); +#endif #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb445..59fef1531dd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,21 @@ struct workqueue_struct { #endif }; +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; -- cgit v1.2.3-70-g09d2 From 2c666df80764389886110c942a7916ba9622583d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Apr 2010 20:48:47 -0700 Subject: vfs: add fs.h to define struct file The sparse RCU-pointer annotations require definition of the underlying type of any pointer passed to rcu_dereference() and friends. So fcheck_files() needs "struct file" to be defined, so include fs.h. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Al Viro --- include/linux/fdtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 013dc529e95..551671e8792 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -11,6 +11,7 @@ #include #include #include +#include #include -- cgit v1.2.3-70-g09d2 From 81bdf5bd7349bd4523538cbd7878f334bc2bfe14 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 May 2010 18:10:06 -0700 Subject: net: Make accesses to ->br_port safe for sparse RCU The new versions of the rcu_dereference() APIs requires that any pointers passed to one of these APIs be fully defined. The ->br_port field in struct net_device points to a struct net_bridge_port, which is an incomplete type. This commit therefore changes ->br_port to be a void*, and introduces a br_port() helper function to convert the type to struct net_bridge_port, and applies this new helper function where required. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: David Miller Cc: Stephen Hemminger Cc: Eric Dumazet --- include/linux/if_bridge.h | 3 +++ net/bridge/br_fdb.c | 2 +- net/bridge/br_private.h | 5 +++++ net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_ulog.c | 4 ++-- net/bridge/netfilter/ebtables.c | 4 ++-- net/netfilter/nfnetlink_log.c | 4 ++-- net/netfilter/nfnetlink_queue.c | 4 ++-- 8 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 938b7e81df9..d001d782922 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -101,6 +101,9 @@ struct __fdb_entry { #include +/* br_handle_frame_hook() needs the following forward declaration. */ +struct net_bridge_port; + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 26637439965..845710bca49 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -246,7 +246,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) return 0; rcu_read_lock(); - fdb = __br_fdb_get(dev->br_port->br, addr); + fdb = __br_fdb_get(br_port(dev)->br, addr); ret = fdb && fdb->dst->dev != dev && fdb->dst->state == BR_STATE_FORWARDING; rcu_read_unlock(); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0f4a74bc6a9..3255188355b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -268,6 +268,11 @@ static inline int br_is_root_bridge(const struct net_bridge *br) return !memcmp(&br->bridge_id, &br->designated_root, 8); } +static inline struct net_bridge_port *br_port(const struct net_device *dev) +{ + return rcu_dereference(dev->br_port); +} + /* br_device.c */ extern void br_dev_setup(struct net_device *dev); extern netdev_tx_t br_dev_xmit(struct sk_buff *skb, diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 9e19166ba45..a39df0ae0f8 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -25,7 +25,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) if (par->hooknum != NF_BR_BROUTING) memcpy(eth_hdr(skb)->h_dest, - par->in->br_port->br->dev->dev_addr, ETH_ALEN); + br_port(par->in)->br->dev->dev_addr, ETH_ALEN); else memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN); skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index ae3c7cef148..5a4996bbb09 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -178,7 +178,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, strcpy(pm->physindev, in->name); /* If in isn't a bridge, then physindev==indev */ if (in->br_port) - strcpy(pm->indev, in->br_port->br->dev->name); + strcpy(pm->indev, br_port(in)->br->dev->name); else strcpy(pm->indev, in->name); } else @@ -187,7 +187,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, if (out) { /* If out exists, then out is a bridge port */ strcpy(pm->physoutdev, out->name); - strcpy(pm->outdev, out->br_port->br->dev->name); + strcpy(pm->outdev, br_port(out)->br->dev->name); } else pm->outdev[0] = pm->physoutdev[0] = '\0'; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 59ca00e40de..4c2aab8cbfc 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -141,10 +141,10 @@ ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h, if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) return 1; if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check( - e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN)) + e->logical_in, br_port(in)->br->dev), EBT_ILOGICALIN)) return 1; if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check( - e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT)) + e->logical_out, br_port(out)->br->dev), EBT_ILOGICALOUT)) return 1; if (e->bitmask & EBT_SOURCEMAC) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index fc9a211e629..78957cfa3bd 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -404,7 +404,7 @@ __build_packet_message(struct nfulnl_instance *inst, htonl(indev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(indev->br_port->br->dev->ifindex)); + htonl(br_port(indev)->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -431,7 +431,7 @@ __build_packet_message(struct nfulnl_instance *inst, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(outdev->br_port->br->dev->ifindex)); + htonl(br_port(outdev)->br->dev->ifindex)); } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 12e1ab37fcd..c3c17498298 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -297,7 +297,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, htonl(indev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, - htonl(indev->br_port->br->dev->ifindex)); + htonl(br_port(indev)->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -322,7 +322,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, - htonl(outdev->br_port->br->dev->ifindex)); + htonl(br_port(outdev)->br->dev->ifindex)); } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ -- cgit v1.2.3-70-g09d2 From 1be3b5fe9dffe3300f995584d8f996dd20e29412 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 14 Jun 2010 08:53:26 +0000 Subject: ethtool: Revert incorrect indentation changes commit 97f8aefbbfb5aa5c9944e5fa8149f1fdaf71c7b6 "net: fix ethtool coding style errors and warnings" changed the indentation of several macro definitions in ethtool.h. These definitions line up in the diff where there is an extra character at the start of each line, but not in the resulting file. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/ethtool.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 276b40a1683..2c8af093d8b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -586,29 +586,29 @@ struct ethtool_ops { #define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ #define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ #define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ -#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ -#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ #define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ #define ETHTOOL_GLINK 0x0000000a /* Get link status (ethtool_value) */ -#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ -#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ #define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ #define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ #define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ #define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */ #define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ #define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ -#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ -#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ -#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ -#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ #define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable * (ethtool_value) */ #define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable * (ethtool_value). */ #define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ #define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ -#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ #define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ #define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ #define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ @@ -619,8 +619,8 @@ struct ethtool_ops { #define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ -#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ -#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ +#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ +#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ #define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ #define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ @@ -645,18 +645,18 @@ struct ethtool_ops { /* Indicates what features are supported by the interface. */ #define SUPPORTED_10baseT_Half (1 << 0) #define SUPPORTED_10baseT_Full (1 << 1) -#define SUPPORTED_100baseT_Half (1 << 2) -#define SUPPORTED_100baseT_Full (1 << 3) +#define SUPPORTED_100baseT_Half (1 << 2) +#define SUPPORTED_100baseT_Full (1 << 3) #define SUPPORTED_1000baseT_Half (1 << 4) #define SUPPORTED_1000baseT_Full (1 << 5) #define SUPPORTED_Autoneg (1 << 6) #define SUPPORTED_TP (1 << 7) #define SUPPORTED_AUI (1 << 8) #define SUPPORTED_MII (1 << 9) -#define SUPPORTED_FIBRE (1 << 10) +#define SUPPORTED_FIBRE (1 << 10) #define SUPPORTED_BNC (1 << 11) #define SUPPORTED_10000baseT_Full (1 << 12) -#define SUPPORTED_Pause (1 << 13) +#define SUPPORTED_Pause (1 << 13) #define SUPPORTED_Asym_Pause (1 << 14) #define SUPPORTED_2500baseX_Full (1 << 15) #define SUPPORTED_Backplane (1 << 16) @@ -666,8 +666,8 @@ struct ethtool_ops { #define SUPPORTED_10000baseR_FEC (1 << 20) /* Indicates what features are advertised by the interface. */ -#define ADVERTISED_10baseT_Half (1 << 0) -#define ADVERTISED_10baseT_Full (1 << 1) +#define ADVERTISED_10baseT_Half (1 << 0) +#define ADVERTISED_10baseT_Full (1 << 1) #define ADVERTISED_100baseT_Half (1 << 2) #define ADVERTISED_100baseT_Full (1 << 3) #define ADVERTISED_1000baseT_Half (1 << 4) @@ -706,12 +706,12 @@ struct ethtool_ops { #define DUPLEX_FULL 0x01 /* Which connector port. */ -#define PORT_TP 0x00 +#define PORT_TP 0x00 #define PORT_AUI 0x01 #define PORT_MII 0x02 #define PORT_FIBRE 0x03 #define PORT_BNC 0x04 -#define PORT_DA 0x05 +#define PORT_DA 0x05 #define PORT_NONE 0xef #define PORT_OTHER 0xff @@ -725,7 +725,7 @@ struct ethtool_ops { /* Enable or disable autonegotiation. If this is set to enable, * the forced link modes above are completely ignored. */ -#define AUTONEG_DISABLE 0x00 +#define AUTONEG_DISABLE 0x00 #define AUTONEG_ENABLE 0x01 /* Mode MDI or MDI-X */ -- cgit v1.2.3-70-g09d2 From 0902b469bd25065aa0688c3cee6f11744c817e7c Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Tue, 15 Jun 2010 15:04:00 +0200 Subject: netfilter: xtables: idletimer target implementation This patch implements an idletimer Xtables target that can be used to identify when interfaces have been idle for a certain period of time. Timers are identified by labels and are created when a rule is set with a new label. The rules also take a timeout value (in seconds) as an option. If more than one rule uses the same timer label, the timer will be restarted whenever any of the rules get a hit. One entry for each timer is created in sysfs. This attribute contains the timer remaining for the timer to expire. The attributes are located under the xt_idletimer class: /sys/class/xt_idletimer/timers/