From a635f9dd83f3382577f4544a96df12356e951a40 Mon Sep 17 00:00:00 2001
From: Jiri Kosina
Date: Fri, 12 Jun 2009 15:20:55 +0200
Subject: HID: use debugfs for report dumping descriptor

It is a little bit inconvenient for people who have some non-standard HID hardware (usually violating the HID specification) to have to recompile the kernel with CONFIG_HID_DEBUG to be able to see the kernel's perspective of the HID report descriptor and observe the parsed events. Plus the messages are then inconveniently mixed up with the rest of the dmesg output.

This patch implements a /sys/kernel/debug/hid/<device>/rdesc file, which represents the kernel's view of the report descriptor (both the raw report descriptor data and the parsed contents).

With all the device-specific debug data available through debugfs, there is no need to keep CONFIG_HID_DEBUG: the 'debug' parameter to the hid module now enables only driver-specific debugging messages, which have an absolutely minimal memory footprint, just a few error messages and one global flag (hid_debug).

We use the current set of output formatting functions. The ones that serve both the one-shot 'rdesc' seq_file and the continuous flow of data (individual reports, as sent by the device) distinguish the two cases by the seq_file parameter they are passed: if it is NULL, they still output to the kernel ring buffer, otherwise the corresponding seq_file is used for output. The format of the output is preserved.

Signed-off-by: Jiri Kosina
---
drivers/hid/Kconfig | 15 --- drivers/hid/Makefile | 5 +- drivers/hid/hid-core.c | 13 ++- drivers/hid/hid-debug.c | 227 +++++++++++++++++++++++++++++------------- drivers/hid/hid-input.c | 13 +-- drivers/hid/usbhid/hid-core.c | 8 +- include/linux/hid-debug.h | 32 +++--- include/linux/hid.h | 4 + 8 files changed, 195 insertions(+), 122 deletions(-)
(limited to 'include/linux')

diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 7e67dcb3d4f..05950a78356 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -31,21 +31,6 @@ config HID If unsure, say Y. -config HID_DEBUG - bool "HID debugging support" - default y - depends on HID - ---help--- - This option lets the HID layer output diagnostics about its internal - state, resolve HID usages, dump HID fields, etc. Individual HID drivers - use this debugging facility to output information about individual HID - devices, etc. - - This feature is useful for those who are either debugging the HID parser - or any HID hardware device. - - If unsure, say Y.
- config HIDRAW bool "/dev/hidraw raw HID device support" depends on HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index 1f7cb0fd450..cf3687d1ed6 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -3,9 +3,12 @@ # hid-objs := hid-core.o hid-input.o +ifdef CONFIG_DEBUG_FS + hid-objs += hid-debug.o +endif + obj-$(CONFIG_HID) += hid.o -hid-$(CONFIG_HID_DEBUG) += hid-debug.o hid-$(CONFIG_HIDRAW) += hidraw.o hid-logitech-objs := hid-lg.o diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 8551693d645..d4317db85b5 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -44,12 +44,10 @@ #define DRIVER_DESC "HID core driver" #define DRIVER_LICENSE "GPL" -#ifdef CONFIG_HID_DEBUG int hid_debug = 0; module_param_named(debug, hid_debug, int, 0600); MODULE_PARM_DESC(debug, "HID debugging (0=off, 1=probing info, 2=continuous data dumping)"); EXPORT_SYMBOL_GPL(hid_debug); -#endif /* * Register a new report for a device. @@ -987,7 +985,6 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) if (offset >= field->report_count) { dbg_hid("offset (%d) exceeds report_count (%d)\n", offset, field->report_count); - hid_dump_field(field, 8); return -1; } if (field->logical_minimum < 0) { @@ -1721,6 +1718,8 @@ int hid_add_device(struct hid_device *hdev) if (!ret) hdev->status |= HID_STAT_ADDED; + hid_debug_register(hdev, dev_name(&hdev->dev)); + return ret; } EXPORT_SYMBOL_GPL(hid_add_device); @@ -1768,6 +1767,7 @@ static void hid_remove_device(struct hid_device *hdev) { if (hdev->status & HID_STAT_ADDED) { device_del(&hdev->dev); + hid_debug_unregister(hdev); hdev->status &= ~HID_STAT_ADDED; } } @@ -1843,6 +1843,10 @@ static int __init hid_init(void) { int ret; + if (hid_debug) + printk(KERN_WARNING "HID: hid_debug parameter has been deprecated. " + "Debugging data are now provided via debugfs\n"); + ret = bus_register(&hid_bus_type); if (ret) { printk(KERN_ERR "HID: can't register hid bus\n"); @@ -1853,6 +1857,8 @@ static int __init hid_init(void) if (ret) goto err_bus; + hid_debug_init(); + return 0; err_bus: bus_unregister(&hid_bus_type); @@ -1862,6 +1868,7 @@ err: static void __exit hid_exit(void) { + hid_debug_exit(); hidraw_exit(); bus_unregister(&hid_bus_type); } diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 47ac1a7d66e..067e173aa3e 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -1,9 +1,9 @@ /* * (c) 1999 Andreas Gal * (c) 2000-2001 Vojtech Pavlik - * (c) 2007 Jiri Kosina + * (c) 2007-2009 Jiri Kosina * - * Some debug stuff for the HID parser. 
+ * HID debugging support */ /* @@ -26,9 +26,13 @@ * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/hid.h> #include <linux/hid-debug.h> +static struct dentry *hid_debug_root; struct hid_usage_entry { unsigned page; unsigned usage; const char *description; }; @@ -331,72 +335,83 @@ static const struct hid_usage_entry hid_usage_table[] = { { 0, 0, NULL } }; -static void resolv_usage_page(unsigned page) { +static void resolv_usage_page(unsigned page, struct seq_file *f) { const struct hid_usage_entry *p; for (p = hid_usage_table; p->description; p++) if (p->page == page) { - printk("%s", p->description); + if (!f) + printk("%s", p->description); + else + seq_printf(f, "%s", p->description); return; } - printk("%04x", page); + if (!f) + printk("%04x", page); + else + seq_printf(f, "%04x", page); } -void hid_resolv_usage(unsigned usage) { +void hid_resolv_usage(unsigned usage, struct seq_file *f) { const struct hid_usage_entry *p; - if (!hid_debug) - return; - - resolv_usage_page(usage >> 16); - printk("."); + resolv_usage_page(usage >> 16, f); + if (!f) + printk("."); + else + seq_printf(f, "."); for (p = hid_usage_table; p->description; p++) if (p->page == (usage >> 16)) { for(++p; p->description && p->usage != 0; p++) if (p->usage == (usage & 0xffff)) { - printk("%s", p->description); + if (!f) + printk("%s", p->description); + else + seq_printf(f, + "%s", + p->description); return; } break; } - printk("%04x", usage & 0xffff); + if (!f) + printk("%04x", usage & 0xffff); + else + seq_printf(f, "%04x", usage & 0xffff); } EXPORT_SYMBOL_GPL(hid_resolv_usage); -static void tab(int n) { - printk(KERN_DEBUG "%*s", n, ""); +static void tab(int n, struct seq_file *f) { + seq_printf(f, "%*s", n, ""); } -void hid_dump_field(struct hid_field *field, int n) { +void hid_dump_field(struct hid_field *field, int n, struct seq_file *f) { int j; - if (!hid_debug) - return; - if (field->physical) { - tab(n); - printk("Physical("); - hid_resolv_usage(field->physical); printk(")\n"); + tab(n, f); + seq_printf(f, "Physical("); + hid_resolv_usage(field->physical, f); seq_printf(f, ")\n"); } if (field->logical) { - tab(n); - printk("Logical("); - hid_resolv_usage(field->logical); printk(")\n"); + tab(n, f); + seq_printf(f, "Logical("); + hid_resolv_usage(field->logical, f); seq_printf(f, ")\n"); } - tab(n); printk("Usage(%d)\n", field->maxusage); + tab(n, f); seq_printf(f, "Usage(%d)\n", field->maxusage); for (j = 0; j < field->maxusage; j++) { - tab(n+2); hid_resolv_usage(field->usage[j].hid); printk("\n"); + tab(n+2, f); hid_resolv_usage(field->usage[j].hid, f); seq_printf(f, "\n"); } if (field->logical_minimum != field->logical_maximum) { - tab(n); printk("Logical Minimum(%d)\n", field->logical_minimum); - tab(n); printk("Logical Maximum(%d)\n", field->logical_maximum); + tab(n, f); seq_printf(f, "Logical Minimum(%d)\n", field->logical_minimum); + tab(n, f); seq_printf(f, "Logical Maximum(%d)\n", field->logical_maximum); } if (field->physical_minimum != field->physical_maximum) { - tab(n); printk("Physical Minimum(%d)\n", field->physical_minimum); - tab(n); printk("Physical Maximum(%d)\n", field->physical_maximum); + tab(n, f); seq_printf(f, "Physical Minimum(%d)\n", field->physical_minimum); + tab(n, f); seq_printf(f, "Physical Maximum(%d)\n", field->physical_maximum); } if (field->unit_exponent) { - tab(n); printk("Unit Exponent(%d)\n", field->unit_exponent); + tab(n, f); seq_printf(f, "Unit Exponent(%d)\n", field->unit_exponent); } if (field->unit) { static const char *systems[5] = { "None", "SI Linear", "SI Rotation", "English Linear", "English Rotation" }; @@ -417,77 +432,75 @@ void hid_dump_field(struct hid_field *field, int n) { data >>= 4; if(sys > 4) { - tab(n); printk("Unit(Invalid)\n"); + tab(n, f); seq_printf(f, "Unit(Invalid)\n"); } else { int earlier_unit = 0; - tab(n); printk("Unit(%s : ", systems[sys]); + tab(n, f); seq_printf(f, "Unit(%s : ", systems[sys]); for (i=1 ; i<sizeof(__u32)*2 ; i++) { char nibble = data & 0xf; data >>= 4; if (nibble != 0) { if(earlier_unit++ > 0) - printk("*"); - printk("%s", units[sys][i]); + seq_printf(f, "*"); + seq_printf(f, "%s", units[sys][i]); if(nibble != 1) { /* This is a _signed_ nibble(!) */ int val = nibble & 0x7; if(nibble & 0x08) val = -((0x7 & ~val) +1); - printk("^%d", val); + seq_printf(f, "^%d", val); } } } - printk(")\n"); + seq_printf(f, ")\n"); } } - tab(n); printk("Report Size(%u)\n", field->report_size); - tab(n); printk("Report Count(%u)\n", field->report_count); - tab(n); printk("Report Offset(%u)\n", field->report_offset); + tab(n, f); seq_printf(f, "Report Size(%u)\n", field->report_size); + tab(n, f); seq_printf(f, "Report Count(%u)\n", field->report_count); + tab(n, f); seq_printf(f, "Report Offset(%u)\n", field->report_offset); - tab(n); printk("Flags( "); + tab(n, f); seq_printf(f, "Flags( "); j = field->flags; - printk("%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : ""); - printk("%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array "); - printk("%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute "); - printk("%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : ""); - printk("%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : ""); - printk("%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : ""); - printk("%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : ""); - printk("%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : ""); - printk("%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ? "BufferedByte " : ""); - printk(")\n"); + seq_printf(f, "%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array "); + seq_printf(f, "%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute "); + seq_printf(f, "%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ?
"BufferedByte " : ""); + seq_printf(f, ")\n"); } EXPORT_SYMBOL_GPL(hid_dump_field); -void hid_dump_device(struct hid_device *device) { +void hid_dump_device(struct hid_device *device, struct seq_file *f) +{ struct hid_report_enum *report_enum; struct hid_report *report; struct list_head *list; unsigned i,k; static const char *table[] = {"INPUT", "OUTPUT", "FEATURE"}; - if (!hid_debug) - return; - for (i = 0; i < HID_REPORT_TYPES; i++) { report_enum = device->report_enum + i; list = report_enum->report_list.next; while (list != &report_enum->report_list) { report = (struct hid_report *) list; - tab(2); - printk("%s", table[i]); + tab(2, f); + seq_printf(f, "%s", table[i]); if (report->id) - printk("(%d)", report->id); - printk("[%s]", table[report->type]); - printk("\n"); + seq_printf(f, "(%d)", report->id); + seq_printf(f, "[%s]", table[report->type]); + seq_printf(f, "\n"); for (k = 0; k < report->maxfield; k++) { - tab(4); - printk("Field(%d)\n", k); - hid_dump_field(report->field[k], 6); + tab(4, f); + seq_printf(f, "Field(%d)\n", k); + hid_dump_field(report->field[k], 6, f); } list = list->next; } @@ -500,7 +513,7 @@ void hid_dump_input(struct hid_usage *usage, __s32 value) { return; printk(KERN_DEBUG "hid-debug: input "); - hid_resolv_usage(usage->hid); + hid_resolv_usage(usage->hid, NULL); printk(" = %d\n", value); } EXPORT_SYMBOL_GPL(hid_dump_input); @@ -767,12 +780,84 @@ static const char **names[EV_MAX + 1] = { [EV_SND] = sounds, [EV_REP] = repeats, }; -void hid_resolv_event(__u8 type, __u16 code) { - - if (!hid_debug) - return; +void hid_resolv_event(__u8 type, __u16 code, struct seq_file *f) { - printk("%s.%s", events[type] ? events[type] : "?", + seq_printf(f, "%s.%s", events[type] ? events[type] : "?", names[type] ? (names[type][code] ? 
names[type][code] : "?") : "?"); } -EXPORT_SYMBOL_GPL(hid_resolv_event); + +void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f) +{ + int i, j, k; + struct hid_report *report; + struct hid_usage *usage; + + for (k = HID_INPUT_REPORT; k <= HID_OUTPUT_REPORT; k++) { + list_for_each_entry(report, &hid->report_enum[k].report_list, list) { + for (i = 0; i < report->maxfield; i++) { + for ( j = 0; j < report->field[i]->maxusage; j++) { + usage = report->field[i]->usage + j; + hid_resolv_usage(usage->hid, f); + seq_printf(f, " ---> "); + hid_resolv_event(usage->type, usage->code, f); + seq_printf(f, "\n"); + } + } + } + } + +} + +static int hid_debug_rdesc_show(struct seq_file *f, void *p) +{ + struct hid_device *hdev = f->private; + int i; + + /* dump HID report descriptor */ + for (i = 0; i < hdev->rsize; i++) + seq_printf(f, "%02x ", hdev->rdesc[i]); + seq_printf(f, "\n\n"); + + /* dump parsed data and input mappings */ + hid_dump_device(hdev, f); + seq_printf(f, "\n"); + hid_dump_input_mapping(hdev, f); + + return 0; +} + +static int hid_debug_rdesc_open(struct inode *inode, struct file *file) +{ + return single_open(file, hid_debug_rdesc_show, inode->i_private); +} + +static const struct file_operations hid_debug_rdesc_fops = { + .open = hid_debug_rdesc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void hid_debug_register(struct hid_device *hdev, const char *name) +{ + hdev->debug_dir = debugfs_create_dir(name, hid_debug_root); + hdev->debug_rdesc = debugfs_create_file("rdesc", 0400, + hdev->debug_dir, hdev, &hid_debug_rdesc_fops); +} + +void hid_debug_unregister(struct hid_device *hdev) +{ + debugfs_remove(hdev->debug_rdesc); + debugfs_remove(hdev->debug_dir); +} + +void hid_debug_init(void) +{ + hid_debug_root = debugfs_create_dir("hid", NULL); +} + +void hid_debug_exit(void) +{ + debugfs_remove_recursive(hid_debug_root); +} + diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 7f183b7147e..5862b0f3b55 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -159,17 +159,12 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel field->hidinput = hidinput; - dbg_hid("Mapping: "); - hid_resolv_usage(usage->hid); - dbg_hid_line(" ---> "); - if (field->flags & HID_MAIN_ITEM_CONSTANT) goto ignore; /* only LED usages are supported in output fields */ if (field->report_type == HID_OUTPUT_REPORT && (usage->hid & HID_USAGE_PAGE) != HID_UP_LED) { - dbg_hid_line(" [non-LED output field] "); goto ignore; } @@ -561,15 +556,9 @@ mapped: set_bit(MSC_SCAN, input->mscbit); } - hid_resolv_event(usage->type, usage->code); - - dbg_hid_line("\n"); - - return; - ignore: - dbg_hid_line("IGNORED\n"); return; + } void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index ac8049b5f1e..708aa52d075 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -4,8 +4,8 @@ * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik * Copyright (c) 2005 Michael Haboustak for Concept2, Inc - * Copyright (c) 2006-2008 Jiri Kosina * Copyright (c) 2007-2008 Oliver Neukum + * Copyright (c) 2006-2009 Jiri Kosina */ /* @@ -886,11 +886,6 @@ static int usbhid_parse(struct hid_device *hid) goto err; } - dbg_hid("report descriptor (size %u, read %d) = ", rsize, n); - for (n = 0; n < rsize; n++) - dbg_hid_line(" %02x", (unsigned char) rdesc[n]); - 
dbg_hid_line("\n"); - ret = hid_parse_report(hid, rdesc, rsize); kfree(rdesc); if (ret) { @@ -1005,7 +1000,6 @@ static int usbhid_start(struct hid_device *hid) usbhid->urbctrl->transfer_flags |= (URB_NO_TRANSFER_DMA_MAP | URB_NO_SETUP_DMA_MAP); usbhid_init_reports(hid); - hid_dump_device(hid); set_bit(HID_STARTED, &usbhid->iofl); diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index 50d568ec178..516e12c3323 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -2,7 +2,7 @@ #define __HID_DEBUG_H /* - * Copyright (c) 2007 Jiri Kosina + * Copyright (c) 2007-2009 Jiri Kosina */ /* @@ -22,24 +22,30 @@ * */ -#ifdef CONFIG_HID_DEBUG +#ifdef CONFIG_DEBUG_FS void hid_dump_input(struct hid_usage *, __s32); -void hid_dump_device(struct hid_device *); -void hid_dump_field(struct hid_field *, int); -void hid_resolv_usage(unsigned); -void hid_resolv_event(__u8, __u16); +void hid_dump_device(struct hid_device *, struct seq_file *); +void hid_dump_field(struct hid_field *, int, struct seq_file *); +void hid_resolv_usage(unsigned, struct seq_file *); +void hid_debug_register(struct hid_device *, const char *); +void hid_debug_unregister(struct hid_device *); +void hid_debug_init(void); +void hid_debug_exit(void); #else -#define hid_dump_input(a,b) do { } while (0) -#define hid_dump_device(c) do { } while (0) -#define hid_dump_field(a,b) do { } while (0) -#define hid_resolv_usage(a) do { } while (0) -#define hid_resolv_event(a,b) do { } while (0) - -#endif /* CONFIG_HID_DEBUG */ +#define hid_dump_input(a,b) do { } while (0) +#define hid_dump_device(c) do { } while (0) +#define hid_dump_field(a,b) do { } while (0) +#define hid_resolv_usage(a) do { } while (0) +#define hid_resolv_event(a,b) do { } while (0) +#define hid_debug_register(a, b) do { } while (0) +#define hid_debug_unregister(a) do { } while (0) +#define hid_debug_init() do { } while (0) +#define hid_debug_exit() do { } while (0) +#endif #endif diff --git a/include/linux/hid.h b/include/linux/hid.h index a72876e4358..da09ab140ef 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -451,6 +451,10 @@ struct hid_device { /* device report descriptor */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ + /* debugfs */ + struct dentry *debug_dir; + struct dentry *debug_rdesc; + void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ -- cgit v1.2.3-70-g09d2

From cd667ce24796700e1a0e6e7528efc61c96ff832e Mon Sep 17 00:00:00 2001
From: Jiri Kosina
Date: Fri, 12 Jun 2009 15:20:57 +0200
Subject: HID: use debugfs for events/reports dumping

This is a followup patch to the one implementing the rdesc representation in debugfs rather than having it depend on the compile-time CONFIG_HID_DEBUG setting. The API of the appropriate formatting functions is slightly modified: if they are passed a seq_file pointer, the one-shot output mode for the 'rdesc' file is used, and the message is formatted into the corresponding seq_file immediately. Otherwise the called function allocates a new buffer, formats the text into the buffer and returns a pointer to it, so that it can be queued into the ring buffer of the processes blocked waiting for input on the 'events' file in debugfs.

The 'debug' parameter to the 'hid' module is now used solely for the purpose of internal driver state debugging (parser, transport, etc.).
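For illustration, a minimal userspace consumer of the 'events' file could look like the sketch below. The device directory name is a made-up example; real names depend on the attached device, so pick an existing entry from /sys/kernel/debug/hid/. The file is mode 0400, so this needs root.

/* hid-events-cat.c: minimal sketch of a reader for the 'events'
 * debugfs file introduced by this patch. Blocks until the device
 * sends reports, then dumps the formatted text to stdout. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[512];
	ssize_t n;
	/* example path; substitute your device's directory */
	int fd = open("/sys/kernel/debug/hid/0003:046D:C016.0001/events",
		      O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		if (poll(&pfd, 1, -1) < 0)	/* wait for new report data */
			break;
		n = read(fd, buf, sizeof(buf));	/* drain the ring buffer */
		if (n <= 0)
			break;
		fwrite(buf, 1, n, stdout);
	}
	close(fd);
	return 0;
}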
Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 42 ++++++-- drivers/hid/hid-debug.c | 239 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/hid-debug.h | 18 +++- include/linux/hid.h | 26 ++--- 4 files changed, 276 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index d4317db85b5..449bd747d11 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -46,7 +46,7 @@ int hid_debug = 0; module_param_named(debug, hid_debug, int, 0600); -MODULE_PARM_DESC(debug, "HID debugging (0=off, 1=probing info, 2=continuous data dumping)"); +MODULE_PARM_DESC(debug, "toggle HID debugging messages"); EXPORT_SYMBOL_GPL(hid_debug); /* @@ -859,7 +859,7 @@ static void hid_process_event(struct hid_device *hid, struct hid_field *field, struct hid_driver *hdrv = hid->driver; int ret; - hid_dump_input(usage, value); + hid_dump_input(hid, usage, value); if (hdrv && hdrv->event && hid_match_usage(hid, usage)) { ret = hdrv->event(hid, field, usage, value); @@ -981,7 +981,7 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) { unsigned size = field->report_size; - hid_dump_input(field->usage + offset, value); + hid_dump_input(field->report->device, field->usage + offset, value); if (offset >= field->report_count) { dbg_hid("offset (%d) exceeds report_count (%d)\n", offset, field->report_count); @@ -1075,6 +1075,7 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_driver *hdrv = hid->driver; struct hid_report *report; + char *buf; unsigned int i; int ret; @@ -1086,18 +1087,36 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i return -1; } - dbg_hid("report (size %u) (%snumbered)\n", size, report_enum->numbered ? "" : "un"); + buf = kmalloc(sizeof(char) * HID_DEBUG_BUFSIZE, + interrupt ? GFP_ATOMIC : GFP_KERNEL); + + if (!buf) { + report = hid_get_report(report_enum, data); + goto nomem; + } + + snprintf(buf, HID_DEBUG_BUFSIZE - 1, + "\nreport (size %u) (%snumbered)\n", size, report_enum->numbered ? "" : "un"); + hid_debug_event(hid, buf); report = hid_get_report(report_enum, data); if (!report) return -1; /* dump the report */ - dbg_hid("report %d (size %u) = ", report->id, size); - for (i = 0; i < size; i++) - dbg_hid_line(" %02x", data[i]); - dbg_hid_line("\n"); + snprintf(buf, HID_DEBUG_BUFSIZE - 1, + "report %d (size %u) = ", report->id, size); + hid_debug_event(hid, buf); + for (i = 0; i < size; i++) { + snprintf(buf, HID_DEBUG_BUFSIZE - 1, + " %02x", data[i]); + hid_debug_event(hid, buf); + } + hid_debug_event(hid, "\n"); + + kfree(buf); +nomem: if (hdrv && hdrv->raw_event && hid_match_report(hid, report)) { ret = hdrv->raw_event(hid, report, data, size); if (ret != 0) @@ -1756,6 +1775,9 @@ struct hid_device *hid_allocate_device(void) for (i = 0; i < HID_REPORT_TYPES; i++) INIT_LIST_HEAD(&hdev->report_enum[i].report_list); + init_waitqueue_head(&hdev->debug_wait); + INIT_LIST_HEAD(&hdev->debug_list); + return hdev; err: put_device(&hdev->dev); @@ -1844,8 +1866,8 @@ static int __init hid_init(void) int ret; if (hid_debug) - printk(KERN_WARNING "HID: hid_debug parameter has been deprecated. 
" - "Debugging data are now provided via debugfs\n"); + printk(KERN_WARNING "HID: hid_debug is now used solely for parser and driver debugging.\n" + "HID: debugfs is now used for inspecting the device (report descriptor, reports)\n"); ret = bus_register(&hid_bus_type); if (ret) { diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 067e173aa3e..a331a1821e8 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -28,6 +28,10 @@ #include #include +#include +#include +#include + #include #include @@ -335,49 +339,86 @@ static const struct hid_usage_entry hid_usage_table[] = { { 0, 0, NULL } }; -static void resolv_usage_page(unsigned page, struct seq_file *f) { +/* Either output directly into simple seq_file, or (if f == NULL) + * allocate a separate buffer that will then be passed to the 'events' + * ringbuffer. + * + * This is because these functions can be called both for "one-shot" + * "rdesc" while resolving, or for blocking "events". + * + * This holds both for resolv_usage_page() and hid_resolv_usage(). + */ +static char *resolv_usage_page(unsigned page, struct seq_file *f) { const struct hid_usage_entry *p; + char *buf = NULL; + + if (!f) { + buf = kzalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_ATOMIC); + if (!buf) + return ERR_PTR(-ENOMEM); + } for (p = hid_usage_table; p->description; p++) if (p->page == page) { - if (!f) - printk("%s", p->description); - else + if (!f) { + snprintf(buf, HID_DEBUG_BUFSIZE, "%s", + p->description); + return buf; + } + else { seq_printf(f, "%s", p->description); - return; + return NULL; + } } if (!f) - printk("%04x", page); + snprintf(buf, HID_DEBUG_BUFSIZE, "%04x", page); else seq_printf(f, "%04x", page); + return buf; } -void hid_resolv_usage(unsigned usage, struct seq_file *f) { +char *hid_resolv_usage(unsigned usage, struct seq_file *f) { const struct hid_usage_entry *p; + char *buf = NULL; + int len = 0; - resolv_usage_page(usage >> 16, f); - if (!f) - printk("."); - else + buf = resolv_usage_page(usage >> 16, f); + if (IS_ERR(buf)) { + printk(KERN_ERR "error allocating HID debug buffer\n"); + return NULL; + } + + + if (!f) { + len = strlen(buf); + snprintf(buf+len, max(0, HID_DEBUG_BUFSIZE - len), "."); + len++; + } + else { seq_printf(f, "."); + } for (p = hid_usage_table; p->description; p++) if (p->page == (usage >> 16)) { for(++p; p->description && p->usage != 0; p++) if (p->usage == (usage & 0xffff)) { if (!f) - printk("%s", p->description); + snprintf(buf + len, + max(0,HID_DEBUG_BUFSIZE - len - 1), + "%s", p->description); else seq_printf(f, "%s", p->description); - return; + return buf; } break; } if (!f) - printk("%04x", usage & 0xffff); + snprintf(buf + len, max(0, HID_DEBUG_BUFSIZE - len - 1), + "%04x", usage & 0xffff); else seq_printf(f, "%04x", usage & 0xffff); + return buf; } EXPORT_SYMBOL_GPL(hid_resolv_usage); @@ -508,13 +549,37 @@ void hid_dump_device(struct hid_device *device, struct seq_file *f) } EXPORT_SYMBOL_GPL(hid_dump_device); -void hid_dump_input(struct hid_usage *usage, __s32 value) { - if (hid_debug < 2) +/* enqueue string to 'events' ring buffer */ +void hid_debug_event(struct hid_device *hdev, char *buf) +{ + int i; + struct hid_debug_list *list; + + list_for_each_entry(list, &hdev->debug_list, node) { + for (i = 0; i <= strlen(buf); i++) + list->hid_debug_buf[(list->tail + i) % (HID_DEBUG_BUFSIZE - 1)] = + buf[i]; + list->tail = (list->tail + i) % (HID_DEBUG_BUFSIZE - 1); + } +} +EXPORT_SYMBOL_GPL(hid_debug_event); + +void hid_dump_input(struct hid_device *hdev, struct hid_usage *usage, 
__s32 value) +{ + char *buf; + int len; + + buf = hid_resolv_usage(usage->hid, NULL); + if (!buf) return; + len = strlen(buf); + snprintf(buf + len, HID_DEBUG_BUFSIZE - len - 1, " = %d\n", value); + + hid_debug_event(hdev, buf); + + kfree(buf); + wake_up_interruptible(&hdev->debug_wait); - printk(KERN_DEBUG "hid-debug: input "); - hid_resolv_usage(usage->hid, NULL); - printk(" = %d\n", value); } EXPORT_SYMBOL_GPL(hid_dump_input); @@ -808,6 +873,7 @@ void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f) } + static int hid_debug_rdesc_show(struct seq_file *f, void *p) { struct hid_device *hdev = f->private; @@ -831,6 +897,126 @@ static int hid_debug_rdesc_open(struct inode *inode, struct file *file) return single_open(file, hid_debug_rdesc_show, inode->i_private); } +static int hid_debug_events_open(struct inode *inode, struct file *file) +{ + int err = 0; + struct hid_debug_list *list; + + if (!(list = kzalloc(sizeof(struct hid_debug_list), GFP_KERNEL))) { + err = -ENOMEM; + goto out; + } + + if (!(list->hid_debug_buf = kzalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_KERNEL))) { + err = -ENOMEM; + goto out; + } + list->hdev = (struct hid_device *) inode->i_private; + file->private_data = list; + mutex_init(&list->read_mutex); + + list_add_tail(&list->node, &list->hdev->debug_list); + +out: + return err; +} + +static ssize_t hid_debug_events_read(struct file *file, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct hid_debug_list *list = file->private_data; + int ret = 0, len; + DECLARE_WAITQUEUE(wait, current); + + while (ret == 0) { + mutex_lock(&list->read_mutex); + if (list->head == list->tail) { + add_wait_queue(&list->hdev->debug_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + while (list->head == list->tail) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (!list->hdev || !list->hdev->debug) { + ret = -EIO; + break; + } + + /* allow O_NONBLOCK from other threads */ + mutex_unlock(&list->read_mutex); + schedule(); + mutex_lock(&list->read_mutex); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&list->hdev->debug_wait, &wait); + } + + if (ret) + goto out; + + /* pass the ringbuffer contents to userspace */ +copy_rest: + if (list->tail == list->head) + goto out; + if (list->tail > list->head) { + len = list->tail - list->head; + + if (copy_to_user(buffer + ret, &list->hid_debug_buf[list->head], len)) { + ret = -EFAULT; + goto out; + } + ret += len; + list->head += len; + } else { + len = HID_DEBUG_BUFSIZE - list->head; + + if (copy_to_user(buffer, &list->hid_debug_buf[list->head], len)) { + ret = -EFAULT; + goto out; + } + list->head = 0; + ret += len; + goto copy_rest; + } + + } +out: + mutex_unlock(&list->read_mutex); + return ret; +} + +static unsigned int hid_debug_events_poll(struct file *file, poll_table *wait) +{ + struct hid_debug_list *list = file->private_data; + + poll_wait(file, &list->hdev->debug_wait, wait); + if (list->head != list->tail) + return POLLIN | POLLRDNORM; + if (!list->hdev->debug) + return POLLERR | POLLHUP; + return 0; +} + +static int hid_debug_events_release(struct inode *inode, struct file *file) +{ + struct hid_debug_list *list = file->private_data; + + list_del(&list->node); + kfree(list->hid_debug_buf); + kfree(list); + + return 0; +} + static const struct file_operations hid_debug_rdesc_fops = { .open = hid_debug_rdesc_open, .read = seq_read, @@ -838,16 +1024,31 @@ 
static const struct file_operations hid_debug_rdesc_fops = { .release = single_release, }; +static const struct file_operations hid_debug_events_fops = { + .owner = THIS_MODULE, + .open = hid_debug_events_open, + .read = hid_debug_events_read, + .poll = hid_debug_events_poll, + .release = hid_debug_events_release, +}; + + void hid_debug_register(struct hid_device *hdev, const char *name) { hdev->debug_dir = debugfs_create_dir(name, hid_debug_root); hdev->debug_rdesc = debugfs_create_file("rdesc", 0400, hdev->debug_dir, hdev, &hid_debug_rdesc_fops); + hdev->debug_events = debugfs_create_file("events", 0400, + hdev->debug_dir, hdev, &hid_debug_events_fops); + hdev->debug = 1; } void hid_debug_unregister(struct hid_device *hdev) { + hdev->debug = 0; + wake_up_interruptible(&hdev->debug_wait); debugfs_remove(hdev->debug_rdesc); + debugfs_remove(hdev->debug_events); debugfs_remove(hdev->debug_dir); } diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index 516e12c3323..ec08ac1ad68 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -24,14 +24,27 @@ #ifdef CONFIG_DEBUG_FS -void hid_dump_input(struct hid_usage *, __s32); +void hid_dump_input(struct hid_device *, struct hid_usage *, __s32); void hid_dump_device(struct hid_device *, struct seq_file *); void hid_dump_field(struct hid_field *, int, struct seq_file *); -void hid_resolv_usage(unsigned, struct seq_file *); +char *hid_resolv_usage(unsigned, struct seq_file *); void hid_debug_register(struct hid_device *, const char *); void hid_debug_unregister(struct hid_device *); void hid_debug_init(void); void hid_debug_exit(void); +void hid_debug_event(struct hid_device *, char *); + +#define HID_DEBUG_BUFSIZE 512 + +struct hid_debug_list { + char *hid_debug_buf; + int head; + int tail; + struct fasync_struct *fasync; + struct hid_device *hdev; + struct list_head node; + struct mutex read_mutex; +}; #else @@ -44,6 +57,7 @@ void hid_debug_exit(void); #define hid_debug_unregister(a) do { } while (0) #define hid_debug_init() do { } while (0) #define hid_debug_exit() do { } while (0) +#define hid_debug_event(a,b) do { } while (0) #endif diff --git a/include/linux/hid.h b/include/linux/hid.h index da09ab140ef..60fa52913f8 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -451,10 +451,6 @@ struct hid_device { /* device report descriptor */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ - /* debugfs */ - struct dentry *debug_dir; - struct dentry *debug_rdesc; - void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ @@ -468,6 +464,14 @@ struct hid_device { /* device report descriptor */ /* handler for raw output data, used by hidraw */ int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t); + + /* debugging support via debugfs */ + unsigned short debug; + struct dentry *debug_dir; + struct dentry *debug_rdesc; + struct dentry *debug_events; + struct list_head debug_list; + wait_queue_head_t debug_wait; }; static inline void *hid_get_drvdata(struct hid_device *hdev) @@ -625,9 +629,7 @@ struct hid_ll_driver { /* HID core API */ -#ifdef CONFIG_HID_DEBUG extern int hid_debug; -#endif extern int hid_add_device(struct hid_device *); extern void hid_destroy_device(struct hid_device *); @@ -783,21 +785,9 @@ int hid_pidff_init(struct hid_device *hid); #define hid_pidff_init NULL #endif -#ifdef CONFIG_HID_DEBUG #define dbg_hid(format, arg...) 
if (hid_debug) \ printk(KERN_DEBUG "%s: " format ,\ __FILE__ , ## arg) -#define dbg_hid_line(format, arg...) if (hid_debug) \ - printk(format, ## arg) -#else -static inline int __attribute__((format(printf, 1, 2))) -dbg_hid(const char *fmt, ...) -{ - return 0; -} -#define dbg_hid_line dbg_hid -#endif /* HID_DEBUG */ - #define err_hid(format, arg...) printk(KERN_ERR "%s: " format "\n" , \ __FILE__ , ## arg) #endif /* HID_FF */ -- cgit v1.2.3-70-g09d2 From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 15 Jun 2009 17:17:47 +0200 Subject: sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? 
In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, &param); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param); Signed-off-by: Lennart Poettering Acked-by: Peter Zijlstra LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4896fdfec91..d4a2c6662f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 #ifdef __KERNEL__ @@ -1209,6 +1211,10 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + + /* Revert to default priority/policy when forking */ + unsigned sched_reset_on_fork:1; + pid_t pid; pid_t tgid; diff --git a/kernel/sched.c b/kernel/sched.c index 8ec9d13140b..32e6ede8525 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,12 +2613,28 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Make sure we do not leak PI boosting priority to the child: + * Revert to default priority/policy on fork if requested. Make sure we + * do not leak PI boosting priority to the child. */ - p->prio = current->normal_prio; + if (current->sched_reset_on_fork && + (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) + p->policy = SCHED_NORMAL; + + if (current->sched_reset_on_fork && + (current->normal_prio < DEFAULT_PRIO)) + p->prio = DEFAULT_PRIO; + else + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + /* + * We don't need the reset flag anymore after the fork.
It has + fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -6094,17 +6110,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6148,6 +6172,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6191,6 +6219,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6307,14 +6337,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ -- cgit v1.2.3-70-g09d2

From 5b739ef8a4e8cf5201d21abff897e292c232477b Mon Sep 17 00:00:00 2001
From: Neil Horman
Date: Thu, 18 Jun 2009 19:50:21 +0800
Subject: random: Add optional continuous repetition test to entropy store based rngs

FIPS-140 requires that all random number generators implement continuous self tests in which each extracted block of data is compared against the last block for repetition. The ansi_cprng implements such a test, but it would be nice if the hw RNGs did the same thing. Obviously it's not something that's always needed, but it seems like it would be a nice feature to have on occasion. I've written the below patch which allows individual entropy stores to be flagged as desiring a continuous test to be run on them as data is extracted. By default this option is off, but it is enabled in the event that fips mode is selected during bootup.
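The check itself is tiny; the standalone sketch below models the same continuous-test logic that the patch adds to extract_entropy(). The struct and function names here are illustrative, not the kernel's; only EXTRACT_SIZE and the compare-then-remember behavior mirror the patch (the kernel panics on a repeat, this sketch just reports it).

/* Illustrative model of the FIPS-style continuous test: every freshly
 * extracted block is compared with the previous one, and a repeated
 * block is treated as a fatal RNG failure. */
#include <string.h>

#define EXTRACT_SIZE 10		/* extraction block size used by random.c */

struct entropy_store_model {
	unsigned char *last_data;	/* NULL unless fips mode enabled it */
};

/* returns 0 if the block passes, -1 on duplicated output */
static int continuous_test(struct entropy_store_model *r,
			   const unsigned char *block)
{
	if (!r->last_data)
		return 0;				/* test not enabled */
	if (!memcmp(block, r->last_data, EXTRACT_SIZE))
		return -1;				/* repetition: fail hard */
	memcpy(r->last_data, block, EXTRACT_SIZE);	/* remember this block */
	return 0;
}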
Signed-off-by: Neil Horman Acked-by: Matt Mackall Signed-off-by: Herbert Xu --- crypto/internal.h | 7 +------ drivers/char/random.c | 14 ++++++++++++++ include/linux/fips.h | 10 ++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 include/linux/fips.h (limited to 'include/linux') diff --git a/crypto/internal.h b/crypto/internal.h index 113579a82df..95baaea21fb 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -25,12 +25,7 @@ #include #include #include - -#ifdef CONFIG_CRYPTO_FIPS -extern int fips_enabled; -#else -#define fips_enabled 0 -#endif +#include /* Crypto notification events. */ enum { diff --git a/drivers/char/random.c b/drivers/char/random.c index 8c7444857a4..d8a9255e1a3 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -240,6 +240,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_HARDIRQS # include @@ -413,6 +414,7 @@ struct entropy_store { unsigned add_ptr; int entropy_count; int input_rotate; + __u8 *last_data; }; static __u32 input_pool_data[INPUT_POOL_WORDS]; @@ -852,12 +854,21 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, { ssize_t ret = 0, i; __u8 tmp[EXTRACT_SIZE]; + unsigned long flags; xfer_secondary_pool(r, nbytes); nbytes = account(r, nbytes, min, reserved); while (nbytes) { extract_buf(r, tmp); + + if (r->last_data) { + spin_lock_irqsave(&r->lock, flags); + if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) + panic("Hardware RNG duplicated output!\n"); + memcpy(r->last_data, tmp, EXTRACT_SIZE); + spin_unlock_irqrestore(&r->lock, flags); + } i = min_t(int, nbytes, EXTRACT_SIZE); memcpy(buf, tmp, i); nbytes -= i; @@ -940,6 +951,9 @@ static void init_std_data(struct entropy_store *r) now = ktime_get_real(); mix_pool_bytes(r, &now, sizeof(now)); mix_pool_bytes(r, utsname(), sizeof(*(utsname()))); + /* Enable continuous test in fips mode */ + if (fips_enabled) + r->last_data = kmalloc(EXTRACT_SIZE, GFP_KERNEL); } static int rand_initialize(void) diff --git a/include/linux/fips.h b/include/linux/fips.h new file mode 100644 index 00000000000..f8fb07b0b6b --- /dev/null +++ b/include/linux/fips.h @@ -0,0 +1,10 @@ +#ifndef _FIPS_H +#define _FIPS_H + +#ifdef CONFIG_CRYPTO_FIPS +extern int fips_enabled; +#else +#define fips_enabled 0 +#endif + +#endif -- cgit v1.2.3-70-g09d2 From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. 
These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c39..05d86407188 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..328d2f8b8c3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05d..a774c2acbe6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d6..f4a3cc62d28 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eab..7a8698b913f 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f..a48a90076d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f05..e5000343dd6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef 
CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c..602d724afa5 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. 
*/ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..c77c6487552 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a4735..df34ceae0c6 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd885..b14984566f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
+ */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3-70-g09d2 From 7c756e6e19e71f0327760d8955f7077118ebb2b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:50 +0900 Subject: percpu: implement optional weak percpu definitions Some archs (alpha and s390) need to use weak definitions for percpu variables in modules so that the compiler generates external references for them. This patch implements weak percpu definitions which arch can enable by defining ARCH_NEEDS_WEAK_PER_CPU in arch percpu header file. This weak definition adds the following two restrictions on percpu variable definitions. 1. percpu symbols must be unique whether static or not 2. percpu variables can't be defined inside a function To ensure that these restrictions are observed in generic code, config option DEBUG_FORCE_WEAK_PER_CPU enables weak percpu definitions for all cases. This patch is inspired by Ivan Kokshaysky's alpha percpu patch. [ Impact: stricter rules for percpu variables, one more debug config option ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Howells Cc: Ivan Kokshaysky --- include/linux/percpu-defs.h | 65 ++++++++++++++++++++++++++++++++++++++------- lib/Kconfig.debug | 15 +++++++++++ 2 files changed, 71 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8f921d74f49..cf32838ad0f 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -10,21 +10,68 @@ /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the - * 'section' argument. This may be used to affect the parameters governing the + * 'sec' argument. This may be used to affect the parameters governing the * variable's storage. * * NOTE! The sections for the DECLARE and for the DEFINE must match, lest * linkage errors occur due the compiler generating the wrong code to access * that section. */ -#define DECLARE_PER_CPU_SECTION(type, name, section) \ - extern \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name - -#define DEFINE_PER_CPU_SECTION(type, name, section) \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name +#define __PCPU_ATTRS(sec) \ + __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + PER_CPU_ATTRIBUTES + +#define __PCPU_DUMMY_ATTRS \ + __attribute__((section(".discard"), unused)) + +/* + * s390 and alpha modules require percpu variables to be defined as + * weak to force the compiler to generate GOT based external + * references for them. This is necessary because percpu sections + * will be located outside of the usually addressable area. + * + * This definition puts the following two extra restrictions when + * defining percpu variables. + * + * 1. The symbol must be globally unique, even the static ones. + * 2. Static percpu variables cannot be defined inside a function. + * + * Archs which need weak percpu definitions should define + * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary. 
+ * + * To ensure that the generic code observes the above two + * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak + * definition is used for all cases. + */ +#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU) +/* + * __pcpu_scope_* dummy variable is used to enforce scope. It + * receives the static modifier when it's used in front of + * DEFINE_PER_CPU() and will trigger build failure if + * DECLARE_PER_CPU() is used for the same variable. + * + * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness + * such that hidden weak symbol collision, which will cause unrelated + * variables to share the same address, can be detected during build. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name +#else +/* + * Normal declaration and definition macros. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name +#endif /* * Variant on the per-CPU variable declaration/definition theme used for diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 23067ab1a73..77e0d8b1b7c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -777,6 +777,21 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config DEBUG_FORCE_WEAK_PER_CPU + bool "Force weak per-cpu definitions" + depends on DEBUG_KERNEL + help + s390 and alpha require percpu variables in modules to be + defined weak to work around addressing range issue which + puts the following two restrictions on percpu variable + definitions. + + 1. percpu symbols must be unique whether static or not + 2. percpu variables can't be defined inside a function + + To ensure that generic code follows the above rules, this + option forces all percpu variables to be defined as weak. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.3-70-g09d2 From c17ef45342cc033fdf7bdd5b28615e0090f8d2e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2009 17:12:47 -0700 Subject: rcu: Remove Classic RCU Remove Classic RCU, given that the combination of Tree RCU and the proposed Bloatwatch RCU do everything that Classic RCU can with fewer bugs. Tree RCU has been default in x86 builds for almost six months, and seems to be quite reliable, so there does not seem to be much justification for keeping the Classic RCU code and config complexity around anymore. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 178 ---------- include/linux/rcupdate.h | 4 +- init/Kconfig | 12 +- kernel/Makefile | 1 - kernel/rcuclassic.c | 807 --------------------------------------------- 5 files changed, 3 insertions(+), 999 deletions(-) delete mode 100644 include/linux/rcuclassic.h delete mode 100644 kernel/rcuclassic.c (limited to 'include/linux') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h deleted file mode 100644 index bfd92e1e5d2..00000000000 --- a/include/linux/rcuclassic.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (classic version) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Author: Dipankar Sarma - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ - -#ifndef __LINUX_RCUCLASSIC_H -#define __LINUX_RCUCLASSIC_H - -#include -#include -#include -#include -#include - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - long pending; /* Number of the last pending batch */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - unsigned long gp_start; /* Time at which GP started in jiffies. */ - unsigned long jiffies_stall; - /* Time at which to check for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */ - /* current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* Per-CPU data for Read-Copy UPdate. */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. 
*/ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - /* - * if nxtlist is not NULL, then: - * batch: - * The batch # for the last entry of nxtlist - * [*nxttail[1], NULL = *nxttail[2]): - * Entries that batch # <= batch - * [*nxttail[0], *nxttail[1]): - * Entries that batch # <= batch - 1 - * [nxtlist, *nxttail[0]): - * Entries that batch # <= batch - 2 - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - */ - long batch; - struct rcu_head *nxtlist; - struct rcu_head **nxttail[3]; - long qlen; /* # of queued callbacks */ - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -extern void rcu_qsctr_inc(int cpu); -extern void rcu_bh_qsctr_inc(int cpu); - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() \ - lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) -#else -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -#endif - -#define __rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - rcu_read_acquire(); \ - } while (0) -#define __rcu_read_unlock() \ - do { \ - rcu_read_release(); \ - __release(RCU); \ - preempt_enable(); \ - } while (0) -#define __rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - rcu_read_acquire(); \ - } while (0) -#define __rcu_read_unlock_bh() \ - do { \ - rcu_read_release(); \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while (0) - -#define __synchronize_sched() synchronize_rcu() - -#define call_rcu_sched(head, func) call_rcu(head, func) - -extern void __rcu_init(void); -#define rcu_init_sched() do { } while (0) -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); - -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); - -#define rcu_enter_nohz() do { } while (0) -#define rcu_exit_nohz() do { } while (0) - -/* A context switch is a grace period for rcuclassic. */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - -#endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 15fbb3ca634..0cdfdb622fa 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -54,9 +54,7 @@ struct rcu_head { /* Internal to kernel, but needed by rcupreempt.h. */ extern int rcu_scheduler_active; -#if defined(CONFIG_CLASSIC_RCU) -#include -#elif defined(CONFIG_TREE_RCU) +#if defined(CONFIG_TREE_RCU) #include #elif defined(CONFIG_PREEMPT_RCU) #include diff --git a/init/Kconfig b/init/Kconfig index 1ce05a4cb5f..d10f31dfa0b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -316,21 +316,13 @@ choice prompt "RCU Implementation" default TREE_RCU -config CLASSIC_RCU - bool "Classic RCU" - help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. - - Select this option if you are unsure. 
- config TREE_RCU bool "Tree-based hierarchical RCU" help This option selects the RCU implementation that is designed for very large SMP system with hundreds or - thousands of CPUs. + thousands of CPUs. It also scales down nicely to + smaller systems. config PREEMPT_RCU bool "Preemptible RCU" diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec9..d6042bcc9e9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,7 +80,6 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c deleted file mode 100644 index 0f2b0b31130..00000000000 --- a/kernel/rcuclassic.c +++ /dev/null @@ -1,807 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); -static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. 
- */ -void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} - -void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - unsigned long flags; - - set_need_resched(); - spin_lock_irqsave(&rcp->lock, flags); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - * - * cpu_online_mask is updated by the _cpu_down() - * using __stop_machine(). Since we're in irqs disabled - * section, __stop_machine() is not exectuting, hence - * the cpu_online_mask is stable. - * - * However, a cpu might have been offlined _just_ before - * we disabled irqs while entering here. - * And rcu subsystem might not yet have handled the CPU_DEAD - * notification, leading to the offlined cpu's bit - * being set in the rcp->cpumask. - * - * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent - * sending smp_reschedule() to an offlined CPU. - */ - for_each_cpu_and(cpu, - to_cpumask(rcp->cpumask), cpu_online_mask) { - if (cpu != rdp->cpu) - smp_send_reschedule(cpu); - } - } - spin_unlock_irqrestore(&rcp->lock, flags); -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - set_need_resched(); -} -#endif - -static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - long batch; - - head->next = NULL; - smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ - - /* - * Determine the batch number of this callback. - * - * Using ACCESS_ONCE to avoid the following error when gcc eliminates - * local variable "batch" and emits codes like this: - * 1) rdp->batch = rcp->cur + 1 # gets old value - * ...... - * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value - * then [*nxttail[0], *nxttail[1]) may contain callbacks - * that batch# = rdp->batch, see the comment of struct rcu_data. - */ - batch = ACCESS_ONCE(rcp->cur) + 1; - - if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { - /* process callbacks */ - rdp->nxttail[0] = rdp->nxttail[1]; - rdp->nxttail[1] = rdp->nxttail[2]; - if (rcu_batch_after(batch - 1, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[2]; - } - - rdp->batch = batch; - *rdp->nxttail[2] = head; - rdp->nxttail[2] = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } -} - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = jiffies - rcp->jiffies_stall; - if (delta < 2 || rcp->cur != rcp->completed) { - spin_unlock_irqrestore(&rcp->lock, flags); - return; - } - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... 
*/ - - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) - printk(" %d", cpu); - } - printk(" (detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rcp->gp_start)); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", - smp_processor_id(), jiffies, - jiffies - rcp->gp_start); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(jiffies - rcp->jiffies_stall) >= 0) - rcp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - long delta; - - delta = jiffies - rcp->jiffies_stall; - if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && - delta >= 0) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rcp); - - } else if (rcp->cur != rcp->completed && delta >= 2) { - - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/* - * Return the number of RCU batches processed thus far. 
Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* Raises the softirq for processing rcu_callbacks. */ -static inline void raise_rcu_softirq(void) -{ - raise_softirq(RCU_SOFTIRQ); -} - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. - */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_save(flags); - rdp->qlen -= count; - local_irq_restore(flags); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - raise_rcu_softirq(); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ - -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->cur != rcp->pending && - rcp->completed == rcp->cur) { - rcp->cur++; - record_gp_stall_check_time(rcp); - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpumask_andnot(to_cpumask(rcp->cpumask), - cpu_online_mask, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); - if (cpumask_empty(to_cpumask(rcp->cpumask))) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. 
- */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock_irqsave(&rcp->lock, flags); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock_irqrestore(&rcp->lock, flags); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail, long batch) -{ - unsigned long flags; - - if (list) { - local_irq_save(flags); - this_rdp->batch = batch; - *this_rdp->nxttail[2] = list; - this_rdp->nxttail[2] = tail; - local_irq_restore(flags); - } -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - unsigned long flags; - - /* - * if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_irqsave(&rcp->lock, flags); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock(&rcp->lock); - - this_rdp->qlen += rdp->qlen; - local_irq_restore(flags); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from softirq context. 
- */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - long completed_snap; - - if (rdp->nxtlist) { - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * move the other grace-period-completed entries to - * [rdp->nxtlist, *rdp->nxttail[0]) temporarily - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) - rdp->nxttail[0] = rdp->nxttail[1]; - - /* - * the grace period for entries in - * [rdp->nxtlist, *rdp->nxttail[0]) has completed and - * move these entries to donelist - */ - if (rdp->nxttail[0] != &rdp->nxtlist) { - *rdp->donetail = rdp->nxtlist; - rdp->donetail = rdp->nxttail[0]; - rdp->nxtlist = *rdp->nxttail[0]; - *rdp->donetail = NULL; - - if (rdp->nxttail[1] == rdp->nxttail[0]) - rdp->nxttail[1] = &rdp->nxtlist; - if (rdp->nxttail[2] == rdp->nxttail[0]) - rdp->nxttail[2] = &rdp->nxtlist; - rdp->nxttail[0] = &rdp->nxtlist; - } - - local_irq_restore(flags); - - if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags2; - - /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags2); - if (rcu_batch_after(rdp->batch, rcp->pending)) { - rcp->pending = rdp->batch; - rcu_start_batch(rcp); - } - spin_unlock_irqrestore(&rcp->lock, flags2); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be see before any RCU - * grace-period manupulations below. - */ - - smp_mb(); /* See above block comment. */ - - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be see after any RCU - * grace-period manupulations above. - */ - - smp_mb(); /* See above block comment. */ -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp); - - if (rdp->nxtlist) { - long completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - return 1; - if (!rcu_batch_before(completed_snap, rdp->batch - 1) && - rdp->nxttail[0] != rdp->nxttail[1]) - return 1; - if (rdp->nxttail[0] != &rdp->nxtlist) - return 1; - - /* - * This cpu has pending rcu entries and the new batch - * for then hasn't been started nor scheduled start - */ - if (rcu_batch_after(rdp->batch, rcp->pending)) - return 1; - } - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. 
- */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); -} - -/* - * Top-level function driving RCU grace-period detection, normally - * invoked from the scheduler-clock interrupt. This function simply - * increments counters that are read only from softirq by this same - * CPU, so there are no memory barriers required. - */ -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. - * - * Also do a memory barrier. This is needed to handle - * the case where writes from a preempt-disable section - * of code get reordered into schedule() by this CPU's - * write buffer. The memory barrier makes sure that - * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see - * by other CPUs to happen after any such write. - */ - - smp_mb(); /* See above block comment. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. The memory barrier - * is needed for the same reason as is the above one. - */ - - smp_mb(); /* See above block comment. */ - rcu_bh_qsctr_inc(cpu); - } - raise_rcu_softirq(); -} - -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - spin_lock_irqsave(&rcp->lock, flags); - memset(rdp, 0, sizeof(*rdp)); - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void __cpuinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. 
- * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ -void __init __rcu_init(void) -{ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -- cgit v1.2.3-70-g09d2 From 9e48858f7d36a6a3849f1d1b40c3bf5624b4ee7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 May 2009 19:26:19 +1000 Subject: security: rename ptrace_may_access => ptrace_access_check The ->ptrace_may_access() methods are named confusingly - the real ptrace_may_access() returns a bool, while these security checks have a retval convention. Rename it to ptrace_access_check, to reduce the confusion factor. [ Impact: cleanup, no code changed ] Signed-off-by: Ingo Molnar Signed-off-by: James Morris --- include/linux/security.h | 14 +++++++------- kernel/ptrace.c | 2 +- security/capability.c | 2 +- security/commoncap.c | 4 ++-- security/security.c | 4 ++-- security/selinux/hooks.c | 6 +++--- security/smack/smack_lsm.c | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 5eff459b383..145909165db 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -52,7 +52,7 @@ struct audit_krule; extern int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, int audit); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); +extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset(struct cred *new, const struct cred *old, @@ -1209,7 +1209,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @alter contains the flag indicating whether changes are to be made. * Return 0 if permission is granted. * - * @ptrace_may_access: + * @ptrace_access_check: * Check permission before allowing the current process to trace the * @child process. * Security modules may also want to perform a process tracing check @@ -1224,7 +1224,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself * to the @parent process for tracing. - * The parent process will still have to undergo the ptrace_may_access + * The parent process will still have to undergo the ptrace_access_check * checks before it is allowed to trace this one. * @parent contains the task_struct structure for debugger process. * Return 0 if permission is granted. 
@@ -1336,7 +1336,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace_may_access) (struct task_struct *child, unsigned int mode); + int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); int (*ptrace_traceme) (struct task_struct *parent); int (*capget) (struct task_struct *target, kernel_cap_t *effective, @@ -1617,7 +1617,7 @@ extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); /* Security operations */ -int security_ptrace_may_access(struct task_struct *child, unsigned int mode); +int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, kernel_cap_t *effective, @@ -1798,10 +1798,10 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace_may_access(struct task_struct *child, +static inline int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { - return cap_ptrace_may_access(child, mode); + return cap_ptrace_access_check(child, mode); } static inline int security_ptrace_traceme(struct task_struct *parent) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61c78b2c07b..9a4184e04f2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace_may_access(task, mode); + return security_ptrace_access_check(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) diff --git a/security/capability.c b/security/capability.c index 21b6cead6a8..f218dd36164 100644 --- a/security/capability.c +++ b/security/capability.c @@ -863,7 +863,7 @@ struct security_operations default_security_ops = { void security_fixup_ops(struct security_operations *ops) { - set_to_cap_if_null(ops, ptrace_may_access); + set_to_cap_if_null(ops, ptrace_access_check); set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); set_to_cap_if_null(ops, capset); diff --git a/security/commoncap.c b/security/commoncap.c index 48b7e0228fa..aa97704564d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -101,7 +101,7 @@ int cap_settime(struct timespec *ts, struct timezone *tz) } /** - * cap_ptrace_may_access - Determine whether the current process may access + * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. @@ -109,7 +109,7 @@ int cap_settime(struct timespec *ts, struct timezone *tz) * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. 
*/ -int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) +int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; diff --git a/security/security.c b/security/security.c index dc7674fbfc7..4501c5e1f98 100644 --- a/security/security.c +++ b/security/security.c @@ -124,9 +124,9 @@ int register_security(struct security_operations *ops) /* Security operations */ -int security_ptrace_may_access(struct task_struct *child, unsigned int mode) +int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { - return security_ops->ptrace_may_access(child, mode); + return security_ops->ptrace_access_check(child, mode); } int security_ptrace_traceme(struct task_struct *parent) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d6f64783acd..e3b4f3083dd 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1854,12 +1854,12 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_ptrace_may_access(struct task_struct *child, +static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc; - rc = cap_ptrace_may_access(child, mode); + rc = cap_ptrace_access_check(child, mode); if (rc) return rc; @@ -5315,7 +5315,7 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) static struct security_operations selinux_ops = { .name = "selinux", - .ptrace_may_access = selinux_ptrace_may_access, + .ptrace_access_check = selinux_ptrace_access_check, .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, .capset = selinux_capset, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0023182078c..1c9bdbcbe3d 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -91,7 +91,7 @@ struct inode_smack *new_inode_smack(char *smack) */ /** - * smack_ptrace_may_access - Smack approval on PTRACE_ATTACH + * smack_ptrace_access_check - Smack approval on PTRACE_ATTACH * @ctp: child task pointer * @mode: ptrace attachment mode * @@ -99,13 +99,13 @@ struct inode_smack *new_inode_smack(char *smack) * * Do the capability checks, and require read and write. */ -static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) +static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode) { int rc; struct smk_audit_info ad; char *sp, *tsp; - rc = cap_ptrace_may_access(ctp, mode); + rc = cap_ptrace_access_check(ctp, mode); if (rc != 0) return rc; @@ -3032,7 +3032,7 @@ static void smack_release_secctx(char *secdata, u32 seclen) struct security_operations smack_ops = { .name = "smack", - .ptrace_may_access = smack_ptrace_may_access, + .ptrace_access_check = smack_ptrace_access_check, .ptrace_traceme = smack_ptrace_traceme, .syslog = smack_syslog, -- cgit v1.2.3-70-g09d2 From 38b7f49a0654cb52cac61c6455807248eee3059d Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 26 Jun 2009 10:48:34 +0200 Subject: HID: fix debugfs build with !CONFIG_DEBUG_FS Fix the debug function prototypes to be correct even in the !CONFIG_DEBUG_FS case. 
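As a minimal sketch of why the stub arity matters (hdev, usage and value below are hypothetical caller-side names, not taken from the patch): with CONFIG_DEBUG_FS disabled the debug calls expand to the do-nothing macros, so a stub whose parameter count disagrees with the real prototype makes every caller fail to build.

    /* hypothetical call site in a HID driver */
    hid_dump_input(hdev, &usage, value);

    /*
     * Against the old two-parameter stub
     *     #define hid_dump_input(a,b)   do { } while (0)
     * this three-argument call expands with the wrong arity and breaks
     * the !CONFIG_DEBUG_FS build. The corrected stub
     *     #define hid_dump_input(a,b,c) do { } while (0)
     * matches the arity of the real prototype
     *     void hid_dump_input(struct hid_device *, struct hid_usage *, __s32);
     * and compiles away to nothing.
     */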
Reported-by: Stephen Rothwell Signed-off-by: Jiri Kosina --- include/linux/hid-debug.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index ec08ac1ad68..53744fa1c8b 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -22,6 +22,8 @@ * */ +#define HID_DEBUG_BUFSIZE 512 + #ifdef CONFIG_DEBUG_FS void hid_dump_input(struct hid_device *, struct hid_usage *, __s32); @@ -34,7 +36,6 @@ void hid_debug_init(void); void hid_debug_exit(void); void hid_debug_event(struct hid_device *, char *); -#define HID_DEBUG_BUFSIZE 512 struct hid_debug_list { char *hid_debug_buf; @@ -48,11 +49,10 @@ struct hid_debug_list { #else -#define hid_dump_input(a,b) do { } while (0) -#define hid_dump_device(c) do { } while (0) -#define hid_dump_field(a,b) do { } while (0) -#define hid_resolv_usage(a) do { } while (0) -#define hid_resolv_event(a,b) do { } while (0) +#define hid_dump_input(a,b,c) do { } while (0) +#define hid_dump_device(a,b) do { } while (0) +#define hid_dump_field(a,b,c) do { } while (0) +#define hid_resolv_usage(a,b) do { } while (0) #define hid_debug_register(a, b) do { } while (0) #define hid_debug_unregister(a) do { } while (0) #define hid_debug_init() do { } while (0) -- cgit v1.2.3-70-g09d2 From 1a8dd307cc0a2119be4e578c517795464e6dabba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jun 2009 17:45:39 +0900 Subject: percpu: use __weak only in the definition of weak percpu variables __weak is necessary only for definition and might even not work in declaration. Drop it from declaration. This change was suggested by Ivan Kokshaysky. Signed-off-by: Tejun Heo Acked-by: Ivan Kokshaysky --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index cf32838ad0f..9b7a53cc16e 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -56,7 +56,7 @@ */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ - extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ -- cgit v1.2.3-70-g09d2 From 03b042bf1dc14a268a3d65d38b4ec2a4261e8477 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:16 -0700 Subject: rcu: Add synchronize_sched_expedited() primitive This adds the synchronize_sched_expedited() primitive that implements the "big hammer" expedited RCU grace periods. This primitive is placed in kernel/sched.c rather than kernel/rcupdate.c due to its need to interact closely with the migration_thread() kthread. The idea is to wake up this kthread with req->task set to NULL, in response to which the kthread reports the quiescent state resulting from the kthread having been scheduled. Because this patch needs to fallback to the slow versions of the primitives in response to some races with CPU onlining and offlining, a new synchronize_rcu_bh() primitive is added as well. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460982947-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcupdate.h | 25 ++++----- include/linux/rcupreempt.h | 10 ++++ include/linux/rcutree.h | 12 ++++- kernel/rcupdate.c | 25 +++++++++ kernel/sched.c | 129 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 186 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0cdfdb622fa..3c89d6a2591 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,7 +51,19 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -/* Internal to kernel, but needed by rcupreempt.h. */ +/* Exported common interfaces */ +extern void synchronize_rcu(void); +extern void synchronize_rcu_bh(void); +extern void rcu_barrier(void); +extern void rcu_barrier_bh(void); +extern void rcu_barrier_sched(void); +extern void synchronize_sched_expedited(void); +extern int sched_expedited_torture_stats(char *page); + +/* Internal to kernel */ +extern void rcu_init(void); +extern void rcu_scheduler_starting(void); +extern int rcu_needs_cpu(int cpu); extern int rcu_scheduler_active; #if defined(CONFIG_TREE_RCU) @@ -257,15 +269,4 @@ extern void call_rcu(struct rcu_head *head, extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); -/* Exported common interfaces */ -extern void synchronize_rcu(void); -extern void rcu_barrier(void); -extern void rcu_barrier_bh(void); -extern void rcu_barrier_sched(void); - -/* Internal to kernel */ -extern void rcu_init(void); -extern void rcu_scheduler_starting(void); -extern int rcu_needs_cpu(int cpu); - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index fce522782ff..f164ac9b780 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -74,6 +74,16 @@ extern int rcu_needs_cpu(int cpu); extern void __synchronize_sched(void); +static inline void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); /* Placeholder for new rcupreempt implementation. */ +} + +static inline void synchronize_rcu_bh_expedited(void) +{ + synchronize_rcu_bh(); /* Placeholder for new rcupreempt impl. 
*/ +} + extern void __rcu_init(void); extern void rcu_init_sched(void); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5a5153806c4..d4dfd248963 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -286,8 +286,14 @@ static inline void __rcu_read_unlock_bh(void) #define call_rcu_sched(head, func) call_rcu(head, func) -static inline void rcu_init_sched(void) +static inline void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} + +static inline void synchronize_rcu_bh_expedited(void) { + synchronize_sched_expedited(); } extern void __rcu_init(void); @@ -297,6 +303,10 @@ extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); +static inline void rcu_init_sched(void) +{ +} + #ifdef CONFIG_NO_HZ void rcu_enter_nohz(void); void rcu_exit_nohz(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a967c9feb90..eae29c25fb1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -98,6 +98,30 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type) static inline void wait_migrated_callbacks(void) { wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); + smp_mb(); /* In case we didn't sleep. 
*/ } /* diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..9ae80bec1c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7024,6 +7024,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -7031,6 +7036,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -7065,8 +7071,17 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); @@ -10554,3 +10569,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. 
*/ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ -- cgit v1.2.3-70-g09d2 From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: percpu: drop @unit_size from embed first chunk allocator The only extra feature @unit_size provides is making dead space at the end of the first chunk which doesn't have any valid usecase. Drop the parameter. This will increase consistency with generalized 4k allocator. James Bottomley spotted missing conversion for the default setup_per_cpu_areas() which caused build breakage on all archs which use it.
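As a minimal sketch of the resulting API change (dyn_size and ret below are hypothetical caller-side names; the constants are the ones used by the x86 conversion in this commit): callers simply drop the trailing -1 they were already passing.

    /* before: callers passed -1 to request the automatic unit size */
    ret = pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
                                 dyn_size, -1);

    /* after: the parameter is gone; the unit size is always
     * max(page-aligned first-chunk size, PCPU_MIN_UNIT_SIZE) */
    ret = pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
                                 dyn_size);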
[ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 2 +- mm/percpu.c | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4..14728206fb5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + reserve - PERCPU_FIRST_CHUNK_RESERVE); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e5000343dd6..83bff053bd1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + ssize_t dyn_size); /* * Use this to get to a cpu's version of the per-cpu object diff --git a/mm/percpu.c b/mm/percpu.c index 19dd83b5cbd..fc6babe6e55 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * page size. * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. @@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * percpu access on success, -errno on failure. */ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) + ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; @@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, if (dyn_size != 0) dyn_size = pcpue_size - static_size - reserved_size; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, @@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. 
*/ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, -1); + PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-70-g09d2 From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize 4k first chunk allocator Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk(). setup_pcpu_4k() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for consistency. [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 78 ++++++++++---------------------------- include/linux/percpu.h | 12 +++++- mm/percpu.c | 85 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 14728206fb5..ab896b31e80 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +{ + return pcpu_alloc_bootmem(cpu, size, size); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + /* * Large page remap allocator * @@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k page allocator + * 4k allocator * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. + * Boring fallback 4k allocator. This allocator puts more pressure on + * PTE TLBs but other than that behaves nicely on both UMA and NUMA. 
*/ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr) static ssize_t __init setup_pcpu_4k(size_t static_size) { - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; + return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpu4k_populate_pte); } /* for explicit first chunk allocator selection */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 83bff053bd1..41b5bfab419 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,18 +59,26 @@ extern void *pcpu_base_addr; typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + pcpu_fc_populate_pte_fn_t populate_pte_fn); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +extern ssize_t __init pcpu_4k_first_chunk( + size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. 
Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index fc6babe6e55..27b0f40a3ea 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, pcpue_unit_size, pcpue_ptr, NULL); } +/* + * 4k page first chunk setup helper. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + reserved_size, -1, + -1, NULL, populate_pte_fn); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + /* * Generic percpu area setup. * -- cgit v1.2.3-70-g09d2 From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize lpage first chunk allocator Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper around the generalized version. 
Other than taking size parameters and using arch supplied callbacks to allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, factor out pcpu_calc_fc_sizes() which is common to pcpu_embed_first_chunk() and pcpu_lpage_first_chunk(). [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 9 -- arch/x86/kernel/setup_percpu.c | 169 +++------------------------------ arch/x86/mm/pageattr.c | 1 + include/linux/percpu.h | 27 ++++++ mm/percpu.c | 209 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 244 insertions(+), 171 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..a18c038a307 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -156,15 +156,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ab896b31e80..4f2e0ac9130 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size) } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Large page remapping allocator */ #ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void __init pcpul_map(void *ptr, size_t size, void *addr) { - size_t off = (size_t)pageno << PAGE_SHIFT; + pmd_t *pmd, pmd_v; - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + pmd = populate_extra_pmd((unsigned long)addr); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; @@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
- */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? 
*/ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; + return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a896..c106f785242 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 41b5bfab419..9f6bfd7d4b9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, @@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#ifdef CONFIG_NEED_MULTIPLE_NODES +extern ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn); + +extern void *pcpu_lpage_remapped(void *kaddr); +#else +static inline ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + return -EINVAL; +} + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378..17db527ee2e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, return pcpu_unit_size; } +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /* * Embedding first chunk setup helper. */ @@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? 
dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); @@ -1390,6 +1400,197 @@ out_free_ar: return ret; } +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static size_t pcpul_unit_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; + +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpul_size) + return NULL; + + return virt_to_page(pcpul_map[cpu].ptr + off); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @lpage_size: the size of a large page + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page as unit. A large page is allocated + * for each cpu and each is remapped into vmalloc area using large + * page mapping. As large page can be quite large, only part of it is + * used for the first chunk. Unused part is returned to the bootmem + * allocator. + * + * So, the large pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more large TLB entry pressure but still is + * much better than only using 4k mappings while still being NUMA + * friendly. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + size_t size_sum; + size_t map_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + pcpul_unit_size = lpage_size; + pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + if (pcpul_size > pcpul_unit_size) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); + + for_each_possible_cpu(cpu) { + void *ptr; + + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + /* + * Only use pcpul_size bytes and give back the rest. + * + * Ingo: The lpage_size up-rounding bootmem is needed + * to make sure the partial lpage is still fully RAM - + * it's not well-specified to have a incompatible area + * (unmapped RAM, device memory, etc.) in that hole. 
+ */ + free_fn(ptr + pcpul_size, lpage_size - pcpul_size); + + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = ptr; + + memcpy(ptr, __per_cpu_load, static_size); + } + + /* allocate address and map */ + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; + vm_area_register_early(&pcpul_vm, pcpul_unit_size); + + for_each_possible_cpu(cpu) + map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, + pcpul_vm.addr + cpu * pcpul_unit_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", pcpul_vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, + reserved_size, dyn_size, pcpul_unit_size, + pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; + +enomem: + for_each_possible_cpu(cpu) + if (pcpul_map[cpu].ptr) + free_fn(pcpul_map[cpu].ptr, pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long unit_mask = pcpul_unit_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); + unsigned long offset = (unsigned long)kaddr & unit_mask; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * pcpul_unit_size + offset; + } + } + + return NULL; +} +#endif + /* * Generic percpu area setup. * -- cgit v1.2.3-70-g09d2 From 38a6be525460f52ac6f2de1c3f73c5615a8853cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: percpu: simplify pcpu_setup_first_chunk() Now that all first chunk allocator helpers allocate and map the first chunk themselves, there's no need to have optional default alloc/map in pcpu_setup_first_chunk(). Drop @populate_pte_fn and only leave @dyn_size optional and make all other params mandatory. This makes it much easier to follow what pcpu_setup_first_chunk() is doing and what actual differences tweaking each parameter results in. 
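A sketch of the calling convention after this change, with hypothetical helper names (arch_alloc_and_map() stands in for whatever a given first chunk helper actually does: bootmem plus vm_area_register_early(), large page remap, etc.). Every helper now allocates and maps its own first chunk, copies the static data, and only then calls pcpu_setup_first_chunk() with a mandatory @unit_size and @base_addr:

	/* illustrative only; names other than pcpu_setup_first_chunk()
	 * and __per_cpu_load are placeholders */
	void *base = arch_alloc_and_map(num_possible_cpus() * unit_size);
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		memcpy(base + cpu * unit_size, __per_cpu_load, static_size);

	pcpu_unit_size = pcpu_setup_first_chunk(get_page_fn, static_size,
						reserved_size, dyn_size,
						unit_size, base);

(@get_page_fn is still present at this point in the series; the next patch removes it.)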
[ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/sparc/kernel/smp_64.c | 2 +- include/linux/percpu.h | 5 +-- mm/percpu.c | 104 +++++++++++++-------------------------------- 3 files changed, 33 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index fa44eaf8d89..ccad7b20ae7 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1528,7 +1528,7 @@ void __init setup_per_cpu_areas(void) pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + PCPU_CHUNK_SIZE, vm.addr); free_bootmem(__pa(pcpur_ptrs), ptrs_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9f6bfd7d4b9..ec64357e176 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -66,9 +66,8 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + ssize_t dyn_size, size_t unit_size, + void *base_addr); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, diff --git a/mm/percpu.c b/mm/percpu.c index 17db527ee2e..21d938a1066 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -983,24 +983,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. + * setup path. * * @get_page_fn() should return pointer to percpu page given cpu * number and page number. It should at least return enough pages to * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. + * have been initialized with valid data. It can also return pages + * after the static area. NULL return indicates end of pages for the + * cpu. Note that @get_page_fn() must return the same number of pages + * for all cpus. * * @reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves @@ -1015,17 +1013,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * non-negative value makes percpu leave alone the area beyond * @static_size + @reserved_size + @dyn_size. 
* - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. - * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. + * @unit_size specifies unit size and must be aligned to PAGE_SIZE and + * equal to or larger than @static_size + @reserved_size + if + * non-negative, @dyn_size. * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1040,9 +1033,8 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_fc_populate_pte_fn_t populate_pte_fn) + ssize_t dyn_size, size_t unit_size, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1050,27 +1042,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; - int nr_pages; - int err, i; + int i, nr_pages; /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + BUG_ON(!base_addr); + BUG_ON(unit_size < size_sum); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -1079,6 +1062,10 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; + first_vm.addr = base_addr; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. 
@@ -1101,6 +1088,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; + schunk->immutable = true; if (reserved_size) { schunk->free_size = reserved_size; @@ -1124,31 +1112,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } - /* assign pages */ nr_pages = -1; for_each_possible_cpu(cpu) { @@ -1168,19 +1138,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, BUG_ON(nr_pages != i); } - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -1282,7 +1239,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(pcpue_get_page, static_size, reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + pcpue_unit_size, pcpue_ptr); } /* @@ -1387,8 +1344,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, reserved_size, -1, - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr, - NULL); + pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: @@ -1521,7 +1477,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, reserved_size, dyn_size, pcpul_unit_size, - pcpul_vm.addr, NULL); + pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) -- cgit v1.2.3-70-g09d2 From ce3141a277ff6cc37e51008b8888dc2cb7456ef1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: drop pcpu_chunk->page[] percpu core doesn't need to track all the allocated pages. It needs to know whether certain pages are populated and a way to reverse-map an address to its page when freeing. This patch drops pcpu_chunk->page[] and uses a populated bitmap and vmalloc_to_page() lookup instead. Using vmalloc_to_page() exclusively is also possible but complicates first chunk handling, inflates cache footprint and prevents non-standard memory allocation for percpu memory. pcpu_chunk->page[] was used to track each page's allocation and allowed asymmetric population which happens during the failure path; however, with a single bitmap for all units, this is no longer possible.
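A condensed restatement (illustrative only, details trimmed) of how the reverse lookup works once the page[] array is gone, based on pcpu_chunk_page() in the diff that follows: the address of a unit page is computed arithmetically and handed to vmalloc_to_page(), while population state lives in a single bitmap of pcpu_unit_pages bits shared by all units:

	static struct page *chunk_page(struct pcpu_chunk *chunk,
				       unsigned int cpu, int page_idx)
	{
		void *addr = chunk->vm->addr + cpu * pcpu_unit_size +
			     ((size_t)page_idx << PAGE_SHIFT);

		/* only meaningful if test_bit(page_idx, chunk->populated) */
		return vmalloc_to_page(addr);
	}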
Bite the bullet and rewrite (de)populate functions so that things are done in clearly separated steps such that asymmetric population doesn't happen. This makes the (de)population process much more modular and will also ease implementing non-standard memory usage in the future (e.g. large pages). This makes @get_page_fn parameter to pcpu_setup_first_chunk() unnecessary. The parameter is dropped and all first chunk helpers are updated accordingly. Please note that despite the volume most changes to first chunk helpers are symbol renames for variables which don't need to be referenced outside of the helper anymore. This change reduces memory usage and cache footprint of pcpu_chunk. Now only #unit_pages bits are necessary per chunk. [ Impact: reduced memory usage and cache footprint for bookkeeping ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 42 ++-- include/linux/percpu.h | 3 +- mm/percpu.c | 604 ++++++++++++++++++++++++++++----------------- 3 files changed, 400 insertions(+), 249 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index ccad7b20ae7..f2f22ee97a7 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); -} - #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) static void __init pcpu_map_range(unsigned long start, unsigned long end, @@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void) size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; unsigned long delta, cpu; - size_t pcpu_unit_size; + size_t size_sum, pcpu_unit_size; size_t ptrs_size; + void **ptrs; - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; + size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); + dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0])); + ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, - PCPU_CHUNK_SIZE); + ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, + PCPU_CHUNK_SIZE); - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PCPU_CHUNK_SIZE - pcpur_size); + free_bootmem(__pa(ptrs[cpu] + size_sum), + PCPU_CHUNK_SIZE - size_sum); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void) start += cpu * PCPU_CHUNK_SIZE; end = start + PCPU_CHUNK_SIZE; - pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu])); + pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, + pcpu_unit_size = pcpu_setup_first_chunk(static_size, PERCPU_MODULE_RESERVE, dyn_size, PCPU_CHUNK_SIZE, vm.addr); - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(ptrs), 
ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ec64357e176..63c8b7a23e6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -58,13 +58,12 @@ extern void *pcpu_base_addr; -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, +extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, void *base_addr); diff --git a/mm/percpu.c b/mm/percpu.c index 639fce4d2ca..21756814d99 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,8 +94,7 @@ struct pcpu_chunk { int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; @@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); } -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; -} + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) -{ - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /* set the pointer to a chunk in a page struct */ @@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. 
+ */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) } /** - * pcpu_unmap - unmap pages out of a pcpu_chunk + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * @chunk: chunk of interest + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array + * + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. + */ +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) +{ + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = num_possible_cpus() * pcpu_unit_pages * + sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } + + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + + *bitmapp = bitmap; + return pages; +} + +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } +} + +/** + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). 
+ */ +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} + +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} + +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 - * @flush_tlb: whether to flush tlb or not * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush_tlb) +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; unsigned int cpu; + int i; - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. 
- */ - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; + } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); + } - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush_tlb) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, } /** - * pcpu_map - map pages into a pcpu_chunk + * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). 
*/ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + unsigned int cpu, tcpu; + int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), - pcpu_chunk_pagep(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) - return err; + goto err; } + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); + } + + return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + /* flush at once, please read comments in pcpu_unmap() */ flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); - return 0; } /** @@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); - unsigned int cpu; - int i; + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + return; + break; + } - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); - if (!*pagep) - continue; + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); - __free_page(*pagep); + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); - unmap_start = unmap_start < 0 ? 
i : unmap_start; - unmap_end = i + 1; - } - } + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); } /** @@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; unsigned int cpu; - int i; + int rs, re, rc; - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } + /* quick path, check whether all pages are already there */ + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + goto clear; + break; + } - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: for_each_possible_cpu(cpu) memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, size); - return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) @@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { @@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work) mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { 
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } } @@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu); /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto @@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu); * perpcu area. This function is to be called from arch percpu area * setup path. * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. It can also return pages - * after the static area. NULL return indicates end of pages for the - * cpu. Note that @get_page_fn() must return the same number of pages - * for all cpus. - * * @reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved @@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, +size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, void *base_addr) { @@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t size_sum = static_size + reserved_size + (dyn_size >= 0 ? 
dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu; - int i, nr_pages; + int i; /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || @@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); if (reserved_size) { schunk->free_size = reserved_size; @@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); dchunk->vm = &first_vm; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } -/* - * Embedding first chunk setup helper. 
- */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size) { - size_t chunk_size; + size_t size_sum, unit_size, chunk_size; + void *base; unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + chunk_size = unit_size * num_possible_cpus(); - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) { + base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); return -ENOMEM; @@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, /* return the leftover and copy */ for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + void *ptr = base + cpu * unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); + free_bootmem(__pa(ptr + size_sum), unit_size - size_sum); memcpy(ptr, __per_cpu_load, static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + size_sum >> PAGE_SHIFT, base, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr); -} - -/* - * 4k page first chunk setup helper. 
- */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_unit_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_unit_pages) - return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno]; - return NULL; + return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, base); } /** @@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + int unit_pages; size_t pages_size; + struct page **pages; unsigned int cpu; int i, j; ssize_t ret; - pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, - PCPU_MIN_UNIT_SIZE)); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, + PCPU_MIN_UNIT_SIZE)); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * - sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_unit_pages; i++) { + for (i = 0; i < unit_pages; i++) { void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE); @@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, "4k page for cpu%u\n", cpu); goto enomem; } - pcpu4k_pages[j++] = virt_to_page(ptr); + pages[j++] = virt_to_page(ptr); } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; + vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT; vm_area_register_early(&vm, PAGE_SIZE); for_each_possible_cpu(cpu) { unsigned long unit_addr = (unsigned long)vm.addr + - (cpu * pcpu4k_unit_pages << PAGE_SHIFT); + (cpu * unit_pages << PAGE_SHIFT); - for (i = 0; i < pcpu4k_unit_pages; i++) + for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, - &pcpu4k_pages[cpu * pcpu4k_unit_pages], - pcpu4k_unit_pages); + ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, /* we're ready, commit */ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", - pcpu4k_unit_pages, static_size); + unit_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - reserved_size, -1, - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); + ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, + unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: while (--j >= 0) - free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + free_fn(page_address(pages[j]), PAGE_SIZE); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); + free_bootmem(__pa(pages), pages_size); return ret; } @@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size; static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); -} - /** * pcpu_lpage_first_chunk - remap the first percpu chunk 
using large page * @static_size: the size of static percpu area in bytes @@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", pcpul_vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - reserved_size, dyn_size, pcpul_unit_size, - pcpul_vm.addr); + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + pcpul_unit_size, pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) -- cgit v1.2.3-70-g09d2 From 2f39e637ea240efb74cf807d31c93a71a0b89174 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: allow non-linear / sparse cpu -> unit mapping Currently cpu and unit are always identity mapped. To allow more efficient large page support on NUMA and lazy allocation for possible but offline cpus, cpu -> unit mapping needs to be non-linear and/or sparse. This can be easily implemented by adding a cpu -> unit mapping array and using it whenever looking up the matching unit for a cpu. The only unusal conversion is in pcpu_chunk_addr_search(). The passed in address is unit0 based and unit0 might not be in use so it needs to be converted to address of an in-use unit. This is easily done by adding the unit offset for the current processor. [ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 2 +- include/linux/percpu.h | 3 +- mm/percpu.c | 129 +++++++++++++++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index f2f22ee97a7..6970333b48b 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1516,7 +1516,7 @@ void __init setup_per_cpu_areas(void) pcpu_unit_size = pcpu_setup_first_chunk(static_size, PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr); + PCPU_CHUNK_SIZE, vm.addr, NULL); free_bootmem(__pa(ptrs), ptrs_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 63c8b7a23e6..1e0e8878dc2 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,6 +57,7 @@ #endif extern void *pcpu_base_addr; +extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); @@ -66,7 +67,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, - void *base_addr); + void *base_addr, const int *unit_map); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, diff --git a/mm/percpu.c b/mm/percpu.c index 21756814d99..2196fae24f0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,13 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. 
When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of boot-time determined number of units and the + * first chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. ie. in + * vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,11 +23,13 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be eqaul to or larger than the maximum contiguous @@ -99,14 +102,22 @@ struct pcpu_chunk { static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; +static int pcpu_nr_units __read_mostly; static int pcpu_chunk_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* cpu -> unit map */ +const int *pcpu_unit_map __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return cpu * pcpu_unit_pages + page_idx; + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, @@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. 
+ */ + addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, { static struct page **pages; static unsigned long *bitmap; - size_t pages_size = num_possible_cpus() * pcpu_unit_pages * - sizeof(pages[0]); + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); @@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -835,11 +851,9 @@ err: static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } /** @@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) bitmap_copy(chunk->populated, populated, pcpu_unit_pages); clear: for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; err_unmap: @@ -1088,6 +1101,7 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); + /* return address relative to unit0 */ return __addr_to_pcpu_ptr(chunk->vm->addr + off); fail_unlock: @@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE * @base_addr: mapped address + * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area @@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, - void *base_addr) + void *base_addr, const int *unit_map) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t size_sum = static_size + reserved_size + (dyn_size >= 0 ? 
dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned int cpu, tcpu; int i; - /* santiy checks */ + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); @@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + /* determine number of units and verify and initialize pcpu_unit_map */ + if (unit_map) { + int first_unit = INT_MAX, last_unit = INT_MIN; + + for_each_possible_cpu(cpu) { + int unit = unit_map[cpu]; + + BUG_ON(unit < 0); + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + /* the mapping should be one-to-one */ + BUG_ON(unit_map[tcpu] == unit); + } + + if (unit < first_unit) { + pcpu_first_unit_cpu = cpu; + first_unit = unit; + } + if (unit > last_unit) { + pcpu_last_unit_cpu = cpu; + last_unit = unit; + } + } + pcpu_nr_units = last_unit + 1; + pcpu_unit_map = unit_map; + } else { + int *identity_map; + + /* #units == #cpus, identity mapped */ + identity_map = alloc_bootmem(num_possible_cpus() * + sizeof(identity_map[0])); + + for_each_possible_cpu(cpu) + identity_map[cpu] = cpu; + + pcpu_first_unit_cpu = 0; + pcpu_last_unit_cpu = pcpu_nr_units - 1; + pcpu_nr_units = num_possible_cpus(); + pcpu_unit_map = identity_map; + } + + /* determine basic parameters */ pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); @@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); + pcpu_base_addr = schunk->vm->addr; return pcpu_unit_size; } @@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, size_sum >> PAGE_SHIFT, base, static_size); return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base); + unit_size, base, NULL); } /** @@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, unit_pages, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, - unit_pages << PAGE_SHIFT, vm.addr); + unit_pages << PAGE_SHIFT, vm.addr, NULL); goto out_free_ar; enomem: @@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr); + pcpul_unit_size, pcpul_vm.addr, NULL); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) -- cgit v1.2.3-70-g09d2 From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: teach large page allocator about NUMA Large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into separate large page just to return most of the allocated space back wasting large amount of vmalloc space and increasing cache footprint. This patch teachs NUMA details to large page allocator. 
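(To make the grouped mapping concrete before the details that follow: a stand-alone toy sketch, with all numbers invented, of a unit_map that keeps each node's cpus in adjacent units so that one node's units can share a single large page. The offset arithmetic mirrors the per_cpu_offset() update from the previous patch; nothing here is kernel code.

/* toy example: 4 possible cpus, cpus 0/2 on node 0, cpus 1/3 on node 1 */
#include <stdio.h>

#define NR_CPUS   4
#define UNIT_SIZE (1024 * 1024)	/* 1MB per unit, invented for the sketch */

/* grouped by node: node 0 owns units 0-1, node 1 owns units 2-3 */
static const int unit_map[NR_CPUS] = { 0, 2, 1, 3 };

int main(void)
{
	unsigned int cpu;

	/* mirrors per_cpu_offset(cpu) = delta + pcpu_unit_map[cpu] * pcpu_unit_size */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%u -> unit%d, offset 0x%x\n", cpu, unit_map[cpu],
		       (unsigned int)(unit_map[cpu] * UNIT_SIZE));
	return 0;
}

With this layout, cpus 0 and 2 occupy units 0 and 1 and can be backed by one large page, while cpus 1 and 3 occupy units 2 and 3 on the other node's page.)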
Given processor proximity information, pcpu_lpage_build_unit_map() will find fitting cpu -> unit mapping in which cpus in LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk. For example, on 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node. [ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Cc: David Miller --- arch/x86/kernel/setup_percpu.c | 72 +++++++-- include/linux/percpu.h | 24 ++- mm/percpu.c | 358 ++++++++++++++++++++++++++++++++--------- 3 files changed, 359 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4f2e0ac9130..7501bb14bd5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr) set_pmd(pmd, pmd_v); } +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +{ + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; + size_t unit_map_size, unit_size; + int *unit_map; + int nr_units; + ssize_t ret; + + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) + return -EINVAL; + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* allocate and build unit_map */ + unit_map_size = num_possible_cpus() * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; + } + + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; + } + nr_units = ret; + + /* do the parameters look okay? 
*/ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; + size_t tot_size = nr_units * unit_size; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - return -EINVAL; + ret = -EINVAL; + goto out_free; } } - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, unit_size, PMD_SIZE, + unit_map, nr_units, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); +out_free: + if (ret < 0) + free_bootmem(__pa(unit_map), unit_map_size); + return ret; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) @@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void) /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = + delta + pcpu_unit_map[cpu] * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1e0e8878dc2..8ce91af4aa1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( @@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_populate_pte_fn_t populate_pte_fn); #ifdef CONFIG_NEED_MULTIPLE_NODES +extern int __init pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + extern ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); extern void *pcpu_lpage_remapped(void *kaddr); #else +static inline int pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + return -EINVAL; +} + static inline ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) diff --git a/mm/percpu.c b/mm/percpu.c index 2196fae24f0..b3d0bcff8c7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -59,6 +59,7 @@ #include #include #include +#include 
#include #include #include @@ -1594,75 +1595,259 @@ out_free_ar: * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. 
+ */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + struct pcpul_ent { - unsigned int cpu; void *ptr; + void *map_addr; }; static size_t pcpul_size; -static size_t pcpul_unit_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * - * This allocator uses large page as unit. A large page is allocated - * for each cpu and each is remapped into vmalloc area using large - * page mapping. As large page can be quite large, only part of it is - * used for the first chunk. Unused part is returned to the bootmem - * allocator. - * - * So, the large pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more large TLB entry pressure but still is - * much better than only using 4k mappings while still being NUMA - * friendly. + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. 
These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. + * + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. + * These double mapped areas can be detected using + * pcpu_lpage_remapped(). * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { - size_t size_sum; + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; - int i, j; ssize_t ret; + int i, j, unit; - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); - pcpul_unit_size = lpage_size; - pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - if (pcpul_size > pcpul_unit_size) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); pcpul_map = alloc_bootmem(map_size); - for_each_possible_cpu(cpu) { + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; void *ptr; + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " @@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, goto enomem; } - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The lpage_size up-rounding bootmem is needed - * to make sure the partial lpage is still fully RAM - - * it's not well-specified to have a incompatible area - * (unmapped RAM, device memory, etc.) in that hole. 
- */ - free_fn(ptr + pcpul_size, lpage_size - pcpul_size); - - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = ptr; + pcpul_map[i].ptr = ptr; + } - memcpy(ptr, __per_cpu_load, static_size); + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; - vm_area_register_early(&pcpul_vm, pcpul_unit_size); + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } for_each_possible_cpu(cpu) - map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, - pcpul_vm.addr + cpu * pcpul_unit_size); + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); + "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } + unit_size, vm.addr, unit_map); + + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. + */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; return ret; enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_fn(pcpul_map[cpu].ptr, pcpul_size); + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } @@ -1739,10 +1950,10 @@ enomem: */ void *pcpu_lpage_remapped(void *kaddr) { - unsigned long unit_mask = pcpul_unit_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); - unsigned long offset = (unsigned long)kaddr & unit_mask; - int left = 0, right = num_possible_cpus() - 1; + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; int pos; /* pcpul in use at all? 
*/ @@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * pcpul_unit_size + offset; - } + else + return pcpul_map[pos].map_addr + offset; } return NULL; -- cgit v1.2.3-70-g09d2 From 96ccd4a43a4d80c80be636cd025a69959cf47424 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2009 12:47:52 +0200 Subject: genirq: Remove obsolete defines and typedefs The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have been kept around for migration reasons. The last users are gone, remove them. Signed-off-by: Thomas Gleixner --- Documentation/feature-removal-schedule.txt | 9 --------- include/linux/irq.h | 7 ------- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f8cd450be9a..2af78126b05 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -394,15 +394,6 @@ Who: Thomas Gleixner ----------------------------- -What: obsolete generic irq defines and typedefs -When: 2.6.30 -Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) - have been kept around for migration reasons. After more than two years - it's time to remove them finally -Who: Thomas Gleixner - ---------------------------- - What: fakephp and associated sysfs files in /sys/bus/pci/slots/ When: 2011 Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to diff --git a/include/linux/irq.h b/include/linux/irq.h index cb2e77a3f7f..6956df9961a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -219,13 +219,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); -/* - * Migration helpers for obsolete names, they will go away: - */ -#define hw_interrupt_type irq_chip -#define no_irq_type no_irq_chip -typedef struct irq_desc irq_desc_t; - /* * Pick up the arch-dependent methods: */ -- cgit v1.2.3-70-g09d2 From 0e8635a8e1f2d4a9e1bfc6c3b21419a5921e674f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Jun 2009 00:53:25 +0000 Subject: net: remove NET_RX_BAD and NET_RX_CN* defines almost no users in the tree; and the few that use them treat them like NET_RX_DROP. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- drivers/net/rionet.c | 5 ----- drivers/net/wan/farsync.c | 19 ------------------- drivers/staging/otus/wrap_pkt.c | 3 --- include/linux/netdevice.h | 4 ---- net/decnet/dn_route.c | 2 +- net/lapb/lapb_iface.c | 2 +- 6 files changed, 2 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 8702e7acdee..74cdb6f8f84 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -114,11 +114,6 @@ static int rionet_rx_clean(struct net_device *ndev) if (error == NET_RX_DROP) { ndev->stats.rx_dropped++; - } else if (error == NET_RX_BAD) { - if (netif_msg_rx_err(rnet)) - printk(KERN_WARNING "%s: bad rx packet\n", - DRV_NAME); - ndev->stats.rx_errors++; } else { ndev->stats.rx_packets++; ndev->stats.rx_bytes += RIO_MAX_MSG_SIZE; diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 25c9ef6a181..90c0a317d9d 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -792,25 +792,6 @@ fst_process_rx_status(int rx_status, char *name) */ break; } - - case NET_RX_CN_LOW: - { - dbg(DBG_ASS, "%s: Receive Low Congestion\n", name); - break; - } - - case NET_RX_CN_MOD: - { - dbg(DBG_ASS, "%s: Receive Moderate Congestion\n", name); - break; - } - - case NET_RX_CN_HIGH: - { - dbg(DBG_ASS, "%s: Receive High Congestion\n", name); - break; - } - case NET_RX_DROP: { dbg(DBG_ASS, "%s: Received packet dropped\n", name); diff --git a/drivers/staging/otus/wrap_pkt.c b/drivers/staging/otus/wrap_pkt.c index 5db0004c873..89a6b92f597 100644 --- a/drivers/staging/otus/wrap_pkt.c +++ b/drivers/staging/otus/wrap_pkt.c @@ -156,10 +156,7 @@ void zfLnxRecvEth(zdev_t* dev, zbuf_t* buf, u16_t port) switch(netif_rx(buf)) #endif { - case NET_RX_BAD: case NET_RX_DROP: - case NET_RX_CN_MOD: - case NET_RX_CN_HIGH: break; default: macp->drv_stats.net_stats.rx_packets++; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4a4d986779..9f25ab2899d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -72,10 +72,6 @@ struct wireless_dev; /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ -#define NET_RX_CN_LOW 2 /* storm alert, just in case */ -#define NET_RX_CN_MOD 3 /* Storm on its way! */ -#define NET_RX_CN_HIGH 4 /* The storm is here */ -#define NET_RX_BAD 5 /* packet dropped due to kernel error */ /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. 
It * indicates that the device will soon be dropping packets, or already drops diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1d6ca8a98dc..9383d3e5a1a 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -774,7 +774,7 @@ static int dn_rt_bug(struct sk_buff *skb) kfree_skb(skb); - return NET_RX_BAD; + return NET_RX_DROP; } static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 2ba1bc4f3c3..bda96d18fd9 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -407,7 +407,7 @@ int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) return lapb->callbacks.data_indication(lapb->dev, skb); kfree_skb(skb); - return NET_RX_CN_HIGH; /* For now; must be != NET_RX_DROP */ + return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */ } int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) -- cgit v1.2.3-70-g09d2 From 6650613d3387dcc30685e2781818ea7d0f840027 Mon Sep 17 00:00:00 2001 From: "oscar.medina@motorola.com" Date: Tue, 30 Jun 2009 03:25:39 +0000 Subject: tipc: Add socket options to get number of queued messages This patch allows a TIPC application to determine the number of messages currently waiting in a socket's receive queue (TIPC_SOCK_RECVQ_DEPTH) or in all TIPC socket receive queues (TIPC_NODE_RECVQ_DEPTH). Signed-off-by: Oscar Medina Signed-off-by: Allan Stephens Signed-off-by: David S. Miller --- include/linux/tipc.h | 2 ++ net/tipc/socket.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tipc.h b/include/linux/tipc.h index bea469455a0..3d92396639d 100644 --- a/include/linux/tipc.h +++ b/include/linux/tipc.h @@ -209,5 +209,7 @@ struct sockaddr_tipc { #define TIPC_SRC_DROPPABLE 128 /* Default: 0 (resend congested msg) */ #define TIPC_DEST_DROPPABLE 129 /* Default: based on socket type */ #define TIPC_CONN_TIMEOUT 130 /* Default: 8000 (ms) */ +#define TIPC_NODE_RECVQ_DEPTH 131 /* Default: none (read only) */ +#define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 1848693ebb8..e8254e809b7 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1748,6 +1748,12 @@ static int getsockopt(struct socket *sock, value = jiffies_to_msecs(sk->sk_rcvtimeo); /* no need to set "res", since already 0 at this point */ break; + case TIPC_NODE_RECVQ_DEPTH: + value = (u32)atomic_read(&tipc_queue_size); + break; + case TIPC_SOCK_RECVQ_DEPTH: + value = skb_queue_len(&sk->sk_receive_queue); + break; default: res = -EINVAL; } -- cgit v1.2.3-70-g09d2 From ed5215a21460f63d6bdc118cb55a9e6d1b433f35 Mon Sep 17 00:00:00 2001 From: Thomas Liu Date: Thu, 9 Jul 2009 10:00:29 -0400 Subject: Move variable function in lsm_audit.h into SMACK private space Moved variable function in include/linux/lsm_audit.h into the smack_audit_data struct since it is never used outside of it. Also removed setting of function in the COMMON_AUDIT_DATA_INIT macro because that variable is now private to SMACK. 
Signed-off-by: Thomas Liu Acked-by: Eric Paris I-dont-see-any-problems-with-it: Casey Schaufler Signed-off-by: James Morris --- include/linux/lsm_audit.h | 4 ++-- security/smack/smack.h | 2 +- security/smack/smack_access.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index e461b2c3d71..68f7bce572b 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -66,11 +66,11 @@ struct common_audit_data { } key_struct; #endif } u; - const char *function; /* this union contains LSM specific data */ union { /* SMACK data */ struct smack_audit_data { + const char *function; char *subject; char *object; char *request; @@ -104,7 +104,7 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, /* Initialize an LSM audit data structure. */ #define COMMON_AUDIT_DATA_INIT(_d, _t) \ { memset((_d), 0, sizeof(struct common_audit_data)); \ - (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; } + (_d)->type = LSM_AUDIT_DATA_##_t; } void common_lsm_audit(struct common_audit_data *a); diff --git a/security/smack/smack.h b/security/smack/smack.h index 243bec175be..ff180ede3e4 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -275,7 +275,7 @@ static inline void smk_ad_init(struct smk_audit_info *a, const char *func, { memset(a, 0, sizeof(*a)); a->a.type = type; - a->a.function = func; + a->a.lsm_priv.smack_audit_data.function = func; } static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a, diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 513dc1aa16d..dd84877dff3 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -241,7 +241,8 @@ static void smack_log_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data; - audit_log_format(ab, "lsm=SMACK fn=%s action=%s", ad->function, + audit_log_format(ab, "lsm=SMACK fn=%s action=%s", + ad->lsm_priv.smack_audit_data.function, sad->result ? "denied" : "granted"); audit_log_format(ab, " subject="); audit_log_untrustedstring(ab, sad->subject); @@ -274,8 +275,8 @@ void smack_log(char *subject_label, char *object_label, int request, if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0) return; - if (a->function == NULL) - a->function = "unknown"; + if (a->lsm_priv.smack_audit_data.function == NULL) + a->lsm_priv.smack_audit_data.function = "unknown"; /* end preparing the audit data */ sad = &a->lsm_priv.smack_audit_data; -- cgit v1.2.3-70-g09d2 From d4131ded4d4c1a5c1363ddd93ca104ed97dd0458 Mon Sep 17 00:00:00 2001 From: Thomas Liu Date: Thu, 9 Jul 2009 10:00:30 -0400 Subject: security: Make lsm_priv union in lsm_audit.h anonymous Made the lsm_priv union in include/linux/lsm_audit.h anonymous. 
Signed-off-by: Thomas Liu Acked-by: Eric Paris Signed-off-by: James Morris --- include/linux/lsm_audit.h | 2 +- security/smack/smack.h | 2 +- security/smack/smack_access.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 68f7bce572b..40d1b84f2a3 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -86,7 +86,7 @@ struct common_audit_data { struct av_decision *avd; int result; } selinux_audit_data; - } lsm_priv; + }; /* these callback will be implemented by a specific LSM */ void (*lsm_pre_audit)(struct audit_buffer *, void *); void (*lsm_post_audit)(struct audit_buffer *, void *); diff --git a/security/smack/smack.h b/security/smack/smack.h index ff180ede3e4..c6e9acae72e 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -275,7 +275,7 @@ static inline void smk_ad_init(struct smk_audit_info *a, const char *func, { memset(a, 0, sizeof(*a)); a->a.type = type; - a->a.lsm_priv.smack_audit_data.function = func; + a->a.smack_audit_data.function = func; } static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a, diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index dd84877dff3..0f9ac814690 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -240,9 +240,9 @@ static inline void smack_str_from_perm(char *string, int access) static void smack_log_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; - struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data; + struct smack_audit_data *sad = &ad->smack_audit_data; audit_log_format(ab, "lsm=SMACK fn=%s action=%s", - ad->lsm_priv.smack_audit_data.function, + ad->smack_audit_data.function, sad->result ? "denied" : "granted"); audit_log_format(ab, " subject="); audit_log_untrustedstring(ab, sad->subject); @@ -275,11 +275,11 @@ void smack_log(char *subject_label, char *object_label, int request, if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0) return; - if (a->lsm_priv.smack_audit_data.function == NULL) - a->lsm_priv.smack_audit_data.function = "unknown"; + if (a->smack_audit_data.function == NULL) + a->smack_audit_data.function = "unknown"; /* end preparing the audit data */ - sad = &a->lsm_priv.smack_audit_data; + sad = &a->smack_audit_data; smack_str_from_perm(request_buffer, request); sad->subject = subject_label; sad->object = object_label; -- cgit v1.2.3-70-g09d2 From 65c3f0a2d0f72d210c879e4974c2d222b7951321 Mon Sep 17 00:00:00 2001 From: Thomas Liu Date: Thu, 9 Jul 2009 10:00:31 -0400 Subject: security: Wrap SMACK and SELINUX audit data structs in ifdefs Wrapped the smack_audit_data and selinux_audit_data structs in include/linux/lsm_audit.h in ifdefs so that the union will always be the correct size. 
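Taken together, the three lsm_audit.h changes above (function moved into smack_audit_data, the union made anonymous, the members wrapped in ifdefs) leave the structure shaped roughly as below. This is a condensed stand-alone sketch with most fields elided, not the real header; the config defines and the u32 typedef exist only so the sketch compiles on its own, and anonymous unions need C11 or GNU C.

/* pretend both LSMs are built in, just for this sketch */
#define CONFIG_SECURITY_SMACK 1
#define CONFIG_SECURITY_SELINUX 1
typedef unsigned int u32;

struct common_audit_data {
	char type;
	/* anonymous union: each member compiles out with its LSM, so
	 * the union is always exactly as large as the enabled LSMs need */
	union {
#ifdef CONFIG_SECURITY_SMACK
		struct smack_audit_data {
			const char *function;	/* moved in from common data */
			char *subject;
			char *object;
			int result;
		} smack_audit_data;
#endif
#ifdef CONFIG_SECURITY_SELINUX
		struct {
			u32 ssid;
			u32 tsid;
			int result;
		} selinux_audit_data;
#endif
	};
};

/* call sites lose the old lsm_priv. step; e.g. the assignment inside
 * smk_ad_init() shown in the diffs above reduces to: */
static inline void set_fn(struct common_audit_data *a, const char *func)
{
	a->smack_audit_data.function = func;
}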
Signed-off-by: Thomas Liu Acked-by: Eric Paris Signed-off-by: James Morris --- include/linux/lsm_audit.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 40d1b84f2a3..a5514a3a4f1 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -68,6 +68,7 @@ struct common_audit_data { } u; /* this union contains LSM specific data */ union { +#ifdef CONFIG_SECURITY_SMACK /* SMACK data */ struct smack_audit_data { const char *function; @@ -76,6 +77,8 @@ struct common_audit_data { char *request; int result; } smack_audit_data; +#endif +#ifdef CONFIG_SECURITY_SELINUX /* SELinux data */ struct { u32 ssid; @@ -86,6 +89,7 @@ struct common_audit_data { struct av_decision *avd; int result; } selinux_audit_data; +#endif }; /* these callback will be implemented by a specific LSM */ void (*lsm_pre_audit)(struct audit_buffer *, void *); -- cgit v1.2.3-70-g09d2 From a33e9e7f35ef6dcab528e0327f29188475f60691 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 16 Jun 2009 17:17:27 +0300 Subject: usbnet: Add stop function pointer to 'struct rndis_data'. Allow minidriver to know that netdev has stopped. This is to let wireless turn off radio when usbnet dev is stopped. Signed-off-by: Jussi Kivilinna Acked-by: David Brownell Acked-by: David S. Miller Signed-off-by: John W. Linville --- drivers/net/usb/usbnet.c | 14 ++++++++++++++ include/linux/usb/usbnet.h | 3 +++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index edfd9e10ceb..25e435c4904 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -575,7 +575,9 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); int usbnet_stop (struct net_device *net) { struct usbnet *dev = netdev_priv(net); + struct driver_info *info = dev->driver_info; int temp; + int retval; DECLARE_WAIT_QUEUE_HEAD_ONSTACK (unlink_wakeup); DECLARE_WAITQUEUE (wait, current); @@ -587,6 +589,18 @@ int usbnet_stop (struct net_device *net) net->stats.rx_errors, net->stats.tx_errors ); + /* allow minidriver to stop correctly (wireless devices to turn off + * radio etc) */ + if (info->stop) { + retval = info->stop(dev); + if (retval < 0 && netif_msg_ifdown(dev)) + devinfo(dev, + "stop fail (%d) usbnet usb-%s-%s, %s", + retval, + dev->udev->bus->bus_name, dev->udev->devpath, + info->description); + } + // ensure there are no more active urbs add_wait_queue (&unlink_wakeup, &wait); dev->wait = &unlink_wakeup; diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 310e18a880f..7c17b2efba8 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -97,6 +97,9 @@ struct driver_info { /* reset device ... can sleep */ int (*reset)(struct usbnet *); + /* stop device ... can sleep */ + int (*stop)(struct usbnet *); + /* see if peer is connected ... can sleep */ int (*check_connect)(struct usbnet *); -- cgit v1.2.3-70-g09d2 From aff89a9b9084931e51b89d8f3ee3c547bea6c422 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Jul 2009 21:26:51 +0200 Subject: cfg80211: introduce nl80211 testmode command This introduces a new NL80211_CMD_TESTMODE for testing and calibration use with nl80211. There's no multiplexing like like iwpriv had, and the command is not available by default, it needs to be explicitly enabled in Kconfig and shouldn't be enabled in most kernels. 
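(For a test or calibration kernel, turning the command on is the single Kconfig symbol added by the net/wireless/Kconfig hunk of this patch -- the same CONFIG_NL80211_TESTMODE the hwsim code below tests for:

	CONFIG_NL80211_TESTMODE=y

It defaults to off, matching the intent that production kernels leave it disabled.)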
The command requires a wiphy index or interface index to identify the
device to operate on, and the new TESTDATA attribute. There is also an
API for sending replies to the command, and for sending testmode
multicast messages (on a testmode multicast group).

I've also updated mac80211 to be able to pass through the command to
the driver, since mac80211 itself doesn't implement the testmode
command.

Additionally, to give people an idea of how to use the command, I've
added a little code to hwsim that makes use of the new command to set
the powersave mode. Setting it is currently done via debugfs and should
remain there; the testmode command only serves as an example of how
best to use this: with nested netlink attributes in the TESTDATA
attribute.

A hwsim testmode tool can be found at
http://git.sipsolutions.net/hwsim.git/. This tool is BSD licensed so
people can easily use it as a basis for their own internal fabrication
and validation tools.

Signed-off-by: Johannes Berg
Signed-off-by: John W. Linville
---
 drivers/net/wireless/mac80211_hwsim.c |  68 +++++++++++++++++
 include/linux/nl80211.h               |  11 +++
 include/net/cfg80211.h                |  83 +++++++++++++++++++++
 include/net/mac80211.h                |   5 ++
 net/mac80211/cfg.c                    |  13 ++++
 net/wireless/Kconfig                  |  15 ++++
 net/wireless/core.h                   |   4 +
 net/wireless/nl80211.c                | 136 ++++++++++++++++++++++++++++++++++
 8 files changed, 335 insertions(+)
(limited to 'include/linux')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 93c1c4a73e6..6ac8565072e 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -700,6 +700,73 @@ static int mac80211_hwsim_conf_tx(
 	return 0;
 }
 
+#ifdef CONFIG_NL80211_TESTMODE
+/*
+ * This section contains example code for using netlink
+ * attributes with the testmode command in nl80211.
+ */
+
+/* These enums need to be kept in sync with userspace */
+enum hwsim_testmode_attr {
+	__HWSIM_TM_ATTR_INVALID	= 0,
+	HWSIM_TM_ATTR_CMD	= 1,
+	HWSIM_TM_ATTR_PS	= 2,
+
+	/* keep last */
+	__HWSIM_TM_ATTR_AFTER_LAST,
+	HWSIM_TM_ATTR_MAX	= __HWSIM_TM_ATTR_AFTER_LAST - 1
+};
+
+enum hwsim_testmode_cmd {
+	HWSIM_TM_CMD_SET_PS	= 0,
+	HWSIM_TM_CMD_GET_PS	= 1,
+};
+
+static const struct nla_policy hwsim_testmode_policy[HWSIM_TM_ATTR_MAX + 1] = {
+	[HWSIM_TM_ATTR_CMD] = { .type = NLA_U32 },
+	[HWSIM_TM_ATTR_PS] = { .type = NLA_U32 },
+};
+
+static int hwsim_fops_ps_write(void *dat, u64 val);
+
+int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw, void *data, int len)
+{
+	struct mac80211_hwsim_data *hwsim = hw->priv;
+	struct nlattr *tb[HWSIM_TM_ATTR_MAX + 1];
+	struct sk_buff *skb;
+	int err, ps;
+
+	err = nla_parse(tb, HWSIM_TM_ATTR_MAX, data, len,
+			hwsim_testmode_policy);
+	if (err)
+		return err;
+
+	if (!tb[HWSIM_TM_ATTR_CMD])
+		return -EINVAL;
+
+	switch (nla_get_u32(tb[HWSIM_TM_ATTR_CMD])) {
+	case HWSIM_TM_CMD_SET_PS:
+		if (!tb[HWSIM_TM_ATTR_PS])
+			return -EINVAL;
+		ps = nla_get_u32(tb[HWSIM_TM_ATTR_PS]);
+		return hwsim_fops_ps_write(hwsim, ps);
+	case HWSIM_TM_CMD_GET_PS:
+		skb = cfg80211_testmode_alloc_reply_skb(hw->wiphy,
+						nla_total_size(sizeof(u32)));
+		if (!skb)
+			return -ENOMEM;
+		NLA_PUT_U32(skb, HWSIM_TM_ATTR_PS, hwsim->ps);
+		return cfg80211_testmode_reply(skb);
+	default:
+		return -EOPNOTSUPP;
+	}
+
+ nla_put_failure:
+	kfree_skb(skb);
+	return -ENOBUFS;
+}
+#endif
+
 static const struct ieee80211_ops mac80211_hwsim_ops =
 {
 	.tx = mac80211_hwsim_tx,
@@ -713,6 +780,7 @@ static const struct ieee80211_ops mac80211_hwsim_ops =
 	.sta_notify = mac80211_hwsim_sta_notify,
 	.set_tim = mac80211_hwsim_set_tim,
 	.conf_tx = mac80211_hwsim_conf_tx,
+	CFG80211_TESTMODE_CMD(mac80211_hwsim_testmode_cmd)
 };

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index dbea93b694e..651b1883908 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -242,6 +242,10 @@
 * @NL80211_CMD_LEAVE_IBSS: Leave the IBSS -- no special arguments, the IBSS is
 *	determined by the network interface.
 *
+ * @NL80211_CMD_TESTMODE: testmode command, takes a wiphy (or ifindex) attribute
+ *	to identify the device, and the TESTDATA blob attribute to pass through
+ *	to the driver.
+ *
 * @NL80211_CMD_MAX: highest used command number
 * @__NL80211_CMD_AFTER_LAST: internal use
 */
@@ -310,6 +314,8 @@ enum nl80211_commands {
 	NL80211_CMD_JOIN_IBSS,
 	NL80211_CMD_LEAVE_IBSS,
 
+	NL80211_CMD_TESTMODE,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -511,6 +517,9 @@ enum nl80211_commands {
 *	authorized by user space. Otherwise, port is marked authorized by
 *	default in station mode.
 *
+ * @NL80211_ATTR_TESTDATA: Testmode data blob, passed through to the driver.
+ *	We recommend using nested, driver-specific attributes within this.
+ *
 * @NL80211_ATTR_MAX: highest attribute number currently defined
 * @__NL80211_ATTR_AFTER_LAST: internal use
 */
@@ -619,6 +628,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_CONTROL_PORT,
 
+	NL80211_ATTR_TESTDATA,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 10eb53e2bc9..885d4e5bc4b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -857,6 +857,8 @@ enum tx_power_setting {
 *
 * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting
 *	functions to adjust rfkill hw state
+ *
+ * @testmode_cmd: run a test mode command
 */
 struct cfg80211_ops {
 	int (*suspend)(struct wiphy *wiphy);
@@ -955,6 +957,10 @@ struct cfg80211_ops {
 	int (*get_tx_power)(struct wiphy *wiphy, int *dbm);
 
 	void (*rfkill_poll)(struct wiphy *wiphy);
+
+#ifdef CONFIG_NL80211_TESTMODE
+	int (*testmode_cmd)(struct wiphy *wiphy, void *data, int len);
+#endif
 };
 
 /*
@@ -1705,4 +1711,81 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy);
 */
 void wiphy_rfkill_stop_polling(struct wiphy *wiphy);
 
+#ifdef CONFIG_NL80211_TESTMODE
+/**
+ * cfg80211_testmode_alloc_reply_skb - allocate testmode reply
+ * @wiphy: the wiphy
+ * @approxlen: an upper bound of the length of the data that will
+ *	be put into the skb
+ *
+ * This function allocates and pre-fills an skb for a reply to
+ * the testmode command. Since it is intended for a reply, calling
+ * it outside of the @testmode_cmd operation is invalid.
+ *
+ * The returned skb (or %NULL if any errors happen) is pre-filled
+ * with the wiphy index and set up in a way that any data that is
+ * put into the skb (with skb_put(), nla_put() or similar) will end
+ * up being within the %NL80211_ATTR_TESTDATA attribute, so all that
+ * needs to be done with the skb is adding data for the corresponding
+ * userspace tool which can then read that data out of the testdata
+ * attribute. You must not modify the skb in any other way.
+ *
+ * When done, call cfg80211_testmode_reply() with the skb and return
+ * its error code as the result of the @testmode_cmd operation.
+ */
+struct sk_buff *cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy,
+						  int approxlen);
+
+/**
+ * cfg80211_testmode_reply - send the reply skb
+ * @skb: The skb, must have been allocated with
+ *	cfg80211_testmode_alloc_reply_skb()
+ *
+ * Returns an error code or 0 on success, since calling this
+ * function will usually be the last thing before returning
+ * from the @testmode_cmd you should return the error code.
+ * Note that this function consumes the skb regardless of the
+ * return value.
+ */
+int cfg80211_testmode_reply(struct sk_buff *skb);
+
+/**
+ * cfg80211_testmode_alloc_event_skb - allocate testmode event
+ * @wiphy: the wiphy
+ * @approxlen: an upper bound of the length of the data that will
+ *	be put into the skb
+ * @gfp: allocation flags
+ *
+ * This function allocates and pre-fills an skb for an event on the
+ * testmode multicast group.
+ *
+ * The returned skb (or %NULL if any errors happen) is set up in the
+ * same way as with cfg80211_testmode_alloc_reply_skb() but prepared
+ * for an event. As there, you should simply add data to it that will
+ * then end up in the %NL80211_ATTR_TESTDATA attribute. Again, you must
+ * not modify the skb in any other way.
+ *
+ * When done filling the skb, call cfg80211_testmode_event() with the
+ * skb to send the event.
+ */
+struct sk_buff *cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy,
+						  int approxlen, gfp_t gfp);
+
+/**
+ * cfg80211_testmode_event - send the event
+ * @skb: The skb, must have been allocated with
+ *	cfg80211_testmode_alloc_event_skb()
+ * @gfp: allocation flags
+ *
+ * This function sends the given @skb, which must have been allocated
+ * by cfg80211_testmode_alloc_event_skb(), as an event. It always
+ * consumes it.
+ */
+void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp);
+
+#define CFG80211_TESTMODE_CMD(cmd)	.testmode_cmd = (cmd),
+#else
+#define CFG80211_TESTMODE_CMD(cmd)
+#endif
+
 #endif /* __NET_CFG80211_H */

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fe80771d95f..ce7cb1b5d45 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1416,6 +1416,8 @@ enum ieee80211_ampdu_mlme_action {
 * @rfkill_poll: Poll rfkill hardware state. If you need this, you also
 *	need to set wiphy->rfkill_poll to %true before registration,
 *	and need to call wiphy_rfkill_set_hw_state() in the callback.
+ *
+ * @testmode_cmd: Implement a cfg80211 test mode command.
 */
 struct ieee80211_ops {
 	int (*tx)(struct ieee80211_hw *hw, struct sk_buff *skb);
@@ -1466,6 +1468,9 @@ struct ieee80211_ops {
 			    struct ieee80211_sta *sta, u16 tid, u16 *ssn);
 
 	void (*rfkill_poll)(struct ieee80211_hw *hw);
+#ifdef CONFIG_NL80211_TESTMODE
+	int (*testmode_cmd)(struct ieee80211_hw *hw, void *data, int len);
+#endif
 };
 
 /**

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index eb93eb6a9cc..c34c1a41019 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1376,6 +1376,18 @@ static void ieee80211_rfkill_poll(struct wiphy *wiphy)
 	drv_rfkill_poll(local);
 }
 
+#ifdef CONFIG_NL80211_TESTMODE
+int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	if (!local->ops->testmode_cmd)
+		return -EOPNOTSUPP;
+
+	return local->ops->testmode_cmd(&local->hw, data, len);
+}
+#endif
+
 struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -1418,4 +1430,5 @@ struct cfg80211_ops mac80211